1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
33#include "llvm/IR/IntrinsicsAMDGPU.h"
34#include "llvm/MC/MCContext.h"
37
38using namespace llvm;
39
40#define DEBUG_TYPE "si-instr-info"
41
42#define GET_INSTRINFO_CTOR_DTOR
43#include "AMDGPUGenInstrInfo.inc"
44
45namespace llvm::AMDGPU {
46#define GET_D16ImageDimIntrinsics_IMPL
47#define GET_ImageDimIntrinsicTable_IMPL
48#define GET_RsrcIntrinsics_IMPL
49#include "AMDGPUGenSearchableTables.inc"
50} // namespace llvm::AMDGPU
51
52// Must be at least 4 to be able to branch over minimum unconditional branch
53// code. This is only for making it possible to write reasonably small tests for
54// long branches.
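// Illustrative example (option value assumed): running llc with
// -amdgpu-s-branch-bits=5 forces any s_branch whose offset does not fit in
// 5 bits through the long-branch expansion, keeping such lit tests small.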
55static cl::opt<unsigned>
56BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
57 cl::desc("Restrict range of branch instructions (DEBUG)"));
58
59static cl::opt<bool> Fix16BitCopies(
60 "amdgpu-fix-16-bit-physreg-copies",
61 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
62 cl::init(true),
63 cl::ReallyHidden);
64
65SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
66 : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
67 RI(ST), ST(ST) {
68 SchedModel.init(&ST);
69}
70
71//===----------------------------------------------------------------------===//
72// TargetInstrInfo callbacks
73//===----------------------------------------------------------------------===//
74
75static unsigned getNumOperandsNoGlue(SDNode *Node) {
76 unsigned N = Node->getNumOperands();
77 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
78 --N;
79 return N;
80}
81
82/// Returns true if both nodes have the same value for the given
83/// operand \p Op, or if both nodes do not have this operand.
84static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
85 AMDGPU::OpName OpName) {
86 unsigned Opc0 = N0->getMachineOpcode();
87 unsigned Opc1 = N1->getMachineOpcode();
88
89 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
90 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
91
92 if (Op0Idx == -1 && Op1Idx == -1)
93 return true;
94
95
96 if ((Op0Idx == -1 && Op1Idx != -1) ||
97 (Op1Idx == -1 && Op0Idx != -1))
98 return false;
99
100 // getNamedOperandIdx returns the index for the MachineInstr's operands,
101 // which includes the result as the first operand. We are indexing into the
102 // MachineSDNode's operands, so we need to skip the result operand to get
103 // the real index.
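// Illustrative note (generic, no particular opcode assumed): if
// getNamedOperandIdx reports an operand at MachineInstr index N for an
// instruction with a single result, the matching MachineSDNode operand is
// N - 1 after the decrement below.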
104 --Op0Idx;
105 --Op1Idx;
106
107 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
108}
109
110static bool canRemat(const MachineInstr &MI) {
111
112 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
113 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
114 SIInstrInfo::isSALU(MI))
115 return true;
116
117 if (SIInstrInfo::isSMRD(MI)) {
118 return !MI.memoperands_empty() &&
119 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
120 return MMO->isLoad() && MMO->isInvariant();
121 });
122 }
123
124 return false;
125}
126
127bool SIInstrInfo::isReallyTriviallyReMaterializable(
128 const MachineInstr &MI) const {
129
130 if (canRemat(MI)) {
131 // Normally a VALU use of exec would block rematerialization, but an
132 // implicit exec read is fine here since every VALU instruction has one.
133 // We want all of the generic logic for this except for that check.
134
135 // Another potential implicit use is mode register. The core logic of
136 // the RA will not attempt rematerialization if mode is set anywhere
137 // in the function, otherwise it is safe since mode is not changed.
138
139 // This differs from the generic method, which does not allow
140 // rematerialization if there are virtual register uses. We allow those,
141 // so this method covers SOP instructions as well.
142 if (!MI.hasImplicitDef() &&
143 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
144 !MI.mayRaiseFPException())
145 return true;
146 }
147
148 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
149}
150
151// Returns true if the scalar result of a VALU instruction depends on exec.
152bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
153 // Ignore comparisons which are only used masked with exec.
154 // This allows some hoisting/sinking of VALU comparisons.
155 if (MI.isCompare()) {
156 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
157 if (!Dst)
158 return true;
159
160 Register DstReg = Dst->getReg();
161 if (!DstReg.isVirtual())
162 return true;
163
164 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
165 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
166 switch (Use.getOpcode()) {
167 case AMDGPU::S_AND_SAVEEXEC_B32:
168 case AMDGPU::S_AND_SAVEEXEC_B64:
169 break;
170 case AMDGPU::S_AND_B32:
171 case AMDGPU::S_AND_B64:
172 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
173 return true;
174 break;
175 default:
176 return true;
177 }
178 }
179 return false;
180 }
181
182 switch (MI.getOpcode()) {
183 default:
184 break;
185 case AMDGPU::V_READFIRSTLANE_B32:
186 return true;
187 }
188
189 return false;
190}
191
192bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
193 // Any implicit use of exec by VALU is not a real register read.
194 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
195 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
196}
197
198bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
199 MachineBasicBlock *SuccToSinkTo,
200 MachineCycleInfo *CI) const {
201 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
202 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
203 return true;
204
205 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
206 // Check if sinking of MI would create temporal divergent use.
207 for (auto Op : MI.uses()) {
208 if (Op.isReg() && Op.getReg().isVirtual() &&
209 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
210 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
211
212 // SgprDef defined inside cycle
213 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
214 if (FromCycle == nullptr)
215 continue;
216
217 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
218 // Check if there is a FromCycle that contains SgprDef's basic block but
219 // does not contain SuccToSinkTo and also has divergent exit condition.
220 while (FromCycle && !FromCycle->contains(ToCycle)) {
221 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
222 FromCycle->getExitingBlocks(ExitingBlocks);
223
224 // FromCycle has divergent exit condition.
225 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
226 if (hasDivergentBranch(ExitingBlock))
227 return false;
228 }
229
230 FromCycle = FromCycle->getParentCycle();
231 }
232 }
233 }
234
235 return true;
236}
237
238bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
239 int64_t &Offset0,
240 int64_t &Offset1) const {
241 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
242 return false;
243
244 unsigned Opc0 = Load0->getMachineOpcode();
245 unsigned Opc1 = Load1->getMachineOpcode();
246
247 // Make sure both are actually loads.
248 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
249 return false;
250
251 // A mayLoad instruction without a def is not a load. Likely a prefetch.
252 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
253 return false;
254
255 if (isDS(Opc0) && isDS(Opc1)) {
256
257 // FIXME: Handle this case:
258 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
259 return false;
260
261 // Check base reg.
262 if (Load0->getOperand(0) != Load1->getOperand(0))
263 return false;
264
265 // Skip read2 / write2 variants for simplicity.
266 // TODO: We should report true if the used offsets are adjacent (excluding
267 // the st64 versions).
268 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
269 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
270 if (Offset0Idx == -1 || Offset1Idx == -1)
271 return false;
272
273 // XXX - be careful of dataless loads
274 // getNamedOperandIdx returns the index for MachineInstrs. Since they
275 // include the output in the operand list, but SDNodes don't, we need to
276 // subtract the index by one.
277 Offset0Idx -= get(Opc0).NumDefs;
278 Offset1Idx -= get(Opc1).NumDefs;
279 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
280 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
281 return true;
282 }
283
284 if (isSMRD(Opc0) && isSMRD(Opc1)) {
285 // Skip time and cache invalidation instructions.
286 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
287 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
288 return false;
289
290 unsigned NumOps = getNumOperandsNoGlue(Load0);
291 if (NumOps != getNumOperandsNoGlue(Load1))
292 return false;
293
294 // Check base reg.
295 if (Load0->getOperand(0) != Load1->getOperand(0))
296 return false;
297
298 // Match register offsets, if both register and immediate offsets present.
299 assert(NumOps == 4 || NumOps == 5);
300 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
301 return false;
302
303 const ConstantSDNode *Load0Offset =
305 const ConstantSDNode *Load1Offset =
307
308 if (!Load0Offset || !Load1Offset)
309 return false;
310
311 Offset0 = Load0Offset->getZExtValue();
312 Offset1 = Load1Offset->getZExtValue();
313 return true;
314 }
315
316 // MUBUF and MTBUF can access the same addresses.
317 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
318
319 // MUBUF and MTBUF have vaddr at different indices.
320 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
321 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
322 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
323 return false;
324
325 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
326 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
327
328 if (OffIdx0 == -1 || OffIdx1 == -1)
329 return false;
330
331 // getNamedOperandIdx returns the index for MachineInstrs. Since they
332 // include the output in the operand list, but SDNodes don't, we need to
333 // subtract the index by one.
334 OffIdx0 -= get(Opc0).NumDefs;
335 OffIdx1 -= get(Opc1).NumDefs;
336
337 SDValue Off0 = Load0->getOperand(OffIdx0);
338 SDValue Off1 = Load1->getOperand(OffIdx1);
339
340 // The offset might be a FrameIndexSDNode.
341 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
342 return false;
343
344 Offset0 = Off0->getAsZExtVal();
345 Offset1 = Off1->getAsZExtVal();
346 return true;
347 }
348
349 return false;
350}
351
352static bool isStride64(unsigned Opc) {
353 switch (Opc) {
354 case AMDGPU::DS_READ2ST64_B32:
355 case AMDGPU::DS_READ2ST64_B64:
356 case AMDGPU::DS_WRITE2ST64_B32:
357 case AMDGPU::DS_WRITE2ST64_B64:
358 return true;
359 default:
360 return false;
361 }
362}
363
364bool SIInstrInfo::getMemOperandsWithOffsetWidth(
365 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
366 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
367 const TargetRegisterInfo *TRI) const {
368 if (!LdSt.mayLoadOrStore())
369 return false;
370
371 unsigned Opc = LdSt.getOpcode();
372 OffsetIsScalable = false;
373 const MachineOperand *BaseOp, *OffsetOp;
374 int DataOpIdx;
375
376 if (isDS(LdSt)) {
377 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
378 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
379 if (OffsetOp) {
380 // Normal, single offset LDS instruction.
381 if (!BaseOp) {
382 // DS_CONSUME/DS_APPEND use M0 for the base address.
383 // TODO: find the implicit use operand for M0 and use that as BaseOp?
384 return false;
385 }
386 BaseOps.push_back(BaseOp);
387 Offset = OffsetOp->getImm();
388 // Get appropriate operand, and compute width accordingly.
389 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
390 if (DataOpIdx == -1)
391 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
392 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
393 Width = LocationSize::precise(64);
394 else
395 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
396 } else {
397 // The 2 offset instructions use offset0 and offset1 instead. We can treat
398 // these as a load with a single offset if the 2 offsets are consecutive.
399 // We will use this for some partially aligned loads.
400 const MachineOperand *Offset0Op =
401 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
402 const MachineOperand *Offset1Op =
403 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
404
405 unsigned Offset0 = Offset0Op->getImm() & 0xff;
406 unsigned Offset1 = Offset1Op->getImm() & 0xff;
407 if (Offset0 + 1 != Offset1)
408 return false;
409
410 // Each of these offsets is in element sized units, so we need to convert
411 // to bytes of the individual reads.
412
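// Worked example (values assumed): a ds_read2_b32 with offset0 = 4 and
// offset1 = 5 reads two consecutive 4-byte elements, so it is treated as a
// single access at byte offset EltSize * Offset0 = 4 * 4 = 16, and the st64
// forms would further scale EltSize by 64.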
413 unsigned EltSize;
414 if (LdSt.mayLoad())
415 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
416 else {
417 assert(LdSt.mayStore());
418 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
419 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
420 }
421
422 if (isStride64(Opc))
423 EltSize *= 64;
424
425 BaseOps.push_back(BaseOp);
426 Offset = EltSize * Offset0;
427 // Get appropriate operand(s), and compute width accordingly.
428 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
429 if (DataOpIdx == -1) {
430 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
431 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
432 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
433 Width = LocationSize::precise(
434 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
435 } else {
436 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
437 }
438 }
439 return true;
440 }
441
442 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
443 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
444 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
445 return false;
446 BaseOps.push_back(RSrc);
447 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
448 if (BaseOp && !BaseOp->isFI())
449 BaseOps.push_back(BaseOp);
450 const MachineOperand *OffsetImm =
451 getNamedOperand(LdSt, AMDGPU::OpName::offset);
452 Offset = OffsetImm->getImm();
453 const MachineOperand *SOffset =
454 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
455 if (SOffset) {
456 if (SOffset->isReg())
457 BaseOps.push_back(SOffset);
458 else
459 Offset += SOffset->getImm();
460 }
461 // Get appropriate operand, and compute width accordingly.
462 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
463 if (DataOpIdx == -1)
464 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
465 if (DataOpIdx == -1) // LDS DMA
466 return false;
467 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
468 return true;
469 }
470
471 if (isImage(LdSt)) {
472 auto RsrcOpName =
473 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
474 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
475 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
476 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
477 if (VAddr0Idx >= 0) {
478 // GFX10 possible NSA encoding.
479 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
480 BaseOps.push_back(&LdSt.getOperand(I));
481 } else {
482 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
483 }
484 Offset = 0;
485 // Get appropriate operand, and compute width accordingly.
486 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
487 if (DataOpIdx == -1)
488 return false; // no return sampler
489 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
490 return true;
491 }
492
493 if (isSMRD(LdSt)) {
494 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
495 if (!BaseOp) // e.g. S_MEMTIME
496 return false;
497 BaseOps.push_back(BaseOp);
498 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
499 Offset = OffsetOp ? OffsetOp->getImm() : 0;
500 // Get appropriate operand, and compute width accordingly.
501 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
502 if (DataOpIdx == -1)
503 return false;
504 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
505 return true;
506 }
507
508 if (isFLAT(LdSt)) {
509 // Instructions have either vaddr or saddr or both or none.
510 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
511 if (BaseOp)
512 BaseOps.push_back(BaseOp);
513 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
514 if (BaseOp)
515 BaseOps.push_back(BaseOp);
516 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
517 // Get appropriate operand, and compute width accordingly.
518 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
519 if (DataOpIdx == -1)
520 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
521 if (DataOpIdx == -1) // LDS DMA
522 return false;
523 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
524 return true;
525 }
526
527 return false;
528}
529
530static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
531 ArrayRef<const MachineOperand *> BaseOps1,
532 const MachineInstr &MI2,
533 ArrayRef<const MachineOperand *> BaseOps2) {
534 // Only examine the first "base" operand of each instruction, on the
535 // assumption that it represents the real base address of the memory access.
536 // Other operands are typically offsets or indices from this base address.
537 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
538 return true;
539
540 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
541 return false;
542
543 auto *MO1 = *MI1.memoperands_begin();
544 auto *MO2 = *MI2.memoperands_begin();
545 if (MO1->getAddrSpace() != MO2->getAddrSpace())
546 return false;
547
548 const auto *Base1 = MO1->getValue();
549 const auto *Base2 = MO2->getValue();
550 if (!Base1 || !Base2)
551 return false;
552 Base1 = getUnderlyingObject(Base1);
553 Base2 = getUnderlyingObject(Base2);
554
555 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
556 return false;
557
558 return Base1 == Base2;
559}
560
561bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
562 int64_t Offset1, bool OffsetIsScalable1,
563 ArrayRef<const MachineOperand *> BaseOps2,
564 int64_t Offset2, bool OffsetIsScalable2,
565 unsigned ClusterSize,
566 unsigned NumBytes) const {
567 // If the mem ops (to be clustered) do not have the same base ptr, then they
568 // should not be clustered
569 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
570 if (!BaseOps1.empty() && !BaseOps2.empty()) {
571 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
572 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
573 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
574 return false;
575
576 const SIMachineFunctionInfo *MFI =
577 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
578 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
579 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
580 // If only one base op is empty, they do not have the same base ptr
581 return false;
582 }
583
584 // To avoid register pressure, on average the number of DWORDs
585 // loaded together by all clustered mem ops should not exceed
586 // MaxMemoryClusterDWords. This is an empirical value based on certain
587 // observations and performance related experiments.
588 // The benefit of this heuristic is that it avoids clustering too many
589 // sub-word loads and also avoids clustering wide loads. Below is a
590 // brief summary of how the heuristic behaves for various `LoadSize` when
591 // MaxMemoryClusterDWords is 8.
592 //
593 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
594 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
595 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
596 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
597 // (5) LoadSize >= 17: do not cluster
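// Worked example (assuming MaxMemoryClusterDWords == 8): four dwordx2 loads
// give NumBytes = 32 and ClusterSize = 4, so LoadSize = 8 and
// NumDWords = ((8 + 3) / 4) * 4 = 8, which is still allowed; a fifth such
// load would push NumDWords to 10 and stop the clustering.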
598 const unsigned LoadSize = NumBytes / ClusterSize;
599 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
600 return NumDWords <= MaxMemoryClusterDWords;
601}
602
603// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
604// the first 16 loads will be interleaved with the stores, and the next 16 will
605// be clustered as expected. It should really split into 2 16 store batches.
606//
607// Loads are clustered until this returns false, rather than trying to schedule
608// groups of stores. This also means we have to deal with saying different
609// address space loads should be clustered, and ones which might cause bank
610// conflicts.
611//
612// This might be deprecated so it might not be worth that much effort to fix.
613bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
614 int64_t Offset0, int64_t Offset1,
615 unsigned NumLoads) const {
616 assert(Offset1 > Offset0 &&
617 "Second offset should be larger than first offset!");
618 // If we have fewer than 16 loads in a row, and the offsets are within 64
619 // bytes, then schedule together.
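// For example (offsets assumed): loads at offsets 0 and 48 with NumLoads = 4
// fall in the same 64-byte window and are kept together, while offsets 0 and
// 128 are not.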
620
621 // A cacheline is 64 bytes (for global memory).
622 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
623}
624
625static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
626 MachineBasicBlock::iterator MI,
627 const DebugLoc &DL, MCRegister DestReg,
628 MCRegister SrcReg, bool KillSrc,
629 const char *Msg = "illegal VGPR to SGPR copy") {
630 MachineFunction *MF = MBB.getParent();
631
632 LLVMContext &C = MF->getFunction().getContext();
633 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
634
635 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
636 .addReg(SrcReg, getKillRegState(KillSrc));
637}
638
639/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
640/// possible to have a direct copy in these cases on GFX908, so an intermediate
641/// VGPR copy is required.
642static void indirectCopyToAGPR(const SIInstrInfo &TII,
643 MachineBasicBlock &MBB,
644 MachineBasicBlock::iterator MI,
645 const DebugLoc &DL, MCRegister DestReg,
646 MCRegister SrcReg, bool KillSrc,
647 RegScavenger &RS, bool RegsOverlap,
648 Register ImpDefSuperReg = Register(),
649 Register ImpUseSuperReg = Register()) {
650 assert((TII.getSubtarget().hasMAIInsts() &&
651 !TII.getSubtarget().hasGFX90AInsts()) &&
652 "Expected GFX908 subtarget.");
653
654 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
655 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
656 "Source register of the copy should be either an SGPR or an AGPR.");
657
658 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
659 "Destination register of the copy should be an AGPR.");
660
661 const SIRegisterInfo &RI = TII.getRegisterInfo();
662
663 // First try to find defining accvgpr_write to avoid temporary registers.
664 // In the case of copies of overlapping AGPRs, we conservatively do not
665 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
666 // an accvgpr_write used for this same copy due to implicit-defs
667 if (!RegsOverlap) {
668 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
669 --Def;
670
671 if (!Def->modifiesRegister(SrcReg, &RI))
672 continue;
673
674 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
675 Def->getOperand(0).getReg() != SrcReg)
676 break;
677
678 MachineOperand &DefOp = Def->getOperand(1);
679 assert(DefOp.isReg() || DefOp.isImm());
680
681 if (DefOp.isReg()) {
682 bool SafeToPropagate = true;
683 // Check that register source operand is not clobbered before MI.
684 // Immediate operands are always safe to propagate.
685 for (auto I = Def; I != MI && SafeToPropagate; ++I)
686 if (I->modifiesRegister(DefOp.getReg(), &RI))
687 SafeToPropagate = false;
688
689 if (!SafeToPropagate)
690 break;
691
692 for (auto I = Def; I != MI; ++I)
693 I->clearRegisterKills(DefOp.getReg(), &RI);
694 }
695
696 MachineInstrBuilder Builder =
697 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
698 .add(DefOp);
699 if (ImpDefSuperReg)
700 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
701
702 if (ImpUseSuperReg) {
703 Builder.addReg(ImpUseSuperReg,
704 getKillRegState(KillSrc) | RegState::Implicit);
705 }
706
707 return;
708 }
709 }
710
711 RS.enterBasicBlockEnd(MBB);
712 RS.backward(std::next(MI));
713
714 // Ideally we want to have three registers for a long reg_sequence copy
715 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
716 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
717 *MBB.getParent());
718
719 // Registers in the sequence are allocated contiguously so we can just
720 // use register number to pick one of three round-robin temps.
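// Illustrative example: a reg_sequence copy landing in a3, a4, a5 maps to
// RegNo 0, 1, 2 respectively, so consecutive accvgpr_writes can draw on up
// to three different temporary VGPRs and hide the intervening wait states.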
721 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
722 Register Tmp =
723 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
724 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
725 "VGPR used for an intermediate copy should have been reserved.");
726
727 // Only loop through if there are any free registers left. We don't want to
728 // spill.
729 while (RegNo--) {
730 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
731 /* RestoreAfter */ false, 0,
732 /* AllowSpill */ false);
733 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
734 break;
735 Tmp = Tmp2;
736 RS.setRegUsed(Tmp);
737 }
738
739 // Insert copy to temporary VGPR.
740 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
741 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
742 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
743 } else {
744 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
745 }
746
747 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
748 .addReg(SrcReg, getKillRegState(KillSrc));
749 if (ImpUseSuperReg) {
750 UseBuilder.addReg(ImpUseSuperReg,
751 getKillRegState(KillSrc) | RegState::Implicit);
752 }
753
754 MachineInstrBuilder DefBuilder
755 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
756 .addReg(Tmp, RegState::Kill);
757
758 if (ImpDefSuperReg)
759 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
760}
761
762static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
763 MachineBasicBlock::iterator I, const DebugLoc &DL,
764 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
765 const TargetRegisterClass *RC, bool Forward) {
766 const SIRegisterInfo &RI = TII.getRegisterInfo();
767 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
769 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
770
771 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
772 int16_t SubIdx = BaseIndices[Idx];
773 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
774 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
775 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
776 unsigned Opcode = AMDGPU::S_MOV_B32;
777
778 // Is SGPR aligned? If so try to combine with next.
779 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
780 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
781 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
782 // Can use SGPR64 copy
783 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
784 SubIdx = RI.getSubRegFromChannel(Channel, 2);
785 DestSubReg = RI.getSubReg(DestReg, SubIdx);
786 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
787 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
788 Opcode = AMDGPU::S_MOV_B64;
789 Idx++;
790 }
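// Illustrative example: copying s[4:7] to s[0:3], where both halves are
// 64-bit aligned, is emitted as two S_MOV_B64 instructions instead of four
// S_MOV_B32.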
791
792 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
793 .addReg(SrcSubReg)
794 .addReg(SrcReg, RegState::Implicit);
795
796 if (!FirstMI)
797 FirstMI = LastMI;
798
799 if (!Forward)
800 I--;
801 }
802
803 assert(FirstMI && LastMI);
804 if (!Forward)
805 std::swap(FirstMI, LastMI);
806
807 FirstMI->addOperand(
808 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
809
810 if (KillSrc)
811 LastMI->addRegisterKilled(SrcReg, &RI);
812}
813
814void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
815 MachineBasicBlock::iterator MI,
816 const DebugLoc &DL, Register DestReg,
817 Register SrcReg, bool KillSrc, bool RenamableDest,
818 bool RenamableSrc) const {
819 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
820 unsigned Size = RI.getRegSizeInBits(*RC);
821 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
822 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
823
824 // The rest of copyPhysReg assumes Src and Dst size are the same size.
825 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
826 // we remove Fix16BitCopies and this code block?
827 if (Fix16BitCopies) {
828 if (((Size == 16) != (SrcSize == 16))) {
829 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
830 assert(ST.useRealTrue16Insts());
831 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
832 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
833 RegToFix = SubReg;
834
835 if (DestReg == SrcReg) {
836 // Identity copy. Insert empty bundle since ExpandPostRA expects an
837 // instruction here.
838 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
839 return;
840 }
841 RC = RI.getPhysRegBaseClass(DestReg);
842 Size = RI.getRegSizeInBits(*RC);
843 SrcRC = RI.getPhysRegBaseClass(SrcReg);
844 SrcSize = RI.getRegSizeInBits(*SrcRC);
845 }
846 }
847
848 if (RC == &AMDGPU::VGPR_32RegClass) {
849 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
850 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
851 AMDGPU::AGPR_32RegClass.contains(SrcReg));
852 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
853 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
854 BuildMI(MBB, MI, DL, get(Opc), DestReg)
855 .addReg(SrcReg, getKillRegState(KillSrc));
856 return;
857 }
858
859 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
860 RC == &AMDGPU::SReg_32RegClass) {
861 if (SrcReg == AMDGPU::SCC) {
862 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
863 .addImm(1)
864 .addImm(0);
865 return;
866 }
867
868 if (DestReg == AMDGPU::VCC_LO) {
869 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
870 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
871 .addReg(SrcReg, getKillRegState(KillSrc));
872 } else {
873 // FIXME: Hack until VReg_1 removed.
874 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
875 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
876 .addImm(0)
877 .addReg(SrcReg, getKillRegState(KillSrc));
878 }
879
880 return;
881 }
882
883 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
884 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
885 return;
886 }
887
888 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
889 .addReg(SrcReg, getKillRegState(KillSrc));
890 return;
891 }
892
893 if (RC == &AMDGPU::SReg_64RegClass) {
894 if (SrcReg == AMDGPU::SCC) {
895 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
896 .addImm(1)
897 .addImm(0);
898 return;
899 }
900
901 if (DestReg == AMDGPU::VCC) {
902 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
903 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
904 .addReg(SrcReg, getKillRegState(KillSrc));
905 } else {
906 // FIXME: Hack until VReg_1 removed.
907 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
908 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
909 .addImm(0)
910 .addReg(SrcReg, getKillRegState(KillSrc));
911 }
912
913 return;
914 }
915
916 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
917 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
918 return;
919 }
920
921 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
922 .addReg(SrcReg, getKillRegState(KillSrc));
923 return;
924 }
925
926 if (DestReg == AMDGPU::SCC) {
927 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
928 // but SelectionDAG emits such copies for i1 sources.
929 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
930 // This copy can only be produced by patterns
931 // with explicit SCC, which are known to be enabled
932 // only for subtargets with S_CMP_LG_U64 present.
933 assert(ST.hasScalarCompareEq64());
934 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
935 .addReg(SrcReg, getKillRegState(KillSrc))
936 .addImm(0);
937 } else {
938 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
939 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
940 .addReg(SrcReg, getKillRegState(KillSrc))
941 .addImm(0);
942 }
943
944 return;
945 }
946
947 if (RC == &AMDGPU::AGPR_32RegClass) {
948 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
949 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
950 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
951 .addReg(SrcReg, getKillRegState(KillSrc));
952 return;
953 }
954
955 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
956 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
957 .addReg(SrcReg, getKillRegState(KillSrc));
958 return;
959 }
960
961 // FIXME: Pass should maintain scavenger to avoid scan through the block on
962 // every AGPR spill.
963 RegScavenger RS;
964 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
965 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
966 return;
967 }
968
969 if (Size == 16) {
970 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
971 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
972 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
973
974 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
975 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
976 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
977 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
978 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
979 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
980 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
981 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
982
983 if (IsSGPRDst) {
984 if (!IsSGPRSrc) {
985 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
986 return;
987 }
988
989 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
990 .addReg(NewSrcReg, getKillRegState(KillSrc));
991 return;
992 }
993
994 if (IsAGPRDst || IsAGPRSrc) {
995 if (!DstLow || !SrcLow) {
996 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
997 "Cannot use hi16 subreg with an AGPR!");
998 }
999
1000 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
1001 return;
1002 }
1003
1004 if (ST.useRealTrue16Insts()) {
1005 if (IsSGPRSrc) {
1006 assert(SrcLow);
1007 SrcReg = NewSrcReg;
1008 }
1009 // Use the smaller instruction encoding if possible.
1010 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
1011 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1012 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1013 .addReg(SrcReg);
1014 } else {
1015 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1016 .addImm(0) // src0_modifiers
1017 .addReg(SrcReg)
1018 .addImm(0); // op_sel
1019 }
1020 return;
1021 }
1022
1023 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1024 if (!DstLow || !SrcLow) {
1025 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1026 "Cannot use hi16 subreg on VI!");
1027 }
1028
1029 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1030 .addReg(NewSrcReg, getKillRegState(KillSrc));
1031 return;
1032 }
1033
1034 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1035 .addImm(0) // src0_modifiers
1036 .addReg(NewSrcReg)
1037 .addImm(0) // clamp
1044 // First implicit operand is $exec.
1045 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1046 return;
1047 }
1048
1049 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1050 if (ST.hasMovB64()) {
1051 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1052 .addReg(SrcReg, getKillRegState(KillSrc));
1053 return;
1054 }
1055 if (ST.hasPkMovB32()) {
1056 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1058 .addReg(SrcReg)
1060 .addReg(SrcReg)
1061 .addImm(0) // op_sel_lo
1062 .addImm(0) // op_sel_hi
1063 .addImm(0) // neg_lo
1064 .addImm(0) // neg_hi
1065 .addImm(0) // clamp
1066 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1067 return;
1068 }
1069 }
1070
1071 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1072 if (RI.isSGPRClass(RC)) {
1073 if (!RI.isSGPRClass(SrcRC)) {
1074 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1075 return;
1076 }
1077 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1078 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1079 Forward);
1080 return;
1081 }
1082
1083 unsigned EltSize = 4;
1084 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1085 if (RI.isAGPRClass(RC)) {
1086 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1087 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1088 else if (RI.hasVGPRs(SrcRC) ||
1089 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1090 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1091 else
1092 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1093 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1094 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1095 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1096 (RI.isProperlyAlignedRC(*RC) &&
1097 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1098 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1099 if (ST.hasMovB64()) {
1100 Opcode = AMDGPU::V_MOV_B64_e32;
1101 EltSize = 8;
1102 } else if (ST.hasPkMovB32()) {
1103 Opcode = AMDGPU::V_PK_MOV_B32;
1104 EltSize = 8;
1105 }
1106 }
1107
1108 // For the cases where we need an intermediate instruction/temporary register
1109 // (destination is an AGPR), we need a scavenger.
1110 //
1111 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1112 // whole block for every handled copy.
1113 std::unique_ptr<RegScavenger> RS;
1114 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1115 RS = std::make_unique<RegScavenger>();
1116
1117 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1118
1119 // If there is an overlap, we can't kill the super-register on the last
1120 // instruction, since it will also kill the components made live by this def.
1121 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1122 const bool CanKillSuperReg = KillSrc && !Overlap;
1123
1124 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1125 unsigned SubIdx;
1126 if (Forward)
1127 SubIdx = SubIndices[Idx];
1128 else
1129 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1130 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1131 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1132 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1133
1134 bool IsFirstSubreg = Idx == 0;
1135 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1136
1137 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1138 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1139 Register ImpUseSuper = SrcReg;
1140 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1141 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1142 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1144 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1146 .addReg(SrcSubReg)
1148 .addReg(SrcSubReg)
1149 .addImm(0) // op_sel_lo
1150 .addImm(0) // op_sel_hi
1151 .addImm(0) // neg_lo
1152 .addImm(0) // neg_hi
1153 .addImm(0) // clamp
1154 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1155 if (IsFirstSubreg)
1157 } else {
1158 MachineInstrBuilder Builder =
1159 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1160 if (IsFirstSubreg)
1161 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1162
1163 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1164 }
1165 }
1166}
1167
1168int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1169 int NewOpc;
1170
1171 // Try to map original to commuted opcode
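 // For example (mapping assumed): a REV form such as V_SUBREV_F32 is the
 // commuted partner of V_SUB_F32, so commuting swaps between the two when the
 // counterpart exists on the target.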
1172 NewOpc = AMDGPU::getCommuteRev(Opcode);
1173 if (NewOpc != -1)
1174 // Check if the commuted (REV) opcode exists on the target.
1175 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1176
1177 // Try to map commuted to original opcode
1178 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1179 if (NewOpc != -1)
1180 // Check if the original (non-REV) opcode exists on the target.
1181 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1182
1183 return Opcode;
1184}
1185
1186const TargetRegisterClass *
1188 return &AMDGPU::VGPR_32RegClass;
1189}
1190
1191void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1192 MachineBasicBlock::iterator I,
1193 const DebugLoc &DL, Register DstReg,
1194 ArrayRef<MachineOperand> Cond,
1195 Register TrueReg,
1196 Register FalseReg) const {
1197 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1198 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1200 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1201 "Not a VGPR32 reg");
1202
1203 if (Cond.size() == 1) {
1204 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1205 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1206 .add(Cond[0]);
1207 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1208 .addImm(0)
1209 .addReg(FalseReg)
1210 .addImm(0)
1211 .addReg(TrueReg)
1212 .addReg(SReg);
1213 } else if (Cond.size() == 2) {
1214 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1215 switch (Cond[0].getImm()) {
1216 case SIInstrInfo::SCC_TRUE: {
1217 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1218 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1219 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1220 .addImm(0)
1221 .addReg(FalseReg)
1222 .addImm(0)
1223 .addReg(TrueReg)
1224 .addReg(SReg);
1225 break;
1226 }
1227 case SIInstrInfo::SCC_FALSE: {
1228 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1229 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1230 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1231 .addImm(0)
1232 .addReg(FalseReg)
1233 .addImm(0)
1234 .addReg(TrueReg)
1235 .addReg(SReg);
1236 break;
1237 }
1238 case SIInstrInfo::VCCNZ: {
1239 MachineOperand RegOp = Cond[1];
1240 RegOp.setImplicit(false);
1241 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1242 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1243 .add(RegOp);
1244 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1245 .addImm(0)
1246 .addReg(FalseReg)
1247 .addImm(0)
1248 .addReg(TrueReg)
1249 .addReg(SReg);
1250 break;
1251 }
1252 case SIInstrInfo::VCCZ: {
1253 MachineOperand RegOp = Cond[1];
1254 RegOp.setImplicit(false);
1255 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1256 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1257 .add(RegOp);
1258 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1259 .addImm(0)
1260 .addReg(TrueReg)
1261 .addImm(0)
1262 .addReg(FalseReg)
1263 .addReg(SReg);
1264 break;
1265 }
1266 case SIInstrInfo::EXECNZ: {
1267 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1268 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1269 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1270 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1271 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1272 .addImm(0)
1273 .addReg(FalseReg)
1274 .addImm(0)
1275 .addReg(TrueReg)
1276 .addReg(SReg);
1277 break;
1278 }
1279 case SIInstrInfo::EXECZ: {
1280 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1281 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1282 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1283 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1284 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1285 .addImm(0)
1286 .addReg(FalseReg)
1287 .addImm(0)
1288 .addReg(TrueReg)
1289 .addReg(SReg);
1290 llvm_unreachable("Unhandled branch predicate EXECZ");
1291 break;
1292 }
1293 default:
1294 llvm_unreachable("invalid branch predicate");
1295 }
1296 } else {
1297 llvm_unreachable("Can only handle Cond size 1 or 2");
1298 }
1299}
1300
1301Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1302 MachineBasicBlock::iterator I,
1303 const DebugLoc &DL,
1304 Register SrcReg, int Value) const {
1305 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1306 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1307 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1308 .addImm(Value)
1309 .addReg(SrcReg);
1310
1311 return Reg;
1312}
1313
1314Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1315 MachineBasicBlock::iterator I,
1316 const DebugLoc &DL,
1317 Register SrcReg, int Value) const {
1318 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1319 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1320 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1321 .addImm(Value)
1322 .addReg(SrcReg);
1323
1324 return Reg;
1325}
1326
1327bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1328 const Register Reg,
1329 int64_t &ImmVal) const {
1330 switch (MI.getOpcode()) {
1331 case AMDGPU::V_MOV_B32_e32:
1332 case AMDGPU::S_MOV_B32:
1333 case AMDGPU::S_MOVK_I32:
1334 case AMDGPU::S_MOV_B64:
1335 case AMDGPU::V_MOV_B64_e32:
1336 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1337 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1338 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1339 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1340 case AMDGPU::V_MOV_B64_PSEUDO: {
1341 const MachineOperand &Src0 = MI.getOperand(1);
1342 if (Src0.isImm()) {
1343 ImmVal = Src0.getImm();
1344 return MI.getOperand(0).getReg() == Reg;
1345 }
1346
1347 return false;
1348 }
1349 case AMDGPU::S_BREV_B32:
1350 case AMDGPU::V_BFREV_B32_e32:
1351 case AMDGPU::V_BFREV_B32_e64: {
1352 const MachineOperand &Src0 = MI.getOperand(1);
1353 if (Src0.isImm()) {
1354 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1355 return MI.getOperand(0).getReg() == Reg;
1356 }
1357
1358 return false;
1359 }
1360 case AMDGPU::S_NOT_B32:
1361 case AMDGPU::V_NOT_B32_e32:
1362 case AMDGPU::V_NOT_B32_e64: {
1363 const MachineOperand &Src0 = MI.getOperand(1);
1364 if (Src0.isImm()) {
1365 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1366 return MI.getOperand(0).getReg() == Reg;
1367 }
1368
1369 return false;
1370 }
1371 default:
1372 return false;
1373 }
1374}
1375
1376unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1377
1378 if (RI.isAGPRClass(DstRC))
1379 return AMDGPU::COPY;
1380 if (RI.getRegSizeInBits(*DstRC) == 16) {
1381 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1382 // before RA.
1383 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1384 }
1385 if (RI.getRegSizeInBits(*DstRC) == 32)
1386 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1387 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1388 return AMDGPU::S_MOV_B64;
1389 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1390 return AMDGPU::V_MOV_B64_PSEUDO;
1391 return AMDGPU::COPY;
1392}
1393
1394const MCInstrDesc &
1395SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1396 bool IsIndirectSrc) const {
1397 if (IsIndirectSrc) {
1398 if (VecSize <= 32) // 4 bytes
1399 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1400 if (VecSize <= 64) // 8 bytes
1401 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1402 if (VecSize <= 96) // 12 bytes
1403 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1404 if (VecSize <= 128) // 16 bytes
1405 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1406 if (VecSize <= 160) // 20 bytes
1407 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1408 if (VecSize <= 256) // 32 bytes
1409 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1410 if (VecSize <= 288) // 36 bytes
1411 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1412 if (VecSize <= 320) // 40 bytes
1413 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1414 if (VecSize <= 352) // 44 bytes
1415 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1416 if (VecSize <= 384) // 48 bytes
1417 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1418 if (VecSize <= 512) // 64 bytes
1419 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1420 if (VecSize <= 1024) // 128 bytes
1421 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1422
1423 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1424 }
1425
1426 if (VecSize <= 32) // 4 bytes
1427 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1428 if (VecSize <= 64) // 8 bytes
1429 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1430 if (VecSize <= 96) // 12 bytes
1431 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1432 if (VecSize <= 128) // 16 bytes
1433 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1434 if (VecSize <= 160) // 20 bytes
1435 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1436 if (VecSize <= 256) // 32 bytes
1437 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1438 if (VecSize <= 288) // 36 bytes
1439 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1440 if (VecSize <= 320) // 40 bytes
1441 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1442 if (VecSize <= 352) // 44 bytes
1443 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1444 if (VecSize <= 384) // 48 bytes
1445 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1446 if (VecSize <= 512) // 64 bytes
1447 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1448 if (VecSize <= 1024) // 128 bytes
1449 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1450
1451 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1452}
1453
1454static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1455 if (VecSize <= 32) // 4 bytes
1456 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1457 if (VecSize <= 64) // 8 bytes
1458 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1459 if (VecSize <= 96) // 12 bytes
1460 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1461 if (VecSize <= 128) // 16 bytes
1462 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1463 if (VecSize <= 160) // 20 bytes
1464 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1465 if (VecSize <= 256) // 32 bytes
1466 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1467 if (VecSize <= 288) // 36 bytes
1468 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1469 if (VecSize <= 320) // 40 bytes
1470 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1471 if (VecSize <= 352) // 44 bytes
1472 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1473 if (VecSize <= 384) // 48 bytes
1474 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1475 if (VecSize <= 512) // 64 bytes
1476 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1477 if (VecSize <= 1024) // 128 bytes
1478 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1479
1480 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1481}
1482
1483static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1484 if (VecSize <= 32) // 4 bytes
1485 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1486 if (VecSize <= 64) // 8 bytes
1487 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1488 if (VecSize <= 96) // 12 bytes
1489 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1490 if (VecSize <= 128) // 16 bytes
1491 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1492 if (VecSize <= 160) // 20 bytes
1493 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1494 if (VecSize <= 256) // 32 bytes
1495 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1496 if (VecSize <= 288) // 36 bytes
1497 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1498 if (VecSize <= 320) // 40 bytes
1499 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1500 if (VecSize <= 352) // 44 bytes
1501 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1502 if (VecSize <= 384) // 48 bytes
1503 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1504 if (VecSize <= 512) // 64 bytes
1505 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1506 if (VecSize <= 1024) // 128 bytes
1507 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1508
1509 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1510}
1511
1512static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1513 if (VecSize <= 64) // 8 bytes
1514 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1515 if (VecSize <= 128) // 16 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1517 if (VecSize <= 256) // 32 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1519 if (VecSize <= 512) // 64 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1521 if (VecSize <= 1024) // 128 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1523
1524 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1525}
1526
1527const MCInstrDesc &
1528SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1529 bool IsSGPR) const {
1530 if (IsSGPR) {
1531 switch (EltSize) {
1532 case 32:
1533 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1534 case 64:
1535 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1536 default:
1537 llvm_unreachable("invalid reg indexing elt size");
1538 }
1539 }
1540
1541 assert(EltSize == 32 && "invalid reg indexing elt size");
1542 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1543}
1544
1545static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1546 switch (Size) {
1547 case 4:
1548 return AMDGPU::SI_SPILL_S32_SAVE;
1549 case 8:
1550 return AMDGPU::SI_SPILL_S64_SAVE;
1551 case 12:
1552 return AMDGPU::SI_SPILL_S96_SAVE;
1553 case 16:
1554 return AMDGPU::SI_SPILL_S128_SAVE;
1555 case 20:
1556 return AMDGPU::SI_SPILL_S160_SAVE;
1557 case 24:
1558 return AMDGPU::SI_SPILL_S192_SAVE;
1559 case 28:
1560 return AMDGPU::SI_SPILL_S224_SAVE;
1561 case 32:
1562 return AMDGPU::SI_SPILL_S256_SAVE;
1563 case 36:
1564 return AMDGPU::SI_SPILL_S288_SAVE;
1565 case 40:
1566 return AMDGPU::SI_SPILL_S320_SAVE;
1567 case 44:
1568 return AMDGPU::SI_SPILL_S352_SAVE;
1569 case 48:
1570 return AMDGPU::SI_SPILL_S384_SAVE;
1571 case 64:
1572 return AMDGPU::SI_SPILL_S512_SAVE;
1573 case 128:
1574 return AMDGPU::SI_SPILL_S1024_SAVE;
1575 default:
1576 llvm_unreachable("unknown register size");
1577 }
1578}
1579
1580static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1581 switch (Size) {
1582 case 2:
1583 return AMDGPU::SI_SPILL_V16_SAVE;
1584 case 4:
1585 return AMDGPU::SI_SPILL_V32_SAVE;
1586 case 8:
1587 return AMDGPU::SI_SPILL_V64_SAVE;
1588 case 12:
1589 return AMDGPU::SI_SPILL_V96_SAVE;
1590 case 16:
1591 return AMDGPU::SI_SPILL_V128_SAVE;
1592 case 20:
1593 return AMDGPU::SI_SPILL_V160_SAVE;
1594 case 24:
1595 return AMDGPU::SI_SPILL_V192_SAVE;
1596 case 28:
1597 return AMDGPU::SI_SPILL_V224_SAVE;
1598 case 32:
1599 return AMDGPU::SI_SPILL_V256_SAVE;
1600 case 36:
1601 return AMDGPU::SI_SPILL_V288_SAVE;
1602 case 40:
1603 return AMDGPU::SI_SPILL_V320_SAVE;
1604 case 44:
1605 return AMDGPU::SI_SPILL_V352_SAVE;
1606 case 48:
1607 return AMDGPU::SI_SPILL_V384_SAVE;
1608 case 64:
1609 return AMDGPU::SI_SPILL_V512_SAVE;
1610 case 128:
1611 return AMDGPU::SI_SPILL_V1024_SAVE;
1612 default:
1613 llvm_unreachable("unknown register size");
1614 }
1615}
1616
1617static unsigned getAVSpillSaveOpcode(unsigned Size) {
1618 switch (Size) {
1619 case 4:
1620 return AMDGPU::SI_SPILL_AV32_SAVE;
1621 case 8:
1622 return AMDGPU::SI_SPILL_AV64_SAVE;
1623 case 12:
1624 return AMDGPU::SI_SPILL_AV96_SAVE;
1625 case 16:
1626 return AMDGPU::SI_SPILL_AV128_SAVE;
1627 case 20:
1628 return AMDGPU::SI_SPILL_AV160_SAVE;
1629 case 24:
1630 return AMDGPU::SI_SPILL_AV192_SAVE;
1631 case 28:
1632 return AMDGPU::SI_SPILL_AV224_SAVE;
1633 case 32:
1634 return AMDGPU::SI_SPILL_AV256_SAVE;
1635 case 36:
1636 return AMDGPU::SI_SPILL_AV288_SAVE;
1637 case 40:
1638 return AMDGPU::SI_SPILL_AV320_SAVE;
1639 case 44:
1640 return AMDGPU::SI_SPILL_AV352_SAVE;
1641 case 48:
1642 return AMDGPU::SI_SPILL_AV384_SAVE;
1643 case 64:
1644 return AMDGPU::SI_SPILL_AV512_SAVE;
1645 case 128:
1646 return AMDGPU::SI_SPILL_AV1024_SAVE;
1647 default:
1648 llvm_unreachable("unknown register size");
1649 }
1650}
1651
1652static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1653 bool IsVectorSuperClass) {
1654 // Currently, only 32-bit WWM register spills are needed.
1655 if (Size != 4)
1656 llvm_unreachable("unknown wwm register spill size");
1657
1658 if (IsVectorSuperClass)
1659 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1660
1661 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1662}
1663
1664unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1665 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1666 const SIMachineFunctionInfo &MFI) const {
1667 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1668
1669 // Choose the right opcode if spilling a WWM register.
1670 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1671 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1672
1673 // TODO: Check if AGPRs are available
1674 if (ST.hasMAIInsts())
1675 return getAVSpillSaveOpcode(Size);
1676
1677 return getVGPRSpillSaveOpcode(Size);
1678}
1679
1680void SIInstrInfo::storeRegToStackSlot(
1681 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1682 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1683 const TargetRegisterInfo *TRI, Register VReg,
1684 MachineInstr::MIFlag Flags) const {
1685 MachineFunction *MF = MBB.getParent();
1686 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1687 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1688 const DebugLoc &DL = MBB.findDebugLoc(MI);
1689
1690 MachinePointerInfo PtrInfo
1691 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1692 MachineMemOperand *MMO = MF->getMachineMemOperand(
1693 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1694 FrameInfo.getObjectAlign(FrameIndex));
1695 unsigned SpillSize = TRI->getSpillSize(*RC);
1696
1697 MachineRegisterInfo &MRI = MF->getRegInfo();
1698 if (RI.isSGPRClass(RC)) {
1699 MFI->setHasSpilledSGPRs();
1700 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1701 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1702 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1703
1704 // We are only allowed to create one new instruction when spilling
1705 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1706 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1707
1708 // The SGPR spill/restore instructions only work on numbered SGPRs, so we
1709 // need to make sure we are using the correct register class.
1710 if (SrcReg.isVirtual() && SpillSize == 4) {
1711 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1712 }
1713
1714 BuildMI(MBB, MI, DL, OpDesc)
1715 .addReg(SrcReg, getKillRegState(isKill)) // data
1716 .addFrameIndex(FrameIndex) // addr
1717 .addMemOperand(MMO)
1718 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1719
1720 if (RI.spillSGPRToVGPR())
1721 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1722 return;
1723 }
1724
1725 unsigned Opcode =
1726 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1727 MFI->setHasSpilledVGPRs();
1728
1729 BuildMI(MBB, MI, DL, get(Opcode))
1730 .addReg(SrcReg, getKillRegState(isKill)) // data
1731 .addFrameIndex(FrameIndex) // addr
1732 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1733 .addImm(0) // offset
1734 .addMemOperand(MMO);
1735}
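// Rough sketch of the MIR produced above for a plain VGPR spill (illustrative
// only; the register, stack slot and flags are examples, not taken from this
// file):
//
//   SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec
//       :: (store (s32) into %stack.0, addrspace 5)
//
// i.e. data, frame index, scratch offset register and a zero immediate offset,
// mirroring the operands added by the builder above.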
1736
1737static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1738 switch (Size) {
1739 case 4:
1740 return AMDGPU::SI_SPILL_S32_RESTORE;
1741 case 8:
1742 return AMDGPU::SI_SPILL_S64_RESTORE;
1743 case 12:
1744 return AMDGPU::SI_SPILL_S96_RESTORE;
1745 case 16:
1746 return AMDGPU::SI_SPILL_S128_RESTORE;
1747 case 20:
1748 return AMDGPU::SI_SPILL_S160_RESTORE;
1749 case 24:
1750 return AMDGPU::SI_SPILL_S192_RESTORE;
1751 case 28:
1752 return AMDGPU::SI_SPILL_S224_RESTORE;
1753 case 32:
1754 return AMDGPU::SI_SPILL_S256_RESTORE;
1755 case 36:
1756 return AMDGPU::SI_SPILL_S288_RESTORE;
1757 case 40:
1758 return AMDGPU::SI_SPILL_S320_RESTORE;
1759 case 44:
1760 return AMDGPU::SI_SPILL_S352_RESTORE;
1761 case 48:
1762 return AMDGPU::SI_SPILL_S384_RESTORE;
1763 case 64:
1764 return AMDGPU::SI_SPILL_S512_RESTORE;
1765 case 128:
1766 return AMDGPU::SI_SPILL_S1024_RESTORE;
1767 default:
1768 llvm_unreachable("unknown register size");
1769 }
1770}
1771
1772static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1773 switch (Size) {
1774 case 2:
1775 return AMDGPU::SI_SPILL_V16_RESTORE;
1776 case 4:
1777 return AMDGPU::SI_SPILL_V32_RESTORE;
1778 case 8:
1779 return AMDGPU::SI_SPILL_V64_RESTORE;
1780 case 12:
1781 return AMDGPU::SI_SPILL_V96_RESTORE;
1782 case 16:
1783 return AMDGPU::SI_SPILL_V128_RESTORE;
1784 case 20:
1785 return AMDGPU::SI_SPILL_V160_RESTORE;
1786 case 24:
1787 return AMDGPU::SI_SPILL_V192_RESTORE;
1788 case 28:
1789 return AMDGPU::SI_SPILL_V224_RESTORE;
1790 case 32:
1791 return AMDGPU::SI_SPILL_V256_RESTORE;
1792 case 36:
1793 return AMDGPU::SI_SPILL_V288_RESTORE;
1794 case 40:
1795 return AMDGPU::SI_SPILL_V320_RESTORE;
1796 case 44:
1797 return AMDGPU::SI_SPILL_V352_RESTORE;
1798 case 48:
1799 return AMDGPU::SI_SPILL_V384_RESTORE;
1800 case 64:
1801 return AMDGPU::SI_SPILL_V512_RESTORE;
1802 case 128:
1803 return AMDGPU::SI_SPILL_V1024_RESTORE;
1804 default:
1805 llvm_unreachable("unknown register size");
1806 }
1807}
1808
1809static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1810 switch (Size) {
1811 case 4:
1812 return AMDGPU::SI_SPILL_AV32_RESTORE;
1813 case 8:
1814 return AMDGPU::SI_SPILL_AV64_RESTORE;
1815 case 12:
1816 return AMDGPU::SI_SPILL_AV96_RESTORE;
1817 case 16:
1818 return AMDGPU::SI_SPILL_AV128_RESTORE;
1819 case 20:
1820 return AMDGPU::SI_SPILL_AV160_RESTORE;
1821 case 24:
1822 return AMDGPU::SI_SPILL_AV192_RESTORE;
1823 case 28:
1824 return AMDGPU::SI_SPILL_AV224_RESTORE;
1825 case 32:
1826 return AMDGPU::SI_SPILL_AV256_RESTORE;
1827 case 36:
1828 return AMDGPU::SI_SPILL_AV288_RESTORE;
1829 case 40:
1830 return AMDGPU::SI_SPILL_AV320_RESTORE;
1831 case 44:
1832 return AMDGPU::SI_SPILL_AV352_RESTORE;
1833 case 48:
1834 return AMDGPU::SI_SPILL_AV384_RESTORE;
1835 case 64:
1836 return AMDGPU::SI_SPILL_AV512_RESTORE;
1837 case 128:
1838 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1839 default:
1840 llvm_unreachable("unknown register size");
1841 }
1842}
1843
1844static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1845 bool IsVectorSuperClass) {
1846 // Currently, only 32-bit WWM register spills are needed.
1847 if (Size != 4)
1848 llvm_unreachable("unknown wwm register spill size");
1849
1850 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1851 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1852
1853 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1854}
1855
1856unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
1857 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1858 const SIMachineFunctionInfo &MFI) const {
1859 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1860
1861 // Choose the right opcode if restoring a WWM register.
1862 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1863 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1864
1865 // TODO: Check if AGPRs are available
1866 if (ST.hasMAIInsts())
1867 return getAVSpillRestoreOpcode(Size);
1868
1869 assert(!RI.isAGPRClass(RC));
1870 return getVGPRSpillRestoreOpcode(Size);
1871}
1872
1873void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1874 MachineBasicBlock::iterator MI,
1875 Register DestReg, int FrameIndex,
1876 const TargetRegisterClass *RC,
1877 const TargetRegisterInfo *TRI,
1878 Register VReg,
1879 MachineInstr::MIFlag Flags) const {
1880 MachineFunction *MF = MBB.getParent();
1881 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1882 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1883 const DebugLoc &DL = MBB.findDebugLoc(MI);
1884 unsigned SpillSize = TRI->getSpillSize(*RC);
1885
1886 MachinePointerInfo PtrInfo
1887 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1888
1889 MachineMemOperand *MMO = MF->getMachineMemOperand(
1890 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1891 FrameInfo.getObjectAlign(FrameIndex));
1892
1893 if (RI.isSGPRClass(RC)) {
1894 MFI->setHasSpilledSGPRs();
1895 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1896 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1897 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1898
1899 // FIXME: Maybe this should not include a memoperand because it will be
1900 // lowered to non-memory instructions.
1901 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1902 if (DestReg.isVirtual() && SpillSize == 4) {
1903 MachineRegisterInfo &MRI = MF->getRegInfo();
1904 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1905 }
1906
1907 if (RI.spillSGPRToVGPR())
1908 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1909 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1910 .addFrameIndex(FrameIndex) // addr
1911 .addMemOperand(MMO)
1912 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1913
1914 return;
1915 }
1916
1917 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1918 SpillSize, *MFI);
1919 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1920 .addFrameIndex(FrameIndex) // vaddr
1921 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1922 .addImm(0) // offset
1923 .addMemOperand(MMO);
1924}
1925
1926void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1927 MachineBasicBlock::iterator MI) const {
1928 insertNoops(MBB, MI, 1);
1929}
1930
1931void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
1932 MachineBasicBlock::iterator MI,
1933 unsigned Quantity) const {
1934 DebugLoc DL = MBB.findDebugLoc(MI);
1935 unsigned MaxSNopCount = 1u << ST.getSNopBits();
1936 while (Quantity > 0) {
1937 unsigned Arg = std::min(Quantity, MaxSNopCount);
1938 Quantity -= Arg;
1939 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1940 }
1941}
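// Worked example for the loop above, assuming ST.getSNopBits() returns 3 (an
// assumption for illustration only): MaxSNopCount is 8, so requesting 10 wait
// states emits "s_nop 7" (8 states; the immediate encodes count - 1) followed
// by "s_nop 1" (2 states).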
1942
1943void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1944 auto *MF = MBB.getParent();
1945 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1946
1947 assert(Info->isEntryFunction());
1948
1949 if (MBB.succ_empty()) {
1950 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1951 if (HasNoTerminator) {
1952 if (Info->returnsVoid()) {
1953 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1954 } else {
1955 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1956 }
1957 }
1958 }
1959}
1960
1961MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
1962 MachineBasicBlock &MBB,
1963 MachineInstr &MI,
1964 const DebugLoc &DL) const {
1965 MachineFunction *MF = MBB.getParent();
1966 constexpr unsigned DoorbellIDMask = 0x3ff;
1967 constexpr unsigned ECQueueWaveAbort = 0x400;
1968
1969 MachineBasicBlock *TrapBB = &MBB;
1970 MachineBasicBlock *ContBB = &MBB;
1971 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1972
1973 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1974 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1975 TrapBB = MF->CreateMachineBasicBlock();
1976 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1977 MF->push_back(TrapBB);
1978 MBB.addSuccessor(TrapBB);
1979 }
1980
1981 // Start with an `s_trap 2`. If we're in PRIV=1 and we need the workaround,
1982 // this will be a nop.
1983 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
1984 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
1985 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1986 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
1987 DoorbellReg)
1988 .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
1989 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
1990 .addUse(AMDGPU::M0);
1991 Register DoorbellRegMasked =
1992 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1993 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
1994 .addUse(DoorbellReg)
1995 .addImm(DoorbellIDMask);
1996 Register SetWaveAbortBit =
1997 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1998 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
1999 .addUse(DoorbellRegMasked)
2000 .addImm(ECQueueWaveAbort);
2001 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2002 .addUse(SetWaveAbortBit);
2003 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2004 .addImm(AMDGPU::SendMsg::ID_INTERRUPT);
2005 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2006 .addUse(AMDGPU::TTMP2);
2007 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2008 TrapBB->addSuccessor(HaltLoopBB);
2009
2010 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2011 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2012 .addMBB(HaltLoopBB);
2013 MF->push_back(HaltLoopBB);
2014 HaltLoopBB->addSuccessor(HaltLoopBB);
2015
2016 return ContBB;
2017}
2018
2019unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2020 switch (MI.getOpcode()) {
2021 default:
2022 if (MI.isMetaInstruction())
2023 return 0;
2024 return 1; // FIXME: Do wait states equal cycles?
2025
2026 case AMDGPU::S_NOP:
2027 return MI.getOperand(0).getImm() + 1;
2028 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2029 // hazard, even if one exists, won't really be visible. Should we handle it?
2030 }
2031}
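// For example, "S_NOP 3" reports 4 wait states (immediate + 1), a meta
// instruction reports 0, and any other instruction is currently counted as a
// single wait state.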
2032
2033bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2034 MachineBasicBlock &MBB = *MI.getParent();
2035 DebugLoc DL = MBB.findDebugLoc(MI);
2036 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
2037 switch (MI.getOpcode()) {
2038 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2039 case AMDGPU::S_MOV_B64_term:
2040 // This is only a terminator to get the correct spill code placement during
2041 // register allocation.
2042 MI.setDesc(get(AMDGPU::S_MOV_B64));
2043 break;
2044
2045 case AMDGPU::S_MOV_B32_term:
2046 // This is only a terminator to get the correct spill code placement during
2047 // register allocation.
2048 MI.setDesc(get(AMDGPU::S_MOV_B32));
2049 break;
2050
2051 case AMDGPU::S_XOR_B64_term:
2052 // This is only a terminator to get the correct spill code placement during
2053 // register allocation.
2054 MI.setDesc(get(AMDGPU::S_XOR_B64));
2055 break;
2056
2057 case AMDGPU::S_XOR_B32_term:
2058 // This is only a terminator to get the correct spill code placement during
2059 // register allocation.
2060 MI.setDesc(get(AMDGPU::S_XOR_B32));
2061 break;
2062 case AMDGPU::S_OR_B64_term:
2063 // This is only a terminator to get the correct spill code placement during
2064 // register allocation.
2065 MI.setDesc(get(AMDGPU::S_OR_B64));
2066 break;
2067 case AMDGPU::S_OR_B32_term:
2068 // This is only a terminator to get the correct spill code placement during
2069 // register allocation.
2070 MI.setDesc(get(AMDGPU::S_OR_B32));
2071 break;
2072
2073 case AMDGPU::S_ANDN2_B64_term:
2074 // This is only a terminator to get the correct spill code placement during
2075 // register allocation.
2076 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2077 break;
2078
2079 case AMDGPU::S_ANDN2_B32_term:
2080 // This is only a terminator to get the correct spill code placement during
2081 // register allocation.
2082 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2083 break;
2084
2085 case AMDGPU::S_AND_B64_term:
2086 // This is only a terminator to get the correct spill code placement during
2087 // register allocation.
2088 MI.setDesc(get(AMDGPU::S_AND_B64));
2089 break;
2090
2091 case AMDGPU::S_AND_B32_term:
2092 // This is only a terminator to get the correct spill code placement during
2093 // register allocation.
2094 MI.setDesc(get(AMDGPU::S_AND_B32));
2095 break;
2096
2097 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2098 // This is only a terminator to get the correct spill code placement during
2099 // register allocation.
2100 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2101 break;
2102
2103 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2104 // This is only a terminator to get the correct spill code placement during
2105 // register allocation.
2106 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2107 break;
2108
2109 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2110 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2111 break;
2112
2113 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2114 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2115 break;
2116 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2117 Register Dst = MI.getOperand(0).getReg();
2118 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2119 MI.setDesc(
2120 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2121 break;
2122 }
2123 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2124 Register Dst = MI.getOperand(0).getReg();
2125 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2126 int64_t Imm = MI.getOperand(1).getImm();
2127
2128 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2129 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2130 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2131 .addImm(SignExtend64<32>(Imm))
2132 .addReg(Dst, RegState::Implicit | RegState::Define);
2133 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2134 .addImm(SignExtend64<32>(Imm >> 32))
2135 .addReg(Dst, RegState::Implicit | RegState::Define);
2136 MI.eraseFromParent();
2137 break;
2138 }
2139
2140 [[fallthrough]];
2141 }
2142 case AMDGPU::V_MOV_B64_PSEUDO: {
2143 Register Dst = MI.getOperand(0).getReg();
2144 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2145 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2146
2147 const MachineOperand &SrcOp = MI.getOperand(1);
2148 // FIXME: Will this work for 64-bit floating point immediates?
2149 assert(!SrcOp.isFPImm());
2150 if (ST.hasMovB64()) {
2151 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2152 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2153 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2154 break;
2155 }
2156 if (SrcOp.isImm()) {
2157 APInt Imm(64, SrcOp.getImm());
2158 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2159 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2160 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2161 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2162 .addImm(SISrcMods::OP_SEL_1)
2163 .addImm(Lo.getSExtValue())
2164 .addImm(SISrcMods::OP_SEL_1)
2165 .addImm(Lo.getSExtValue())
2166 .addImm(0) // op_sel_lo
2167 .addImm(0) // op_sel_hi
2168 .addImm(0) // neg_lo
2169 .addImm(0) // neg_hi
2170 .addImm(0); // clamp
2171 } else {
2172 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2173 .addImm(Lo.getSExtValue())
2174 .addReg(Dst, RegState::Implicit | RegState::Define);
2175 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2176 .addImm(Hi.getSExtValue())
2177 .addReg(Dst, RegState::Implicit | RegState::Define);
2178 }
2179 } else {
2180 assert(SrcOp.isReg());
2181 if (ST.hasPkMovB32() &&
2182 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2183 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2184 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2185 .addReg(SrcOp.getReg())
2186 .addImm(SISrcMods::OP_SEL_1) // src1_mod
2187 .addReg(SrcOp.getReg())
2188 .addImm(0) // op_sel_lo
2189 .addImm(0) // op_sel_hi
2190 .addImm(0) // neg_lo
2191 .addImm(0) // neg_hi
2192 .addImm(0); // clamp
2193 } else {
2194 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2195 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2196 .addReg(Dst, RegState::Implicit | RegState::Define);
2197 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2198 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2199 .addReg(Dst, RegState::Implicit | RegState::Define);
2200 }
2201 }
2202 MI.eraseFromParent();
2203 break;
2204 }
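  // Illustrative expansion when neither v_mov_b64 nor v_pk_mov_b32 applies
  // (a sketch, not emitted verbatim by the code above):
  //   V_MOV_B64_PSEUDO v[0:1], 0x00000001000000ff
  //     -->
  //   v_mov_b32_e32 v0, 0xff  ; low half
  //   v_mov_b32_e32 v1, 1     ; high half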
2205 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2206 expandMovDPP64(MI);
2207 break;
2208 }
2209 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2210 const MachineOperand &SrcOp = MI.getOperand(1);
2211 assert(!SrcOp.isFPImm());
2212
2213 if (ST.has64BitLiterals()) {
2214 MI.setDesc(get(AMDGPU::S_MOV_B64));
2215 break;
2216 }
2217
2218 APInt Imm(64, SrcOp.getImm());
2219 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2220 MI.setDesc(get(AMDGPU::S_MOV_B64));
2221 break;
2222 }
2223
2224 Register Dst = MI.getOperand(0).getReg();
2225 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2226 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2227
2228 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2229 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2230 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2231 .addImm(Lo.getSExtValue())
2232 .addReg(Dst, RegState::Implicit | RegState::Define);
2233 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2234 .addImm(Hi.getSExtValue())
2235 .addReg(Dst, RegState::Implicit | RegState::Define);
2236 MI.eraseFromParent();
2237 break;
2238 }
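  // Worked example for the split above (illustrative): Imm = 0x100000000 is
  // neither a 32-bit value nor an inline constant, so with destination s[0:1]
  // it becomes:
  //   s_mov_b32 s0, 0   ; low half
  //   s_mov_b32 s1, 1   ; high half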
2239 case AMDGPU::V_SET_INACTIVE_B32: {
2240 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2241 Register DstReg = MI.getOperand(0).getReg();
2242 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2243 .add(MI.getOperand(3))
2244 .add(MI.getOperand(4))
2245 .add(MI.getOperand(1))
2246 .add(MI.getOperand(2))
2247 .add(MI.getOperand(5));
2248 MI.eraseFromParent();
2249 break;
2250 }
2251 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2252 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2253 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2254 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2255 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2256 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2257 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2258 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2259 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2260 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2261 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2262 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2263 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2264 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2265 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2266 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2267 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2268 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2269 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2270 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2271 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2272 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2273 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2274 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2275 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2276 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2277 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2278 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2279 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2280 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2281
2282 unsigned Opc;
2283 if (RI.hasVGPRs(EltRC)) {
2284 Opc = AMDGPU::V_MOVRELD_B32_e32;
2285 } else {
2286 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2287 : AMDGPU::S_MOVRELD_B32;
2288 }
2289
2290 const MCInstrDesc &OpDesc = get(Opc);
2291 Register VecReg = MI.getOperand(0).getReg();
2292 bool IsUndef = MI.getOperand(1).isUndef();
2293 unsigned SubReg = MI.getOperand(3).getImm();
2294 assert(VecReg == MI.getOperand(1).getReg());
2295
2296 MachineInstrBuilder MIB =
2297 BuildMI(MBB, MI, DL, OpDesc)
2298 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2299 .add(MI.getOperand(2))
2300 .addReg(VecReg, RegState::ImplicitDefine)
2301 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2302
2303 const int ImpDefIdx =
2304 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2305 const int ImpUseIdx = ImpDefIdx + 1;
2306 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2307 MI.eraseFromParent();
2308 break;
2309 }
2310 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2311 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2312 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2313 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2314 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2315 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2316 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2317 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2318 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2319 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2320 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2321 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2322 assert(ST.useVGPRIndexMode());
2323 Register VecReg = MI.getOperand(0).getReg();
2324 bool IsUndef = MI.getOperand(1).isUndef();
2325 MachineOperand &Idx = MI.getOperand(3);
2326 Register SubReg = MI.getOperand(4).getImm();
2327
2328 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2329 .add(Idx)
2330 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2331 SetOn->getOperand(3).setIsUndef();
2332
2333 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2334 MachineInstrBuilder MIB =
2335 BuildMI(MBB, MI, DL, OpDesc)
2336 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2337 .add(MI.getOperand(2))
2338 .addReg(VecReg, RegState::ImplicitDefine)
2339 .addReg(VecReg,
2340 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2341
2342 const int ImpDefIdx =
2343 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2344 const int ImpUseIdx = ImpDefIdx + 1;
2345 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2346
2347 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2348
2349 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2350
2351 MI.eraseFromParent();
2352 break;
2353 }
2354 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2355 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2356 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2357 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2358 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2359 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2360 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2361 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2362 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2363 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2364 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2365 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2366 assert(ST.useVGPRIndexMode());
2367 Register Dst = MI.getOperand(0).getReg();
2368 Register VecReg = MI.getOperand(1).getReg();
2369 bool IsUndef = MI.getOperand(1).isUndef();
2370 Register Idx = MI.getOperand(2).getReg();
2371 Register SubReg = MI.getOperand(3).getImm();
2372
2373 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2374 .addReg(Idx)
2375 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2376 SetOn->getOperand(3).setIsUndef();
2377
2378 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2379 .addDef(Dst)
2380 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2381 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2382
2383 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2384
2385 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2386
2387 MI.eraseFromParent();
2388 break;
2389 }
2390 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2391 MachineFunction &MF = *MBB.getParent();
2392 Register Reg = MI.getOperand(0).getReg();
2393 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2394 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2395 MachineOperand OpLo = MI.getOperand(1);
2396 MachineOperand OpHi = MI.getOperand(2);
2397
2398 // Create a bundle so these instructions won't be re-ordered by the
2399 // post-RA scheduler.
2400 MIBundleBuilder Bundler(MBB, MI);
2401 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2402
2403 // What we want here is an offset from the value returned by s_getpc (which
2404 // is the address of the s_add_u32 instruction) to the global variable, but
2405 // since the encoding of $symbol starts 4 bytes after the start of the
2406 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2407 // small. This requires us to add 4 to the global variable offset in order
2408 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2409 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2410 // instruction.
2411
2412 int64_t Adjust = 0;
2413 if (ST.hasGetPCZeroExtension()) {
2414 // Fix up hardware that does not sign-extend the 48-bit PC value by
2415 // inserting: s_sext_i32_i16 reghi, reghi
2416 Bundler.append(
2417 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2418 Adjust += 4;
2419 }
2420
2421 if (OpLo.isGlobal())
2422 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2423 Bundler.append(
2424 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2425
2426 if (OpHi.isGlobal())
2427 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2428 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2429 .addReg(RegHi)
2430 .add(OpHi));
2431
2432 finalizeBundle(MBB, Bundler.begin());
2433
2434 MI.eraseFromParent();
2435 break;
2436 }
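  // Schematic result of the bundle built above (illustrative; shown without
  // the hasGetPCZeroExtension() fix-up):
  //   s_getpc_b64 s[0:1]
  //   s_add_u32  s0, s0, sym@rel32@lo+4   ; +4: $symbol is 4 bytes past s_add_u32
  //   s_addc_u32 s1, s1, sym@rel32@hi+12  ; +12: and 12 bytes past it here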
2437 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2438 MachineFunction &MF = *MBB.getParent();
2439 Register Reg = MI.getOperand(0).getReg();
2440 MachineOperand Op = MI.getOperand(1);
2441
2442 // Create a bundle so these instructions won't be re-ordered by the
2443 // post-RA scheduler.
2444 MIBundleBuilder Bundler(MBB, MI);
2445 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2446 if (Op.isGlobal())
2447 Op.setOffset(Op.getOffset() + 4);
2448 Bundler.append(
2449 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2450
2451 finalizeBundle(MBB, Bundler.begin());
2452
2453 MI.eraseFromParent();
2454 break;
2455 }
2456 case AMDGPU::ENTER_STRICT_WWM: {
2457 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2458 // Whole Wave Mode is entered.
2459 MI.setDesc(get(LMC.OrSaveExecOpc));
2460 break;
2461 }
2462 case AMDGPU::ENTER_STRICT_WQM: {
2463 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2464 // STRICT_WQM is entered.
2465 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2466 .addReg(LMC.ExecReg);
2467 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2468
2469 MI.eraseFromParent();
2470 break;
2471 }
2472 case AMDGPU::EXIT_STRICT_WWM:
2473 case AMDGPU::EXIT_STRICT_WQM: {
2474 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2475 // WWM/STRICT_WQM is exited.
2476 MI.setDesc(get(LMC.MovOpc));
2477 break;
2478 }
2479 case AMDGPU::SI_RETURN: {
2480 const MachineFunction *MF = MBB.getParent();
2481 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2482 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2483 // Hiding the return address use with SI_RETURN may lead to extra kills in
2484 // the function and missing live-ins. We are fine in practice because callee
2485 // saved register handling ensures the register value is restored before
2486 // RET, but we need the undef flag here to appease the MachineVerifier
2487 // liveness checks.
2488 MachineInstrBuilder MIB =
2489 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2490 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2491
2492 MIB.copyImplicitOps(MI);
2493 MI.eraseFromParent();
2494 break;
2495 }
2496
2497 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2498 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2499 MI.setDesc(get(AMDGPU::S_MUL_U64));
2500 break;
2501
2502 case AMDGPU::S_GETPC_B64_pseudo:
2503 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2504 if (ST.hasGetPCZeroExtension()) {
2505 Register Dst = MI.getOperand(0).getReg();
2506 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2507 // Fix up hardware that does not sign-extend the 48-bit PC value by
2508 // inserting: s_sext_i32_i16 dsthi, dsthi
2509 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2510 DstHi)
2511 .addReg(DstHi);
2512 }
2513 break;
2514
2515 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2516 assert(ST.hasBF16PackedInsts());
2517 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2518 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2519 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2520 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2521 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2522 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2523 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2524 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2525 break;
2526 }
2527
2528 return true;
2529}
2530
2531void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2532 MachineBasicBlock::iterator I, Register DestReg,
2533 unsigned SubIdx, const MachineInstr &Orig,
2534 const TargetRegisterInfo &RI) const {
2535
2536 // Try shrinking the instruction to remat only the part needed for the
2537 // current context.
2538 // TODO: Handle more cases.
2539 unsigned Opcode = Orig.getOpcode();
2540 switch (Opcode) {
2541 case AMDGPU::S_LOAD_DWORDX16_IMM:
2542 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2543 if (SubIdx != 0)
2544 break;
2545
2546 if (I == MBB.end())
2547 break;
2548
2549 if (I->isBundled())
2550 break;
2551
2552 // Look for a single use of the register that is also a subreg.
2553 Register RegToFind = Orig.getOperand(0).getReg();
2554 MachineOperand *UseMO = nullptr;
2555 for (auto &CandMO : I->operands()) {
2556 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2557 continue;
2558 if (UseMO) {
2559 UseMO = nullptr;
2560 break;
2561 }
2562 UseMO = &CandMO;
2563 }
2564 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2565 break;
2566
2567 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2568 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2569
2570 MachineFunction *MF = MBB.getParent();
2571 MachineRegisterInfo &MRI = MF->getRegInfo();
2572 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2573
2574 unsigned NewOpcode = -1;
2575 if (SubregSize == 256)
2576 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2577 else if (SubregSize == 128)
2578 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2579 else
2580 break;
2581
2582 const MCInstrDesc &TID = get(NewOpcode);
2583 const TargetRegisterClass *NewRC =
2584 RI.getAllocatableClass(getRegClass(TID, 0, &RI));
2585 MRI.setRegClass(DestReg, NewRC);
2586
2587 UseMO->setReg(DestReg);
2588 UseMO->setSubReg(AMDGPU::NoSubRegister);
2589
2590 // Use a smaller load with the desired size, possibly with updated offset.
2591 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2592 MI->setDesc(TID);
2593 MI->getOperand(0).setReg(DestReg);
2594 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2595 if (Offset) {
2596 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2597 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2598 OffsetMO->setImm(FinalOffset);
2599 }
2600 SmallVector<MachineMemOperand *> NewMMOs;
2601 for (const MachineMemOperand *MemOp : Orig.memoperands())
2602 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2603 SubregSize / 8));
2604 MI->setMemRefs(*MF, NewMMOs);
2605
2606 MBB.insert(I, MI);
2607 return;
2608 }
2609
2610 default:
2611 break;
2612 }
2613
2614 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2615}
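// Concrete instance of the shrinking above (illustrative): if the only use of
// an S_LOAD_DWORDX8_IMM result reads its sub4_sub5_sub6_sub7 half, then
// SubregSize is 128 and Offset is 128 bits, so the clone becomes an
// S_LOAD_DWORDX4_IMM with the immediate offset increased by 128/8 = 16 bytes
// and the use rewritten to read the narrower register with no subregister.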
2616
2617std::pair<MachineInstr*, MachineInstr*>
2618SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2619 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2620
2621 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2622 AMDGPU::isLegalDPALU_DPPControl(
2623 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2624 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2625 return std::pair(&MI, nullptr);
2626 }
2627
2628 MachineBasicBlock &MBB = *MI.getParent();
2629 DebugLoc DL = MBB.findDebugLoc(MI);
2630 MachineFunction *MF = MBB.getParent();
2631 MachineRegisterInfo &MRI = MF->getRegInfo();
2632 Register Dst = MI.getOperand(0).getReg();
2633 unsigned Part = 0;
2634 MachineInstr *Split[2];
2635
2636 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2637 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2638 if (Dst.isPhysical()) {
2639 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2640 } else {
2641 assert(MRI.isSSA());
2642 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2643 MovDPP.addDef(Tmp);
2644 }
2645
2646 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2647 const MachineOperand &SrcOp = MI.getOperand(I);
2648 assert(!SrcOp.isFPImm());
2649 if (SrcOp.isImm()) {
2650 APInt Imm(64, SrcOp.getImm());
2651 Imm.ashrInPlace(Part * 32);
2652 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2653 } else {
2654 assert(SrcOp.isReg());
2655 Register Src = SrcOp.getReg();
2656 if (Src.isPhysical())
2657 MovDPP.addReg(RI.getSubReg(Src, Sub));
2658 else
2659 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2660 }
2661 }
2662
2663 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2664 MovDPP.addImm(MO.getImm());
2665
2666 Split[Part] = MovDPP;
2667 ++Part;
2668 }
2669
2670 if (Dst.isVirtual())
2671 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2672 .addReg(Split[0]->getOperand(0).getReg())
2673 .addImm(AMDGPU::sub0)
2674 .addReg(Split[1]->getOperand(0).getReg())
2675 .addImm(AMDGPU::sub1);
2676
2677 MI.eraseFromParent();
2678 return std::pair(Split[0], Split[1]);
2679}
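// Sketch of the split performed above when the single 64-bit DPP mov is not
// legal (illustrative; virtual registers shown as physical ones):
//   v_mov_b32_dpp v2, v0 <dpp controls>   ; sub0 half
//   v_mov_b32_dpp v3, v1 <dpp controls>   ; sub1 half
//   REG_SEQUENCE v[2:3], v2, sub0, v3, sub1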
2680
2681std::optional<DestSourcePair>
2682SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
2683 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2684 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2685
2686 return std::nullopt;
2687}
2688
2689bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0,
2690 AMDGPU::OpName Src0OpName,
2691 MachineOperand &Src1,
2692 AMDGPU::OpName Src1OpName) const {
2693 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2694 if (!Src0Mods)
2695 return false;
2696
2697 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2698 assert(Src1Mods &&
2699 "All commutable instructions have both src0 and src1 modifiers");
2700
2701 int Src0ModsVal = Src0Mods->getImm();
2702 int Src1ModsVal = Src1Mods->getImm();
2703
2704 Src1Mods->setImm(Src0ModsVal);
2705 Src0Mods->setImm(Src1ModsVal);
2706 return true;
2707}
2708
2709static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2710 MachineOperand &RegOp,
2711 MachineOperand &NonRegOp) {
2712 Register Reg = RegOp.getReg();
2713 unsigned SubReg = RegOp.getSubReg();
2714 bool IsKill = RegOp.isKill();
2715 bool IsDead = RegOp.isDead();
2716 bool IsUndef = RegOp.isUndef();
2717 bool IsDebug = RegOp.isDebug();
2718
2719 if (NonRegOp.isImm())
2720 RegOp.ChangeToImmediate(NonRegOp.getImm());
2721 else if (NonRegOp.isFI())
2722 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2723 else if (NonRegOp.isGlobal()) {
2724 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2725 NonRegOp.getTargetFlags());
2726 } else
2727 return nullptr;
2728
2729 // Make sure we don't reinterpret a subreg index in the target flags.
2730 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2731
2732 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2733 NonRegOp.setSubReg(SubReg);
2734
2735 return &MI;
2736}
2737
2738static MachineInstr *swapImmOperands(MachineInstr &MI,
2739 MachineOperand &NonRegOp1,
2740 MachineOperand &NonRegOp2) {
2741 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2742 int64_t NonRegVal = NonRegOp1.getImm();
2743
2744 NonRegOp1.setImm(NonRegOp2.getImm());
2745 NonRegOp2.setImm(NonRegVal);
2746 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2747 NonRegOp2.setTargetFlags(TargetFlags);
2748 return &MI;
2749}
2750
2751bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2752 unsigned OpIdx1) const {
2753 const MCInstrDesc &InstDesc = MI.getDesc();
2754 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2755 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2756
2757 unsigned Opc = MI.getOpcode();
2758 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2759
2760 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2761 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2762
2763 // Swapping doesn't breach the constant bus or literal limits.
2764 // It may move a literal to a position other than src0, which is not allowed
2765 // pre-gfx10. However, most test cases expect literals in src0 for VOP.
2766 // FIXME: After gfx9, a literal can be placed somewhere other than src0.
2767 if (isVALU(MI)) {
2768 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2769 !isInlineConstant(MO0, OpInfo1))
2770 return false;
2771 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2772 !isInlineConstant(MO1, OpInfo0))
2773 return false;
2774 }
2775
2776 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2777 if (OpInfo1.RegClass == -1)
2778 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2779 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2780 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2781 }
2782 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2783 if (OpInfo0.RegClass == -1)
2784 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2785 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2786 isLegalRegOperand(MI, OpIdx0, MO1);
2787 }
2788
2789 // No need to check 64-bit literals since swapping does not bring new
2790 // 64-bit literals into current instruction to fold to 32-bit
2791
2792 return isImmOperandLegal(MI, OpIdx1, MO0);
2793}
2794
2795MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2796 unsigned Src0Idx,
2797 unsigned Src1Idx) const {
2798 assert(!NewMI && "this should never be used");
2799
2800 unsigned Opc = MI.getOpcode();
2801 int CommutedOpcode = commuteOpcode(Opc);
2802 if (CommutedOpcode == -1)
2803 return nullptr;
2804
2805 if (Src0Idx > Src1Idx)
2806 std::swap(Src0Idx, Src1Idx);
2807
2808 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2809 static_cast<int>(Src0Idx) &&
2810 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2811 static_cast<int>(Src1Idx) &&
2812 "inconsistency with findCommutedOpIndices");
2813
2814 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2815 return nullptr;
2816
2817 MachineInstr *CommutedMI = nullptr;
2818 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2819 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2820 if (Src0.isReg() && Src1.isReg()) {
2821 // Be sure to copy the source modifiers to the right place.
2822 CommutedMI =
2823 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2824 } else if (Src0.isReg() && !Src1.isReg()) {
2825 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2826 } else if (!Src0.isReg() && Src1.isReg()) {
2827 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2828 } else if (Src0.isImm() && Src1.isImm()) {
2829 CommutedMI = swapImmOperands(MI, Src0, Src1);
2830 } else {
2831 // FIXME: Found two non registers to commute. This does happen.
2832 return nullptr;
2833 }
2834
2835 if (CommutedMI) {
2836 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2837 Src1, AMDGPU::OpName::src1_modifiers);
2838
2839 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2840 AMDGPU::OpName::src1_sel);
2841
2842 CommutedMI->setDesc(get(CommutedOpcode));
2843 }
2844
2845 return CommutedMI;
2846}
2847
2848// This needs to be implemented because the source modifiers may be inserted
2849// between the true commutable operands, and the base
2850// TargetInstrInfo::commuteInstruction uses it.
2851bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2852 unsigned &SrcOpIdx0,
2853 unsigned &SrcOpIdx1) const {
2854 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2855}
2856
2857bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2858 unsigned &SrcOpIdx0,
2859 unsigned &SrcOpIdx1) const {
2860 if (!Desc.isCommutable())
2861 return false;
2862
2863 unsigned Opc = Desc.getOpcode();
2864 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2865 if (Src0Idx == -1)
2866 return false;
2867
2868 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2869 if (Src1Idx == -1)
2870 return false;
2871
2872 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2873}
2874
2875bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2876 int64_t BrOffset) const {
2877 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2878 // because its dest block is unanalyzable.
2879 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2880
2881 // Convert to dwords.
2882 BrOffset /= 4;
2883
2884 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2885 // from the next instruction.
2886 BrOffset -= 1;
2887
2888 return isIntN(BranchOffsetBits, BrOffset);
2889}
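// Worked example: with the default of 16 branch-offset bits, the SIMM16 field
// holds a signed dword count measured from the next instruction. A forward
// byte offset of 131068 becomes 131068/4 - 1 = 32766 dwords and fits; 131076
// bytes becomes 32768 and does not.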
2890
2891MachineBasicBlock *
2892SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
2893 return MI.getOperand(0).getMBB();
2894}
2895
2896bool SIInstrInfo::hasDivergentBranch(const MachineBasicBlock *MBB) const {
2897 for (const MachineInstr &MI : MBB->terminators()) {
2898 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2899 MI.getOpcode() == AMDGPU::SI_LOOP)
2900 return true;
2901 }
2902 return false;
2903}
2904
2905void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2906 MachineBasicBlock &DestBB,
2907 MachineBasicBlock &RestoreBB,
2908 const DebugLoc &DL, int64_t BrOffset,
2909 RegScavenger *RS) const {
2910 assert(MBB.empty() &&
2911 "new block should be inserted for expanding unconditional branch");
2912 assert(MBB.pred_size() == 1);
2913 assert(RestoreBB.empty() &&
2914 "restore block should be inserted for restoring clobbered registers");
2915
2916 MachineFunction *MF = MBB.getParent();
2917 MachineRegisterInfo &MRI = MF->getRegInfo();
2918 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2919 auto I = MBB.end();
2920 auto &MCCtx = MF->getContext();
2921
2922 if (ST.hasAddPC64Inst()) {
2923 MCSymbol *Offset =
2924 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
2925 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
2926 .addSym(Offset, MO_FAR_BRANCH_OFFSET);
2927 MCSymbol *PostAddPCLabel =
2928 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
2929 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
2930 auto *OffsetExpr = MCBinaryExpr::createSub(
2931 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2932 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
2933 Offset->setVariableValue(OffsetExpr);
2934 return;
2935 }
2936
2937 assert(RS && "RegScavenger required for long branching");
2938
2939 // FIXME: Virtual register workaround for RegScavenger not working with empty
2940 // blocks.
2941 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2942
2943 // Note: as this is used after hazard recognizer we need to apply some hazard
2944 // workarounds directly.
2945 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2946 ST.hasVALUReadSGPRHazard();
2947 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2948 if (FlushSGPRWrites)
2949 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2950 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
2951 };
2952
2953 // We need to compute the offset relative to the instruction immediately after
2954 // s_getpc_b64. Insert pc arithmetic code before the last terminator.
2955 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2956 ApplyHazardWorkarounds();
2957
2958 MCSymbol *PostGetPCLabel =
2959 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2960 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2961
2962 MCSymbol *OffsetLo =
2963 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2964 MCSymbol *OffsetHi =
2965 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2966 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2967 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2968 .addReg(PCReg, 0, AMDGPU::sub0)
2969 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2970 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2971 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2972 .addReg(PCReg, 0, AMDGPU::sub1)
2973 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2974 ApplyHazardWorkarounds();
2975
2976 // Insert the indirect branch after the other terminator.
2977 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2978 .addReg(PCReg);
2979
2980 // If a spill is needed for the pc register pair, we need to insert a spill
2981 // restore block right before the destination block, and insert a short branch
2982 // into the old destination block's fallthrough predecessor.
2983 // e.g.:
2984 //
2985 // s_cbranch_scc0 skip_long_branch:
2986 //
2987 // long_branch_bb:
2988 // spill s[8:9]
2989 // s_getpc_b64 s[8:9]
2990 // s_add_u32 s8, s8, restore_bb
2991 // s_addc_u32 s9, s9, 0
2992 // s_setpc_b64 s[8:9]
2993 //
2994 // skip_long_branch:
2995 // foo;
2996 //
2997 // .....
2998 //
2999 // dest_bb_fallthrough_predecessor:
3000 // bar;
3001 // s_branch dest_bb
3002 //
3003 // restore_bb:
3004 // restore s[8:9]
3005 // fallthrough dest_bb
3006 ///
3007 // dest_bb:
3008 // buzz;
3009
3010 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3011 Register Scav;
3012
3013 // If we've previously reserved a register for long branches,
3014 // avoid running the scavenger and just use that register.
3015 if (LongBranchReservedReg) {
3016 RS->enterBasicBlock(MBB);
3017 Scav = LongBranchReservedReg;
3018 } else {
3019 RS->enterBasicBlockEnd(MBB);
3020 Scav = RS->scavengeRegisterBackwards(
3021 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3022 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3023 }
3024 if (Scav) {
3025 RS->setRegUsed(Scav);
3026 MRI.replaceRegWith(PCReg, Scav);
3027 MRI.clearVirtRegs();
3028 } else {
3029 // Since spilling an SGPR requires a VGPR, we reuse the temporary VGPR's
3030 // slot for the SGPR spill.
3031 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3032 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3033 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3034 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3035 MRI.clearVirtRegs();
3036 }
3037
3038 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3039 // Now the distance can be defined.
3040 auto *Offset = MCBinaryExpr::createSub(
3041 MCSymbolRefExpr::create(DestLabel, MCCtx),
3042 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3043 // Add offset assignments.
3044 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3045 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3046 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3047 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3048}
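// The two symbols defined above evaluate to (illustrative):
//   offset_lo = (dest - post_getpc) & 0xffffffff
//   offset_hi = (dest - post_getpc) >> 32   ; arithmetic shift
// so the s_add_u32/s_addc_u32 pair adds the full 64-bit pc-relative distance
// (dest is the restore block instead when an emergency spill was required).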
3049
3050unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3051 switch (Cond) {
3052 case SIInstrInfo::SCC_TRUE:
3053 return AMDGPU::S_CBRANCH_SCC1;
3054 case SIInstrInfo::SCC_FALSE:
3055 return AMDGPU::S_CBRANCH_SCC0;
3056 case SIInstrInfo::VCCNZ:
3057 return AMDGPU::S_CBRANCH_VCCNZ;
3058 case SIInstrInfo::VCCZ:
3059 return AMDGPU::S_CBRANCH_VCCZ;
3060 case SIInstrInfo::EXECNZ:
3061 return AMDGPU::S_CBRANCH_EXECNZ;
3062 case SIInstrInfo::EXECZ:
3063 return AMDGPU::S_CBRANCH_EXECZ;
3064 default:
3065 llvm_unreachable("invalid branch predicate");
3066 }
3067}
3068
3069SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3070 switch (Opcode) {
3071 case AMDGPU::S_CBRANCH_SCC0:
3072 return SCC_FALSE;
3073 case AMDGPU::S_CBRANCH_SCC1:
3074 return SCC_TRUE;
3075 case AMDGPU::S_CBRANCH_VCCNZ:
3076 return VCCNZ;
3077 case AMDGPU::S_CBRANCH_VCCZ:
3078 return VCCZ;
3079 case AMDGPU::S_CBRANCH_EXECNZ:
3080 return EXECNZ;
3081 case AMDGPU::S_CBRANCH_EXECZ:
3082 return EXECZ;
3083 default:
3084 return INVALID_BR;
3085 }
3086}
3087
3088bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3089 MachineBasicBlock::iterator I,
3090 MachineBasicBlock *&TBB,
3091 MachineBasicBlock *&FBB,
3092 SmallVectorImpl<MachineOperand> &Cond,
3093 bool AllowModify) const {
3094 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3095 // Unconditional Branch
3096 TBB = I->getOperand(0).getMBB();
3097 return false;
3098 }
3099
3100 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3101 if (Pred == INVALID_BR)
3102 return true;
3103
3104 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3105 Cond.push_back(MachineOperand::CreateImm(Pred));
3106 Cond.push_back(I->getOperand(1)); // Save the branch register.
3107
3108 ++I;
3109
3110 if (I == MBB.end()) {
3111 // Conditional branch followed by fall-through.
3112 TBB = CondBB;
3113 return false;
3114 }
3115
3116 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3117 TBB = CondBB;
3118 FBB = I->getOperand(0).getMBB();
3119 return false;
3120 }
3121
3122 return true;
3123}
3124
3125bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3126 MachineBasicBlock *&FBB,
3127 SmallVectorImpl<MachineOperand> &Cond,
3128 bool AllowModify) const {
3129 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3130 auto E = MBB.end();
3131 if (I == E)
3132 return false;
3133
3134 // Skip over the instructions that are artificially terminators for special
3135 // exec management.
3136 while (I != E && !I->isBranch() && !I->isReturn()) {
3137 switch (I->getOpcode()) {
3138 case AMDGPU::S_MOV_B64_term:
3139 case AMDGPU::S_XOR_B64_term:
3140 case AMDGPU::S_OR_B64_term:
3141 case AMDGPU::S_ANDN2_B64_term:
3142 case AMDGPU::S_AND_B64_term:
3143 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3144 case AMDGPU::S_MOV_B32_term:
3145 case AMDGPU::S_XOR_B32_term:
3146 case AMDGPU::S_OR_B32_term:
3147 case AMDGPU::S_ANDN2_B32_term:
3148 case AMDGPU::S_AND_B32_term:
3149 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3150 break;
3151 case AMDGPU::SI_IF:
3152 case AMDGPU::SI_ELSE:
3153 case AMDGPU::SI_KILL_I1_TERMINATOR:
3154 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3155 // FIXME: It's messy that these need to be considered here at all.
3156 return true;
3157 default:
3158 llvm_unreachable("unexpected non-branch terminator inst");
3159 }
3160
3161 ++I;
3162 }
3163
3164 if (I == E)
3165 return false;
3166
3167 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3168}
3169
3170unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3171 int *BytesRemoved) const {
3172 unsigned Count = 0;
3173 unsigned RemovedSize = 0;
3174 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3175 // Skip over artificial terminators when removing instructions.
3176 if (MI.isBranch() || MI.isReturn()) {
3177 RemovedSize += getInstSizeInBytes(MI);
3178 MI.eraseFromParent();
3179 ++Count;
3180 }
3181 }
3182
3183 if (BytesRemoved)
3184 *BytesRemoved = RemovedSize;
3185
3186 return Count;
3187}
3188
3189// Copy the flags onto the implicit condition register operand.
3190static void preserveCondRegFlags(MachineOperand &CondReg,
3191 const MachineOperand &OrigCond) {
3192 CondReg.setIsUndef(OrigCond.isUndef());
3193 CondReg.setIsKill(OrigCond.isKill());
3194}
3195
3196unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
3197 MachineBasicBlock *TBB,
3198 MachineBasicBlock *FBB,
3199 ArrayRef<MachineOperand> Cond,
3200 const DebugLoc &DL,
3201 int *BytesAdded) const {
3202 if (!FBB && Cond.empty()) {
3203 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3204 .addMBB(TBB);
3205 if (BytesAdded)
3206 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3207 return 1;
3208 }
3209
3210 assert(TBB && Cond[0].isImm());
3211
3212 unsigned Opcode
3213 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3214
3215 if (!FBB) {
3216 MachineInstr *CondBr =
3217 BuildMI(&MBB, DL, get(Opcode))
3218 .addMBB(TBB);
3219
3220 // Copy the flags onto the implicit condition register operand.
3221 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3222 fixImplicitOperands(*CondBr);
3223
3224 if (BytesAdded)
3225 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3226 return 1;
3227 }
3228
3229 assert(TBB && FBB);
3230
3231 MachineInstr *CondBr =
3232 BuildMI(&MBB, DL, get(Opcode))
3233 .addMBB(TBB);
3234 fixImplicitOperands(*CondBr);
3235 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3236 .addMBB(FBB);
3237
3238 MachineOperand &CondReg = CondBr->getOperand(1);
3239 CondReg.setIsUndef(Cond[1].isUndef());
3240 CondReg.setIsKill(Cond[1].isKill());
3241
3242 if (BytesAdded)
3243 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3244
3245 return 2;
3246}
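// Quick size accounting for the cases above: a lone unconditional or
// conditional branch is 4 bytes (8 with the hasOffset3fBug() workaround), and
// the conditional-plus-unconditional pair is 8 bytes (16 with the workaround),
// matching the *BytesAdded values returned.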
3247
3248bool SIInstrInfo::reverseBranchCondition(
3249 SmallVectorImpl<MachineOperand> &Cond) const {
3250 if (Cond.size() != 2) {
3251 return true;
3252 }
3253
3254 if (Cond[0].isImm()) {
3255 Cond[0].setImm(-Cond[0].getImm());
3256 return false;
3257 }
3258
3259 return true;
3260}
3261
3262bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3263 ArrayRef<MachineOperand> Cond,
3264 Register DstReg, Register TrueReg,
3265 Register FalseReg, int &CondCycles,
3266 int &TrueCycles, int &FalseCycles) const {
3267 switch (Cond[0].getImm()) {
3268 case VCCNZ:
3269 case VCCZ: {
3270 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3271 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3272 if (MRI.getRegClass(FalseReg) != RC)
3273 return false;
3274
3275 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3276 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3277
3278 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3279 return RI.hasVGPRs(RC) && NumInsts <= 6;
3280 }
3281 case SCC_TRUE:
3282 case SCC_FALSE: {
3283 // FIXME: We could insert for VGPRs if we could replace the original compare
3284 // with a vector one.
3285 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3286 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3287 if (MRI.getRegClass(FalseReg) != RC)
3288 return false;
3289
3290 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3291
3292 // Multiples of 8 can do s_cselect_b64
3293 if (NumInsts % 2 == 0)
3294 NumInsts /= 2;
3295
3296 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3297 return RI.isSGPRClass(RC);
3298 }
3299 default:
3300 return false;
3301 }
3302}
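// Worked examples for the limits above (illustrative): a 128-bit VGPR select
// under VCCNZ needs 4 v_cndmask_b32 instructions and is accepted (4 <= 6),
// while a 512-bit one would need 16 and is rejected. For SCC-based SGPR
// selects, a 256-bit value counts as 4 s_cselect_b64 operations.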
3303
3304void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3305 MachineBasicBlock::iterator I, const DebugLoc &DL,
3306 Register DstReg, ArrayRef<MachineOperand> Cond,
3307 Register TrueReg, Register FalseReg) const {
3308 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3309 if (Pred == VCCZ || Pred == SCC_FALSE) {
3310 Pred = static_cast<BranchPredicate>(-Pred);
3311 std::swap(TrueReg, FalseReg);
3312 }
3313
3314 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3315 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3316 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3317
3318 if (DstSize == 32) {
3319 MachineInstr *Select;
3320 if (Pred == SCC_TRUE) {
3321 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3322 .addReg(TrueReg)
3323 .addReg(FalseReg);
3324 } else {
3325 // Instruction's operands are backwards from what is expected.
3326 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3327 .addReg(FalseReg)
3328 .addReg(TrueReg);
3329 }
3330
3331 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3332 return;
3333 }
3334
3335 if (DstSize == 64 && Pred == SCC_TRUE) {
3336 MachineInstr *Select =
3337 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3338 .addReg(TrueReg)
3339 .addReg(FalseReg);
3340
3341 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3342 return;
3343 }
3344
3345 static const int16_t Sub0_15[] = {
3346 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3347 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3348 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3349 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3350 };
3351
3352 static const int16_t Sub0_15_64[] = {
3353 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3354 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3355 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3356 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3357 };
3358
3359 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3360 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3361 const int16_t *SubIndices = Sub0_15;
3362 int NElts = DstSize / 32;
3363
3364 // 64-bit select is only available for SALU.
3365 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3366 if (Pred == SCC_TRUE) {
3367 if (NElts % 2) {
3368 SelOp = AMDGPU::S_CSELECT_B32;
3369 EltRC = &AMDGPU::SGPR_32RegClass;
3370 } else {
3371 SelOp = AMDGPU::S_CSELECT_B64;
3372 EltRC = &AMDGPU::SGPR_64RegClass;
3373 SubIndices = Sub0_15_64;
3374 NElts /= 2;
3375 }
3376 }
3377
3378 MachineInstrBuilder MIB = BuildMI(
3379 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3380
3381 I = MIB->getIterator();
3382
3383 SmallVector<Register, 8> Regs;
3384 for (int Idx = 0; Idx != NElts; ++Idx) {
3385 Register DstElt = MRI.createVirtualRegister(EltRC);
3386 Regs.push_back(DstElt);
3387
3388 unsigned SubIdx = SubIndices[Idx];
3389
3391 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3392 Select =
3393 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3394 .addReg(FalseReg, 0, SubIdx)
3395 .addReg(TrueReg, 0, SubIdx);
3396 } else {
3397 Select =
3398 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3399 .addReg(TrueReg, 0, SubIdx)
3400 .addReg(FalseReg, 0, SubIdx);
3401 }
3402
3403 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3405
3406 MIB.addReg(DstElt)
3407 .addImm(SubIdx);
3408 }
3409}
3410
3412 switch (MI.getOpcode()) {
3413 case AMDGPU::V_MOV_B16_t16_e32:
3414 case AMDGPU::V_MOV_B16_t16_e64:
3415 case AMDGPU::V_MOV_B32_e32:
3416 case AMDGPU::V_MOV_B32_e64:
3417 case AMDGPU::V_MOV_B64_PSEUDO:
3418 case AMDGPU::V_MOV_B64_e32:
3419 case AMDGPU::V_MOV_B64_e64:
3420 case AMDGPU::S_MOV_B32:
3421 case AMDGPU::S_MOV_B64:
3422 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3423 case AMDGPU::COPY:
3424 case AMDGPU::WWM_COPY:
3425 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3426 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3427 case AMDGPU::V_ACCVGPR_MOV_B32:
3428 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3429 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3430 return true;
3431 default:
3432 return false;
3433 }
3434}
3435
3437 switch (MI.getOpcode()) {
3438 case AMDGPU::V_MOV_B16_t16_e32:
3439 case AMDGPU::V_MOV_B16_t16_e64:
3440 return 2;
3441 case AMDGPU::V_MOV_B32_e32:
3442 case AMDGPU::V_MOV_B32_e64:
3443 case AMDGPU::V_MOV_B64_PSEUDO:
3444 case AMDGPU::V_MOV_B64_e32:
3445 case AMDGPU::V_MOV_B64_e64:
3446 case AMDGPU::S_MOV_B32:
3447 case AMDGPU::S_MOV_B64:
3448 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3449 case AMDGPU::COPY:
3450 case AMDGPU::WWM_COPY:
3451 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3452 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3453 case AMDGPU::V_ACCVGPR_MOV_B32:
3454 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3455 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3456 return 1;
3457 default:
3458 llvm_unreachable("MI is not a foldable copy");
3459 }
3460}
3461
3462static constexpr AMDGPU::OpName ModifierOpNames[] = {
3463 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3464 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3465 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3466
3468 unsigned Opc = MI.getOpcode();
3469 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3470 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3471 if (Idx >= 0)
3472 MI.removeOperand(Idx);
3473 }
3474}
3475
3476std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3477 unsigned SubRegIndex) {
3478 switch (SubRegIndex) {
3479 case AMDGPU::NoSubRegister:
3480 return Imm;
3481 case AMDGPU::sub0:
3482 return SignExtend64<32>(Imm);
3483 case AMDGPU::sub1:
3484 return SignExtend64<32>(Imm >> 32);
3485 case AMDGPU::lo16:
3486 return SignExtend64<16>(Imm);
3487 case AMDGPU::hi16:
3488 return SignExtend64<16>(Imm >> 16);
3489 case AMDGPU::sub1_lo16:
3490 return SignExtend64<16>(Imm >> 32);
3491 case AMDGPU::sub1_hi16:
3492 return SignExtend64<16>(Imm >> 48);
3493 default:
3494 return std::nullopt;
3495 }
3496
3497 llvm_unreachable("covered subregister switch");
3498}
3499
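// Map a MAC/MAD/FMA opcode to its "fmaak"/"madak" form, which encodes the
// addend as a trailing 32-bit literal K; getNewFMAMKInst below maps to the
// "fmamk"/"madmk" form, which instead encodes the multiplied operand as the
// literal. Roughly:
//   v_fmaak_f32 vD, v0, v1, K  computes vD = v0 * v1 + K
//   v_fmamk_f32 vD, v0, K, v2  computes vD = v0 * K + v2
// The F16 variants additionally pick a true16 or fake16 encoding.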
3500static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3501 switch (Opc) {
3502 case AMDGPU::V_MAC_F16_e32:
3503 case AMDGPU::V_MAC_F16_e64:
3504 case AMDGPU::V_MAD_F16_e64:
3505 return AMDGPU::V_MADAK_F16;
3506 case AMDGPU::V_MAC_F32_e32:
3507 case AMDGPU::V_MAC_F32_e64:
3508 case AMDGPU::V_MAD_F32_e64:
3509 return AMDGPU::V_MADAK_F32;
3510 case AMDGPU::V_FMAC_F32_e32:
3511 case AMDGPU::V_FMAC_F32_e64:
3512 case AMDGPU::V_FMA_F32_e64:
3513 return AMDGPU::V_FMAAK_F32;
3514 case AMDGPU::V_FMAC_F16_e32:
3515 case AMDGPU::V_FMAC_F16_e64:
3516 case AMDGPU::V_FMAC_F16_t16_e64:
3517 case AMDGPU::V_FMAC_F16_fake16_e64:
3518 case AMDGPU::V_FMA_F16_e64:
3519 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3520 ? AMDGPU::V_FMAAK_F16_t16
3521 : AMDGPU::V_FMAAK_F16_fake16
3522 : AMDGPU::V_FMAAK_F16;
3523 case AMDGPU::V_FMAC_F64_e32:
3524 case AMDGPU::V_FMAC_F64_e64:
3525 case AMDGPU::V_FMA_F64_e64:
3526 return AMDGPU::V_FMAAK_F64;
3527 default:
3528 llvm_unreachable("invalid instruction");
3529 }
3530}
3531
3532static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3533 switch (Opc) {
3534 case AMDGPU::V_MAC_F16_e32:
3535 case AMDGPU::V_MAC_F16_e64:
3536 case AMDGPU::V_MAD_F16_e64:
3537 return AMDGPU::V_MADMK_F16;
3538 case AMDGPU::V_MAC_F32_e32:
3539 case AMDGPU::V_MAC_F32_e64:
3540 case AMDGPU::V_MAD_F32_e64:
3541 return AMDGPU::V_MADMK_F32;
3542 case AMDGPU::V_FMAC_F32_e32:
3543 case AMDGPU::V_FMAC_F32_e64:
3544 case AMDGPU::V_FMA_F32_e64:
3545 return AMDGPU::V_FMAMK_F32;
3546 case AMDGPU::V_FMAC_F16_e32:
3547 case AMDGPU::V_FMAC_F16_e64:
3548 case AMDGPU::V_FMAC_F16_t16_e64:
3549 case AMDGPU::V_FMAC_F16_fake16_e64:
3550 case AMDGPU::V_FMA_F16_e64:
3551 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3552 ? AMDGPU::V_FMAMK_F16_t16
3553 : AMDGPU::V_FMAMK_F16_fake16
3554 : AMDGPU::V_FMAMK_F16;
3555 case AMDGPU::V_FMAC_F64_e32:
3556 case AMDGPU::V_FMAC_F64_e64:
3557 case AMDGPU::V_FMA_F64_e64:
3558 return AMDGPU::V_FMAMK_F64;
3559 default:
3560 llvm_unreachable("invalid instruction");
3561 }
3562}
3563
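// Try to fold the constant materialized by \p DefMI into \p UseMI. A COPY user
// is rewritten into an S_MOV/V_MOV of the (possibly subregister-extracted)
// immediate; MAC/MAD/FMA users are rewritten into the madak/madmk or
// fmaak/fmamk forms that embed the literal directly. Returns true on success
// and may erase \p DefMI once it has no remaining uses.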
3564 bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3565 Register Reg, MachineRegisterInfo *MRI) const {
3566 int64_t Imm;
3567 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3568 return false;
3569
3570 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3571
3572 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3573
3574 unsigned Opc = UseMI.getOpcode();
3575 if (Opc == AMDGPU::COPY) {
3576 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3577
3578 Register DstReg = UseMI.getOperand(0).getReg();
3579 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3580
3581 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3582
3583 if (HasMultipleUses) {
3584 // TODO: This should fold in more cases with multiple uses, but we need to
3585 // more carefully consider what those uses are.
3586 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3587
3588 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3589 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3590 return false;
3591
3592 // Most of the time folding a 32-bit inline constant is free (though this
3593 // might not be true if we can't later fold it into a real user).
3594 //
3595 // FIXME: This isInlineConstant check is imprecise if
3596 // getConstValDefinedInReg handled the tricky non-mov cases.
3597 if (ImmDefSize == 32 &&
3598 isInlineConstant(Imm, AMDGPU::OPERAND_REG_IMM_INT32))
3599 return false;
3600 }
3601
3602 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3603 RI.getSubRegIdxSize(UseSubReg) == 16;
3604
3605 if (Is16Bit) {
3606 if (RI.hasVGPRs(DstRC))
3607 return false; // Do not clobber vgpr_hi16
3608
3609 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3610 return false;
3611 }
3612
3613 MachineFunction *MF = UseMI.getMF();
3614
3615 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3616 MCRegister MovDstPhysReg =
3617 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3618
3619 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3620
3621 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3622 for (unsigned MovOp :
3623 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3624 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3625 const MCInstrDesc &MovDesc = get(MovOp);
3626
3627 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI);
3628 if (Is16Bit) {
3629 // We just need to find a correctly sized register class, so the
3630 // subregister index compatibility doesn't matter since we're statically
3631 // extracting the immediate value.
3632 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3633 if (!MovDstRC)
3634 continue;
3635
3636 if (MovDstPhysReg) {
3637 // FIXME: We probably should not do this. If there is a live value in
3638 // the high half of the register, it will be corrupted.
3639 MovDstPhysReg =
3640 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3641 if (!MovDstPhysReg)
3642 continue;
3643 }
3644 }
3645
3646 // Result class isn't the right size, try the next instruction.
3647 if (MovDstPhysReg) {
3648 if (!MovDstRC->contains(MovDstPhysReg))
3649 return false;
3650 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3651 // TODO: This will be overly conservative in the case of 16-bit virtual
3652 // SGPRs. We could hack up the virtual register uses to use a compatible
3653 // 32-bit class.
3654 continue;
3655 }
3656
3657 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3658
3659 // Ensure the interpreted immediate value is a valid operand in the new
3660 // mov.
3661 //
3662 // FIXME: isImmOperandLegal should have a form that doesn't require an
3663 // existing MachineInstr or MachineOperand.
3664 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3665 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3666 break;
3667
3668 NewOpc = MovOp;
3669 break;
3670 }
3671
3672 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3673 return false;
3674
3675 if (Is16Bit) {
3676 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3677 if (MovDstPhysReg)
3678 UseMI.getOperand(0).setReg(MovDstPhysReg);
3679 assert(UseMI.getOperand(1).getReg().isVirtual());
3680 }
3681
3682 const MCInstrDesc &NewMCID = get(NewOpc);
3683 UseMI.setDesc(NewMCID);
3684 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3685 UseMI.addImplicitDefUseOperands(*MF);
3686 return true;
3687 }
3688
3689 if (HasMultipleUses)
3690 return false;
3691
3692 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3693 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3694 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3695 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3696 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3697 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3698 Opc == AMDGPU::V_FMAC_F64_e64) {
3699 // Don't fold if we are using source or output modifiers. The new VOP2
3700 // instructions don't have them.
3701 if (hasAnyModifiersSet(UseMI))
3702 return false;
3703
3704 // If this is a free constant, there's no reason to do this.
3705 // TODO: We could fold this here instead of letting SIFoldOperands do it
3706 // later.
3707 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3708
3709 // Any src operand can be used for the legality check.
3710 if (isInlineConstant(UseMI, Src0Idx, Imm))
3711 return false;
3712
3713 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3714
3715 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3716 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3717
3718 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3719 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3720 (Src1->isReg() && Src1->getReg() == Reg)) {
3721 MachineOperand *RegSrc =
3722 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3723 if (!RegSrc->isReg())
3724 return false;
3725 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3726 ST.getConstantBusLimit(Opc) < 2)
3727 return false;
3728
3729 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3730 return false;
3731
3732 // If src2 is also a literal constant then we have to choose which one to
3733 // fold. In general it is better to choose madak so that the other literal
3734 // can be materialized in an sgpr instead of a vgpr:
3735 // s_mov_b32 s0, literal
3736 // v_madak_f32 v0, s0, v0, literal
3737 // Instead of:
3738 // v_mov_b32 v1, literal
3739 // v_madmk_f32 v0, v0, literal, v1
3740 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3741 if (Def && Def->isMoveImmediate() &&
3742 !isInlineConstant(Def->getOperand(1)))
3743 return false;
3744
3745 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3746 if (pseudoToMCOpcode(NewOpc) == -1)
3747 return false;
3748
3749 // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3750 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3751 // restricting their register classes. For now just bail out.
3752 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3753 NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3754 return false;
3755
3756 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3757 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3758
3759 // FIXME: This would be a lot easier if we could return a new instruction
3760 // instead of having to modify in place.
3761
3762 Register SrcReg = RegSrc->getReg();
3763 unsigned SrcSubReg = RegSrc->getSubReg();
3764 Src0->setReg(SrcReg);
3765 Src0->setSubReg(SrcSubReg);
3766 Src0->setIsKill(RegSrc->isKill());
3767
3768 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3769 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3770 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3771 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3772 UseMI.untieRegOperand(
3773 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3774
3775 Src1->ChangeToImmediate(*SubRegImm);
3776
3777 removeModOperands(UseMI);
3778 UseMI.setDesc(get(NewOpc));
3779
3780 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3781 if (DeleteDef)
3782 DefMI.eraseFromParent();
3783
3784 return true;
3785 }
3786
3787 // Added part is the constant: Use v_madak_{f16, f32}.
3788 if (Src2->isReg() && Src2->getReg() == Reg) {
3789 if (ST.getConstantBusLimit(Opc) < 2) {
3790 // Not allowed to use constant bus for another operand.
3791 // We can however allow an inline immediate as src0.
3792 bool Src0Inlined = false;
3793 if (Src0->isReg()) {
3794 // Try to inline the constant if possible.
3795 // If the def is a move-immediate and this is its only use,
3796 // we save a VGPR here.
3797 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3798 if (Def && Def->isMoveImmediate() &&
3799 isInlineConstant(Def->getOperand(1)) &&
3800 MRI->hasOneNonDBGUse(Src0->getReg())) {
3801 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3802 Src0Inlined = true;
3803 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3804 RI.isSGPRReg(*MRI, Src0->getReg())) {
3805 return false;
3806 }
3807 // VGPR is okay as Src0 - fallthrough
3808 }
3809
3810 if (Src1->isReg() && !Src0Inlined) {
3811 // We have one slot for inlinable constant so far - try to fill it
3812 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3813 if (Def && Def->isMoveImmediate() &&
3814 isInlineConstant(Def->getOperand(1)) &&
3815 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3816 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3817 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3818 return false;
3819 // VGPR is okay as Src1 - fallthrough
3820 }
3821 }
3822
3823 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3824 if (pseudoToMCOpcode(NewOpc) == -1)
3825 return false;
3826
3827 // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3828 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3829 // restricting their register classes. For now just bail out.
3830 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3831 NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3832 return false;
3833
3834 // FIXME: This would be a lot easier if we could return a new instruction
3835 // instead of having to modify in place.
3836
3837 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3838 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3839 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3840 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3841 UseMI.untieRegOperand(
3842 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3843
3844 const std::optional<int64_t> SubRegImm =
3845 extractSubregFromImm(Imm, Src2->getSubReg());
3846
3847 // Calling ChangeToImmediate adds Src2 back to the instruction.
3848 Src2->ChangeToImmediate(*SubRegImm);
3849
3850 // These come before src2.
3851 removeModOperands(UseMI);
3852 UseMI.setDesc(get(NewOpc));
3853 // It might happen that UseMI was commuted
3854 // and we now have an SGPR as src1. If so, two inline
3855 // constants plus an SGPR would be illegal.
3856 legalizeOperands(UseMI);
3857
3858 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3859 if (DeleteDef)
3860 DefMI.eraseFromParent();
3861
3862 return true;
3863 }
3864 }
3865
3866 return false;
3867}
3868
3869 static bool
3870 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3871 ArrayRef<const MachineOperand *> BaseOps2) {
3872 if (BaseOps1.size() != BaseOps2.size())
3873 return false;
3874 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3875 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3876 return false;
3877 }
3878 return true;
3879}
3880
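// Two accesses are disjoint when the lower one ends at or before the higher
// one begins. For example, 4 bytes at offset 0 and 4 bytes at offset 4 do not
// overlap (0 + 4 <= 4), while 8 bytes at offset 0 and 4 bytes at offset 4 do.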
3881static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3882 LocationSize WidthB, int OffsetB) {
3883 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3884 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3885 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3886 return LowWidth.hasValue() &&
3887 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3888}
3889
3890bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3891 const MachineInstr &MIb) const {
3892 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3893 int64_t Offset0, Offset1;
3894 LocationSize Dummy0 = LocationSize::precise(0);
3895 LocationSize Dummy1 = LocationSize::precise(0);
3896 bool Offset0IsScalable, Offset1IsScalable;
3897 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3898 Dummy0, &RI) ||
3899 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3900 Dummy1, &RI))
3901 return false;
3902
3903 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3904 return false;
3905
3906 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3907 // FIXME: Handle ds_read2 / ds_write2.
3908 return false;
3909 }
3910 LocationSize Width0 = MIa.memoperands().front()->getSize();
3911 LocationSize Width1 = MIb.memoperands().front()->getSize();
3912 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3913}
3914
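// Conservatively decide whether two memory instructions provably access
// disjoint memory: either they live in different hardware address spaces
// (e.g. DS vs. MUBUF/SMRD), or they share the same base operands and their
// offset ranges do not overlap.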
3915 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3916 const MachineInstr &MIb) const {
3917 assert(MIa.mayLoadOrStore() &&
3918 "MIa must load from or modify a memory location");
3919 assert(MIb.mayLoadOrStore() &&
3920 "MIb must load from or modify a memory location");
3921
3922 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3923 return false;
3924
3925 // XXX - Can we relax this between address spaces?
3926 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3927 return false;
3928
3929 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3930 return false;
3931
3932 // TODO: Should we check the address space from the MachineMemOperand? That
3933 // would allow us to distinguish objects we know don't alias based on the
3934 // underlying address space, even if it was lowered to a different one,
3935 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3936 // buffer.
3937 if (isDS(MIa)) {
3938 if (isDS(MIb))
3939 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3940
3941 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3942 }
3943
3944 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3945 if (isMUBUF(MIb) || isMTBUF(MIb))
3946 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3947
3948 if (isFLAT(MIb))
3949 return isFLATScratch(MIb);
3950
3951 return !isSMRD(MIb);
3952 }
3953
3954 if (isSMRD(MIa)) {
3955 if (isSMRD(MIb))
3956 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3957
3958 if (isFLAT(MIb))
3959 return isFLATScratch(MIb);
3960
3961 return !isMUBUF(MIb) && !isMTBUF(MIb);
3962 }
3963
3964 if (isFLAT(MIa)) {
3965 if (isFLAT(MIb)) {
3966 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3967 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3968 return true;
3969
3970 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3971 }
3972
3973 return false;
3974 }
3975
3976 return false;
3977}
3978
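// Return the immediate materialized into \p Reg by a foldable copy
// (an S_MOV/V_MOV-style move of a constant), if any, optionally reporting the
// defining instruction through \p DefMI.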
3979 static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3980 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3981 if (Reg.isPhysical())
3982 return false;
3983 auto *Def = MRI.getUniqueVRegDef(Reg);
3984 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3985 Imm = Def->getOperand(1).getImm();
3986 if (DefMI)
3987 *DefMI = Def;
3988 return true;
3989 }
3990 return false;
3991}
3992
3993static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3994 MachineInstr **DefMI = nullptr) {
3995 if (!MO->isReg())
3996 return false;
3997 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3998 const MachineRegisterInfo &MRI = MF->getRegInfo();
3999 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
4000}
4001
4002 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
4003 MachineInstr &NewMI) {
4004 if (LV) {
4005 unsigned NumOps = MI.getNumOperands();
4006 for (unsigned I = 1; I < NumOps; ++I) {
4007 MachineOperand &Op = MI.getOperand(I);
4008 if (Op.isReg() && Op.isKill())
4009 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4010 }
4011 }
4012}
4013
4014static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4015 switch (Opc) {
4016 case AMDGPU::V_MAC_F16_e32:
4017 case AMDGPU::V_MAC_F16_e64:
4018 return AMDGPU::V_MAD_F16_e64;
4019 case AMDGPU::V_MAC_F32_e32:
4020 case AMDGPU::V_MAC_F32_e64:
4021 return AMDGPU::V_MAD_F32_e64;
4022 case AMDGPU::V_MAC_LEGACY_F32_e32:
4023 case AMDGPU::V_MAC_LEGACY_F32_e64:
4024 return AMDGPU::V_MAD_LEGACY_F32_e64;
4025 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4026 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4027 return AMDGPU::V_FMA_LEGACY_F32_e64;
4028 case AMDGPU::V_FMAC_F16_e32:
4029 case AMDGPU::V_FMAC_F16_e64:
4030 case AMDGPU::V_FMAC_F16_t16_e64:
4031 case AMDGPU::V_FMAC_F16_fake16_e64:
4032 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4033 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4034 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4035 : AMDGPU::V_FMA_F16_gfx9_e64;
4036 case AMDGPU::V_FMAC_F32_e32:
4037 case AMDGPU::V_FMAC_F32_e64:
4038 return AMDGPU::V_FMA_F32_e64;
4039 case AMDGPU::V_FMAC_F64_e32:
4040 case AMDGPU::V_FMAC_F64_e64:
4041 return AMDGPU::V_FMA_F64_e64;
4042 default:
4043 llvm_unreachable("invalid instruction");
4044 }
4045}
4046
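// Convert a two-address MAC/FMAC (or MFMA/WMMA) into its three-address
// MAD/FMA form. When one source is a foldable immediate, prefer the
// fmaak/fmamk encodings that carry the literal directly. LiveVariables and
// LiveIntervals are kept up to date when provided.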
4047 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4048 LiveVariables *LV,
4049 LiveIntervals *LIS) const {
4050 MachineBasicBlock &MBB = *MI.getParent();
4051 unsigned Opc = MI.getOpcode();
4052
4053 // Handle MFMA.
4054 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4055 if (NewMFMAOpc != -1) {
4056 MachineInstrBuilder MIB =
4057 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4058 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4059 MIB.add(MI.getOperand(I));
4060 updateLiveVariables(LV, MI, *MIB);
4061 if (LIS) {
4062 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4063 // SlotIndex of defs needs to be updated when converting to early-clobber
4064 MachineOperand &Def = MIB->getOperand(0);
4065 if (Def.isEarlyClobber() && Def.isReg() &&
4066 LIS->hasInterval(Def.getReg())) {
4067 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
4068 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
4069 auto &LI = LIS->getInterval(Def.getReg());
4070 auto UpdateDefIndex = [&](LiveRange &LR) {
4071 auto *S = LR.find(OldIndex);
4072 if (S != LR.end() && S->start == OldIndex) {
4073 assert(S->valno && S->valno->def == OldIndex);
4074 S->start = NewIndex;
4075 S->valno->def = NewIndex;
4076 }
4077 };
4078 UpdateDefIndex(LI);
4079 for (auto &SR : LI.subranges())
4080 UpdateDefIndex(SR);
4081 }
4082 }
4083 return MIB;
4084 }
4085
4086 if (SIInstrInfo::isWMMA(MI)) {
4087 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4088 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4089 .setMIFlags(MI.getFlags());
4090 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4091 MIB->addOperand(MI.getOperand(I));
4092
4093 updateLiveVariables(LV, MI, *MIB);
4094 if (LIS)
4095 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4096
4097 return MIB;
4098 }
4099
4100 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4101 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4102 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4103 "present pre-RA");
4104
4105 // Handle MAC/FMAC.
4106 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4107 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4108 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4109 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4110 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4111 bool Src0Literal = false;
4112
4113 switch (Opc) {
4114 default:
4115 return nullptr;
4116 case AMDGPU::V_MAC_F16_e64:
4117 case AMDGPU::V_FMAC_F16_e64:
4118 case AMDGPU::V_FMAC_F16_t16_e64:
4119 case AMDGPU::V_FMAC_F16_fake16_e64:
4120 case AMDGPU::V_MAC_F32_e64:
4121 case AMDGPU::V_MAC_LEGACY_F32_e64:
4122 case AMDGPU::V_FMAC_F32_e64:
4123 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4124 case AMDGPU::V_FMAC_F64_e64:
4125 break;
4126 case AMDGPU::V_MAC_F16_e32:
4127 case AMDGPU::V_FMAC_F16_e32:
4128 case AMDGPU::V_MAC_F32_e32:
4129 case AMDGPU::V_MAC_LEGACY_F32_e32:
4130 case AMDGPU::V_FMAC_F32_e32:
4131 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4132 case AMDGPU::V_FMAC_F64_e32: {
4133 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4134 AMDGPU::OpName::src0);
4135 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4136 if (!Src0->isReg() && !Src0->isImm())
4137 return nullptr;
4138
4139 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4140 Src0Literal = true;
4141
4142 break;
4143 }
4144 }
4145
4146 MachineInstrBuilder MIB;
4147 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4148 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4149 const MachineOperand *Src0Mods =
4150 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4151 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4152 const MachineOperand *Src1Mods =
4153 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4154 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4155 const MachineOperand *Src2Mods =
4156 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4157 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4158 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4159 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4160
4161 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4162 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4163 // If we have an SGPR input, we will violate the constant bus restriction.
4164 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4165 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4166 MachineInstr *DefMI;
4167 const auto killDef = [&]() -> void {
4168 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4169 // The only user is the instruction which will be killed.
4170 Register DefReg = DefMI->getOperand(0).getReg();
4171
4172 if (MRI.hasOneNonDBGUse(DefReg)) {
4173 // We cannot just remove the DefMI here, calling pass will crash.
4174 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
4175 DefMI->getOperand(0).setIsDead(true);
4176 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
4177 DefMI->removeOperand(I);
4178 if (LV)
4179 LV->getVarInfo(DefReg).AliveBlocks.clear();
4180 }
4181
4182 if (LIS) {
4183 LiveInterval &DefLI = LIS->getInterval(DefReg);
4184
4185 // We cannot delete the original instruction here, so hack out the use
4186 // in the original instruction with a dummy register so we can use
4187 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4188 // not have the complexity of deleting a use to consider here.
4189 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4190 for (MachineOperand &MIOp : MI.uses()) {
4191 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4192 MIOp.setIsUndef(true);
4193 MIOp.setReg(DummyReg);
4194 }
4195 }
4196
4197 LIS->shrinkToUses(&DefLI);
4198 }
4199 };
4200
4201 int64_t Imm;
4202 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4203 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4204 if (pseudoToMCOpcode(NewOpc) != -1) {
4205 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4206 .add(*Dst)
4207 .add(*Src0)
4208 .add(*Src1)
4209 .addImm(Imm)
4210 .setMIFlags(MI.getFlags());
4211 updateLiveVariables(LV, MI, *MIB);
4212 if (LIS)
4213 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4214 killDef();
4215 return MIB;
4216 }
4217 }
4218 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4219 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4220 if (pseudoToMCOpcode(NewOpc) != -1) {
4221 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4222 .add(*Dst)
4223 .add(*Src0)
4224 .addImm(Imm)
4225 .add(*Src2)
4226 .setMIFlags(MI.getFlags());
4227 updateLiveVariables(LV, MI, *MIB);
4228
4229 if (LIS)
4230 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4231 killDef();
4232 return MIB;
4233 }
4234 }
4235 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4236 if (Src0Literal) {
4237 Imm = Src0->getImm();
4238 DefMI = nullptr;
4239 }
4240 if (pseudoToMCOpcode(NewOpc) != -1 &&
4241 isOperandLegal(
4242 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4243 Src1)) {
4244 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4245 .add(*Dst)
4246 .add(*Src1)
4247 .addImm(Imm)
4248 .add(*Src2)
4249 .setMIFlags(MI.getFlags());
4250 updateLiveVariables(LV, MI, *MIB);
4251
4252 if (LIS)
4253 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4254 if (DefMI)
4255 killDef();
4256 return MIB;
4257 }
4258 }
4259 }
4260
4261 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4262 // if VOP3 does not allow a literal operand.
4263 if (Src0Literal && !ST.hasVOP3Literal())
4264 return nullptr;
4265
4266 unsigned NewOpc = getNewFMAInst(ST, Opc);
4267
4268 if (pseudoToMCOpcode(NewOpc) == -1)
4269 return nullptr;
4270
4271 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4272 .add(*Dst)
4273 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4274 .add(*Src0)
4275 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4276 .add(*Src1)
4277 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4278 .add(*Src2)
4279 .addImm(Clamp ? Clamp->getImm() : 0)
4280 .addImm(Omod ? Omod->getImm() : 0)
4281 .setMIFlags(MI.getFlags());
4282 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4283 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4284 updateLiveVariables(LV, MI, *MIB);
4285 if (LIS)
4286 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4287 return MIB;
4288}
4289
4290// It's not generally safe to move VALU instructions across these since it will
4291// start using the register as a base index rather than directly.
4292// XXX - Why isn't hasSideEffects sufficient for these?
4293 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4294 switch (MI.getOpcode()) {
4295 case AMDGPU::S_SET_GPR_IDX_ON:
4296 case AMDGPU::S_SET_GPR_IDX_MODE:
4297 case AMDGPU::S_SET_GPR_IDX_OFF:
4298 return true;
4299 default:
4300 return false;
4301 }
4302}
4303
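// Treat EXEC writes, GPR-indexing mode changes and SETREG/SETPRIO-style
// instructions as scheduling boundaries so VALU code is never moved across
// them.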
4304 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4305 const MachineBasicBlock *MBB,
4306 const MachineFunction &MF) const {
4307 // Skipping the check for SP writes in the base implementation. The reason it
4308 // was added was apparently compile-time concerns.
4309 //
4310 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4311 // but is probably avoidable.
4312
4313 // Copied from base implementation.
4314 // Terminators and labels can't be scheduled around.
4315 if (MI.isTerminator() || MI.isPosition())
4316 return true;
4317
4318 // INLINEASM_BR can jump to another block
4319 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4320 return true;
4321
4322 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4323 return true;
4324
4325 // Target-independent instructions do not have an implicit-use of EXEC, even
4326 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4327 // boundaries prevents incorrect movements of such instructions.
4328 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4329 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4330 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4331 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4332 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4333 changesVGPRIndexingMode(MI);
4334}
4335
4336 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4337 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4338 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4339 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4340}
4341
4342 bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
4343 if (!isFLAT(MI) || isFLATGlobal(MI))
4344 return false;
4345
4346 // If scratch is not initialized, we can never access it.
4347 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4348 return false;
4349
4350 // SCRATCH instructions always access scratch.
4351 if (isFLATScratch(MI))
4352 return true;
4353
4354 // If there are no memory operands then conservatively assume the flat
4355 // operation may access scratch.
4356 if (MI.memoperands_empty())
4357 return true;
4358
4359 // See if any memory operand specifies an address space that involves scratch.
4360 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4361 unsigned AS = Memop->getAddrSpace();
4362 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4363 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4364 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4365 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4366 }
4367 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4368 });
4369}
4370
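// A FLAT instruction counts against the VMEM counter unless its memory
// operands show it can only access LDS.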
4371 bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
4372 assert(isFLAT(MI));
4373
4374 // All flat instructions use the VMEM counter except prefetch.
4375 if (!usesVM_CNT(MI))
4376 return false;
4377
4378 // If there are no memory operands then conservatively assume the flat
4379 // operation may access VMEM.
4380 if (MI.memoperands_empty())
4381 return true;
4382
4383 // See if any memory operand specifies an address space that involves VMEM.
4384 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
4385 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4386 // (GDS) address space is not supported by flat operations. Therefore, simply
4387 // return true unless only the LDS address space is found.
4388 for (const MachineMemOperand *Memop : MI.memoperands()) {
4389 unsigned AS = Memop->getAddrSpace();
4391 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4392 return true;
4393 }
4394
4395 return false;
4396}
4397
4398 bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
4399 assert(isFLAT(MI));
4400
4401 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
4402 if (!usesLGKM_CNT(MI))
4403 return false;
4404
4405 // If in tgsplit mode then there can be no use of LDS.
4406 if (ST.isTgSplitEnabled())
4407 return false;
4408
4409 // If there are no memory operands then conservatively assume the flat
4410 // operation may access LDS.
4411 if (MI.memoperands_empty())
4412 return true;
4413
4414 // See if any memory operand specifies an address space that involves LDS.
4415 for (const MachineMemOperand *Memop : MI.memoperands()) {
4416 unsigned AS = Memop->getAddrSpace();
4417 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
4418 return true;
4419 }
4420
4421 return false;
4422}
4423
4424 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4425 // Skip the full operand and register alias search that modifiesRegister
4426 // does. There are only a handful of instructions that touch this, it's only
4427 // an implicit def, and it doesn't alias any other registers.
4428 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4429}
4430
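// Whether executing \p MI with EXEC = 0 could still have visible effects:
// scalar memory stores, returns, message sends, traps, barriers, mode changes,
// readlane/writelane-style operations, calls and inline asm all qualify.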
4431 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4432 unsigned Opcode = MI.getOpcode();
4433
4434 if (MI.mayStore() && isSMRD(MI))
4435 return true; // scalar store or atomic
4436
4437 // This will terminate the function when other lanes may need to continue.
4438 if (MI.isReturn())
4439 return true;
4440
4441 // These instructions cause shader I/O that may cause hardware lockups
4442 // when executed with an empty EXEC mask.
4443 //
4444 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4445 // EXEC = 0, but checking for that case here seems not worth it
4446 // given the typical code patterns.
4447 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4448 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4449 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4450 return true;
4451
4452 if (MI.isCall() || MI.isInlineAsm())
4453 return true; // conservative assumption
4454
4455 // Assume that barrier interactions are only intended with active lanes.
4456 if (isBarrier(Opcode))
4457 return true;
4458
4459 // A mode change is a scalar operation that influences vector instructions.
4460 if (modifiesModeRegister(MI))
4461 return true;
4462
4463 // These are like SALU instructions in terms of effects, so it's questionable
4464 // whether we should return true for those.
4465 //
4466 // However, executing them with EXEC = 0 causes them to operate on undefined
4467 // data, which we avoid by returning true here.
4468 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4469 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4470 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4471 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4472 return true;
4473
4474 return false;
4475}
4476
4477 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4478 const MachineInstr &MI) const {
4479 if (MI.isMetaInstruction())
4480 return false;
4481
4482 // This won't read exec if this is an SGPR->SGPR copy.
4483 if (MI.isCopyLike()) {
4484 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4485 return true;
4486
4487 // Make sure this isn't copying exec as a normal operand
4488 return MI.readsRegister(AMDGPU::EXEC, &RI);
4489 }
4490
4491 // Make a conservative assumption about the callee.
4492 if (MI.isCall())
4493 return true;
4494
4495 // Be conservative with any unhandled generic opcodes.
4496 if (!isTargetSpecificOpcode(MI.getOpcode()))
4497 return true;
4498
4499 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4500}
4501
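// Whether the bit pattern in \p Imm is encodable as a hardware inline constant
// for its bit width; for 32-bit operands this covers the small integers
// -16..64 and a handful of float bit patterns (0.5, 1.0, 2.0, 4.0, their
// negations, and 1/(2*pi) on subtargets that support it), which need no
// literal slot.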
4502bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4503 switch (Imm.getBitWidth()) {
4504 case 1: // This likely will be a condition code mask.
4505 return true;
4506
4507 case 32:
4508 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4509 ST.hasInv2PiInlineImm());
4510 case 64:
4511 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4512 ST.hasInv2PiInlineImm());
4513 case 16:
4514 return ST.has16BitInsts() &&
4515 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4516 ST.hasInv2PiInlineImm());
4517 default:
4518 llvm_unreachable("invalid bitwidth");
4519 }
4520}
4521
4522 bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4523 APInt IntImm = Imm.bitcastToAPInt();
4524 int64_t IntImmVal = IntImm.getSExtValue();
4525 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4526 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4527 default:
4528 llvm_unreachable("invalid fltSemantics");
4529 case APFloat::S_IEEEsingle:
4530 case APFloat::S_IEEEdouble:
4531 return isInlineConstant(IntImm);
4532 case APFloat::S_BFloat:
4533 return ST.has16BitInsts() &&
4534 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4535 case APFloat::S_IEEEhalf:
4536 return ST.has16BitInsts() &&
4537 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4538 }
4539}
4540
4541bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4542 // MachineOperand provides no way to tell the true operand size, since it only
4543 // records a 64-bit value. We need to know the size to determine if a 32-bit
4544 // floating point immediate bit pattern is legal for an integer immediate. It
4545 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4546 switch (OperandType) {
4556 int32_t Trunc = static_cast<int32_t>(Imm);
4557 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4558 }
4564 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4567 // We would expect inline immediates to not be concerned with an integer/fp
4568 // distinction. However, in the case of 16-bit integer operations, the
4569 // "floating point" values appear to not work. It seems read the low 16-bits
4570 // of 32-bit immediates, which happens to always work for the integer
4571 // values.
4572 //
4573 // See llvm bugzilla 46302.
4574 //
4575 // TODO: Theoretically we could use op-sel to use the high bits of the
4576 // 32-bit FP values.
4588 return false;
4591 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4592 // A few special case instructions have 16-bit operands on subtargets
4593 // where 16-bit instructions are not legal.
4594 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4595 // constants in these cases
4596 int16_t Trunc = static_cast<int16_t>(Imm);
4597 return ST.has16BitInsts() &&
4598 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4599 }
4600
4601 return false;
4602 }
4605 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4606 int16_t Trunc = static_cast<int16_t>(Imm);
4607 return ST.has16BitInsts() &&
4608 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4609 }
4610 return false;
4611 }
4615 return false;
4617 return isLegalAV64PseudoImm(Imm);
4620 // Always embedded in the instruction for free.
4621 return true;
4631 // Just ignore anything else.
4632 return true;
4633 default:
4634 llvm_unreachable("invalid operand type");
4635 }
4636}
4637
4638static bool compareMachineOp(const MachineOperand &Op0,
4639 const MachineOperand &Op1) {
4640 if (Op0.getType() != Op1.getType())
4641 return false;
4642
4643 switch (Op0.getType()) {
4645 return Op0.getReg() == Op1.getReg();
4647 return Op0.getImm() == Op1.getImm();
4648 default:
4649 llvm_unreachable("Didn't expect to be comparing these operand types");
4650 }
4651}
4652
4653 bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4654 const MCOperandInfo &OpInfo) const {
4655 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4656 return true;
4657
4658 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4659 return false;
4660
4661 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4662 return true;
4663
4664 return ST.hasVOP3Literal();
4665}
4666
4667bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4668 int64_t ImmVal) const {
4669 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4670 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4671 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4672 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4673 AMDGPU::OpName::src2))
4674 return false;
4675 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4676 }
4677
4678 return isLiteralOperandLegal(InstDesc, OpInfo);
4679}
4680
4681bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4682 const MachineOperand &MO) const {
4683 if (MO.isImm())
4684 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4685
4686 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4687 "unexpected imm-like operand kind");
4688 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4689 return isLiteralOperandLegal(InstDesc, OpInfo);
4690}
4691
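// A 64-bit immediate is legal for the AV 64-bit move pseudo iff each 32-bit
// half is independently an inline constant. For example 0x0000004000000001
// (halves 64 and 1) qualifies, while a half of 256 would force a literal and
// does not.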
4692 bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
4693 // 2 32-bit inline constants packed into one.
4694 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4695 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4696}
4697
4698bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4699 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4700 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4701 return false;
4702
4703 int Op32 = AMDGPU::getVOPe32(Opcode);
4704 if (Op32 == -1)
4705 return false;
4706
4707 return pseudoToMCOpcode(Op32) != -1;
4708}
4709
4710bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4711 // The src0_modifiers operand is present on all instructions
4712 // that have modifiers.
4713
4714 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4715}
4716
4717 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4718 AMDGPU::OpName OpName) const {
4719 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4720 return Mods && Mods->getImm();
4721}
4722
4723 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4724 return any_of(ModifierOpNames,
4725 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4726}
4727
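// Whether this VOP3-encoded instruction may be rewritten with its 32-bit
// encoding: no source or output modifiers may be set, src1/src2 must be VGPRs,
// and a 32-bit encoding must exist for the opcode.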
4728 bool SIInstrInfo::canShrink(const MachineInstr &MI,
4729 const MachineRegisterInfo &MRI) const {
4730 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4731 // Can't shrink instruction with three operands.
4732 if (Src2) {
4733 switch (MI.getOpcode()) {
4734 default: return false;
4735
4736 case AMDGPU::V_ADDC_U32_e64:
4737 case AMDGPU::V_SUBB_U32_e64:
4738 case AMDGPU::V_SUBBREV_U32_e64: {
4739 const MachineOperand *Src1
4740 = getNamedOperand(MI, AMDGPU::OpName::src1);
4741 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4742 return false;
4743 // Additional verification is needed for sdst/src2.
4744 return true;
4745 }
4746 case AMDGPU::V_MAC_F16_e64:
4747 case AMDGPU::V_MAC_F32_e64:
4748 case AMDGPU::V_MAC_LEGACY_F32_e64:
4749 case AMDGPU::V_FMAC_F16_e64:
4750 case AMDGPU::V_FMAC_F16_t16_e64:
4751 case AMDGPU::V_FMAC_F16_fake16_e64:
4752 case AMDGPU::V_FMAC_F32_e64:
4753 case AMDGPU::V_FMAC_F64_e64:
4754 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4755 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4756 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4757 return false;
4758 break;
4759
4760 case AMDGPU::V_CNDMASK_B32_e64:
4761 break;
4762 }
4763 }
4764
4765 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4766 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4767 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4768 return false;
4769
4770 // We don't need to check src0, all input types are legal, so just make sure
4771 // src0 isn't using any modifiers.
4772 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4773 return false;
4774
4775 // Can it be shrunk to a valid 32 bit opcode?
4776 if (!hasVALU32BitEncoding(MI.getOpcode()))
4777 return false;
4778
4779 // Check output modifiers
4780 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4781 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4782 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4783 // TODO: Can we avoid checking bound_ctrl/fi here?
4784 // They are only used by permlane*_swap special case.
4785 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4786 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4787}
4788
4789// Set VCC operand with all flags from \p Orig, except for setting it as
4790// implicit.
4791 static void copyFlagsToImplicitVCC(MachineInstr &MI,
4792 const MachineOperand &Orig) {
4793
4794 for (MachineOperand &Use : MI.implicit_operands()) {
4795 if (Use.isUse() &&
4796 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4797 Use.setIsUndef(Orig.isUndef());
4798 Use.setIsKill(Orig.isKill());
4799 return;
4800 }
4801 }
4802}
4803
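// Rebuild \p MI with its 32-bit encoding \p Op32, dropping what the short form
// cannot express: the trailing SGPR def in the VOP3->VOPC case, and an
// explicit src2 that becomes an implicit VCC read for V_CNDMASK_B32_e32.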
4804 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4805 unsigned Op32) const {
4806 MachineBasicBlock *MBB = MI.getParent();
4807
4808 const MCInstrDesc &Op32Desc = get(Op32);
4809 MachineInstrBuilder Inst32 =
4810 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4811 .setMIFlags(MI.getFlags());
4812
4813 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4814 // For VOPC instructions, this is replaced by an implicit def of vcc.
4815
4816 // We assume the defs of the shrunk opcode are in the same order, and the
4817 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4818 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4819 Inst32.add(MI.getOperand(I));
4820
4821 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4822
4823 int Idx = MI.getNumExplicitDefs();
4824 for (const MachineOperand &Use : MI.explicit_uses()) {
4825 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4826 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4827 continue;
4828
4829 if (&Use == Src2) {
4830 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4831 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4832 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4833 // of vcc was already added during the initial BuildMI, but we
4834 // 1) may need to change vcc to vcc_lo to preserve the original register
4835 // 2) have to preserve the original flags.
4836 copyFlagsToImplicitVCC(*Inst32, *Src2);
4837 continue;
4838 }
4839 }
4840
4841 Inst32.add(Use);
4842 }
4843
4844 // FIXME: Losing implicit operands
4845 fixImplicitOperands(*Inst32);
4846 return Inst32;
4847}
4848
4849 bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
4850 // Null is free
4851 Register Reg = RegOp.getReg();
4852 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
4853 return false;
4854
4855 // SGPRs use the constant bus
4856
4857 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
4858 // physical register operands should also count, except for exec.
4859 if (RegOp.isImplicit())
4860 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
4861
4862 // SGPRs use the constant bus
4863 return AMDGPU::SReg_32RegClass.contains(Reg) ||
4864 AMDGPU::SReg_64RegClass.contains(Reg);
4865}
4866
4867 bool SIInstrInfo::regUsesConstantBus(const MachineOperand &RegOp,
4868 const MachineRegisterInfo &MRI) const {
4869 Register Reg = RegOp.getReg();
4870 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4871 : physRegUsesConstantBus(RegOp);
4872}
4873
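// An operand consumes a constant bus slot if it is a literal that is not an
// inline constant, or an SGPR-class register (including VCC, M0 and FLAT_SCR).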
4874 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4875 const MachineOperand &MO,
4876 const MCOperandInfo &OpInfo) const {
4877 // Literal constants use the constant bus.
4878 if (!MO.isReg())
4879 return !isInlineConstant(MO, OpInfo);
4880
4881 Register Reg = MO.getReg();
4882 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4883 : physRegUsesConstantBus(MO);
4884}
4885
4886 static Register findImplicitSGPRRead(const MachineInstr &MI) {
4887 for (const MachineOperand &MO : MI.implicit_operands()) {
4888 // We only care about reads.
4889 if (MO.isDef())
4890 continue;
4891
4892 switch (MO.getReg()) {
4893 case AMDGPU::VCC:
4894 case AMDGPU::VCC_LO:
4895 case AMDGPU::VCC_HI:
4896 case AMDGPU::M0:
4897 case AMDGPU::FLAT_SCR:
4898 return MO.getReg();
4899
4900 default:
4901 break;
4902 }
4903 }
4904
4905 return Register();
4906}
4907
4908static bool shouldReadExec(const MachineInstr &MI) {
4909 if (SIInstrInfo::isVALU(MI)) {
4910 switch (MI.getOpcode()) {
4911 case AMDGPU::V_READLANE_B32:
4912 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4913 case AMDGPU::V_WRITELANE_B32:
4914 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4915 return false;
4916 }
4917
4918 return true;
4919 }
4920
4921 if (MI.isPreISelOpcode() ||
4922 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4923 SIInstrInfo::isSALU(MI) ||
4924 SIInstrInfo::isSMRD(MI))
4925 return false;
4926
4927 return true;
4928}
4929
4930static bool isRegOrFI(const MachineOperand &MO) {
4931 return MO.isReg() || MO.isFI();
4932}
4933
4934static bool isSubRegOf(const SIRegisterInfo &TRI,
4935 const MachineOperand &SuperVec,
4936 const MachineOperand &SubReg) {
4937 if (SubReg.getReg().isPhysical())
4938 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4939
4940 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4941 SubReg.getReg() == SuperVec.getReg();
4942}
4943
4944// Verify the illegal copy from vector register to SGPR for generic opcode COPY
4945bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
4946 const MachineRegisterInfo &MRI,
4947 StringRef &ErrInfo) const {
4948 Register DstReg = MI.getOperand(0).getReg();
4949 Register SrcReg = MI.getOperand(1).getReg();
4950 // This is a check for copy from vector register to SGPR
4951 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
4952 ErrInfo = "illegal copy from vector register to SGPR";
4953 return false;
4954 }
4955 return true;
4956}
4957
4958 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4959 StringRef &ErrInfo) const {
4960 uint16_t Opcode = MI.getOpcode();
4961 const MachineFunction *MF = MI.getParent()->getParent();
4962 const MachineRegisterInfo &MRI = MF->getRegInfo();
4963
4964 // FIXME: At this point the COPY verify is done only for non-ssa forms.
4965 // Find a better property to recognize the point where instruction selection
4966 // is just done.
4967 // We can only enforce this check after SIFixSGPRCopies pass so that the
4968 // illegal copies are legalized and thereafter we don't expect a pass
4969 // inserting similar copies.
4970 if (!MRI.isSSA() && MI.isCopy())
4971 return verifyCopy(MI, MRI, ErrInfo);
4972
4973 if (SIInstrInfo::isGenericOpcode(Opcode))
4974 return true;
4975
4976 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4977 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4978 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4979 int Src3Idx = -1;
4980 if (Src0Idx == -1) {
4981 // VOPD V_DUAL_* instructions use different operand names.
4982 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4983 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4984 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4985 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4986 }
4987
4988 // Make sure the number of operands is correct.
4989 const MCInstrDesc &Desc = get(Opcode);
4990 if (!Desc.isVariadic() &&
4991 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4992 ErrInfo = "Instruction has wrong number of operands.";
4993 return false;
4994 }
4995
4996 if (MI.isInlineAsm()) {
4997 // Verify register classes for inlineasm constraints.
4998 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4999 I != E; ++I) {
5000 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
5001 if (!RC)
5002 continue;
5003
5004 const MachineOperand &Op = MI.getOperand(I);
5005 if (!Op.isReg())
5006 continue;
5007
5008 Register Reg = Op.getReg();
5009 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5010 ErrInfo = "inlineasm operand has incorrect register class.";
5011 return false;
5012 }
5013 }
5014
5015 return true;
5016 }
5017
5018 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5019 ErrInfo = "missing memory operand from image instruction.";
5020 return false;
5021 }
5022
5023 // Make sure the register classes are correct.
5024 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5025 const MachineOperand &MO = MI.getOperand(i);
5026 if (MO.isFPImm()) {
5027 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5028 "all fp values to integers.";
5029 return false;
5030 }
5031
5032 const MCOperandInfo &OpInfo = Desc.operands()[i];
5033 int16_t RegClass = getOpRegClassID(OpInfo);
5034
5035 switch (OpInfo.OperandType) {
5037 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5038 ErrInfo = "Illegal immediate value for operand.";
5039 return false;
5040 }
5041 break;
5054 break;
5056 break;
5057 break;
5071 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5072 ErrInfo = "Illegal immediate value for operand.";
5073 return false;
5074 }
5075 break;
5076 }
5078 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5079 ErrInfo = "Expected inline constant for operand.";
5080 return false;
5081 }
5082 break;
5086 break;
5091 // Check if this operand is an immediate.
5092 // FrameIndex operands will be replaced by immediates, so they are
5093 // allowed.
5094 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5095 ErrInfo = "Expected immediate, but got non-immediate";
5096 return false;
5097 }
5098 break;
5102 break;
5103 default:
5104 if (OpInfo.isGenericType())
5105 continue;
5106 break;
5107 }
5108
5109 if (!MO.isReg())
5110 continue;
5111 Register Reg = MO.getReg();
5112 if (!Reg)
5113 continue;
5114
5115 // FIXME: Ideally we would have separate instruction definitions with the
5116 // aligned register constraint.
5117 // FIXME: We do not verify inline asm operands, but custom inline asm
5118 // verification is broken anyway
5119 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
5120 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5121 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5122 if (const TargetRegisterClass *SubRC =
5123 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5124 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5125 if (RC)
5126 RC = SubRC;
5127 }
5128 }
5129
5130 // Check that this is the aligned version of the class.
5131 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5132 ErrInfo = "Subtarget requires even aligned vector registers";
5133 return false;
5134 }
5135 }
5136
5137 if (RegClass != -1) {
5138 if (Reg.isVirtual())
5139 continue;
5140
5141 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5142 if (!RC->contains(Reg)) {
5143 ErrInfo = "Operand has incorrect register class.";
5144 return false;
5145 }
5146 }
5147 }
5148
5149 // Verify SDWA
5150 if (isSDWA(MI)) {
5151 if (!ST.hasSDWA()) {
5152 ErrInfo = "SDWA is not supported on this target";
5153 return false;
5154 }
5155
5156 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5157 AMDGPU::OpName::dst_sel}) {
5158 const MachineOperand *MO = getNamedOperand(MI, Op);
5159 if (!MO)
5160 continue;
5161 int64_t Imm = MO->getImm();
5162 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5163 ErrInfo = "Invalid SDWA selection";
5164 return false;
5165 }
5166 }
5167
5168 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5169
5170 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5171 if (OpIdx == -1)
5172 continue;
5173 const MachineOperand &MO = MI.getOperand(OpIdx);
5174
5175 if (!ST.hasSDWAScalar()) {
5176 // Only VGPRs on VI
5177 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5178 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5179 return false;
5180 }
5181 } else {
5182 // No immediates on GFX9
5183 if (!MO.isReg()) {
5184 ErrInfo =
5185 "Only reg allowed as operands in SDWA instructions on GFX9+";
5186 return false;
5187 }
5188 }
5189 }
5190
5191 if (!ST.hasSDWAOmod()) {
5192 // No omod allowed on VI
5193 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5194 if (OMod != nullptr &&
5195 (!OMod->isImm() || OMod->getImm() != 0)) {
5196 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5197 return false;
5198 }
5199 }
5200
5201 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5202 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5203 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5204 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5205 const MachineOperand *Src0ModsMO =
5206 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5207 unsigned Mods = Src0ModsMO->getImm();
5208 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5209 Mods & SISrcMods::SEXT) {
5210 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5211 return false;
5212 }
5213 }
5214
5215 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5216 if (isVOPC(BasicOpcode)) {
5217 if (!ST.hasSDWASdst() && DstIdx != -1) {
5218 // Only vcc allowed as dst on VI for VOPC
5219 const MachineOperand &Dst = MI.getOperand(DstIdx);
5220 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5221 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5222 return false;
5223 }
5224 } else if (!ST.hasSDWAOutModsVOPC()) {
5225 // No clamp allowed on GFX9 for VOPC
5226 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5227 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5228 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5229 return false;
5230 }
5231
5232 // No omod allowed on GFX9 for VOPC
5233 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5234 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5235 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5236 return false;
5237 }
5238 }
5239 }
5240
5241 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5242 if (DstUnused && DstUnused->isImm() &&
5243 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5244 const MachineOperand &Dst = MI.getOperand(DstIdx);
5245 if (!Dst.isReg() || !Dst.isTied()) {
5246 ErrInfo = "Dst register should have tied register";
5247 return false;
5248 }
5249
5250 const MachineOperand &TiedMO =
5251 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5252 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5253 ErrInfo =
5254 "Dst register should be tied to implicit use of preserved register";
5255 return false;
5256 }
5257 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5258 ErrInfo = "Dst register should use same physical register as preserved";
5259 return false;
5260 }
5261 }
5262 }
5263
5264 // Verify MIMG / VIMAGE / VSAMPLE
5265 if (isImage(Opcode) && !MI.mayStore()) {
5266 // Ensure that the return type used is large enough for all the options
5267 // being used. TFE/LWE require an extra result register.
5268 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5269 if (DMask) {
5270 uint64_t DMaskImm = DMask->getImm();
5271 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5272 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5273 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5274 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5275
5276 // Adjust for packed 16 bit values
5277 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5278 RegCount = divideCeil(RegCount, 2);
5279
5280 // Adjust if using LWE or TFE
5281 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5282 RegCount += 1;
5283
5284 const uint32_t DstIdx =
5285 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5286 const MachineOperand &Dst = MI.getOperand(DstIdx);
5287 if (Dst.isReg()) {
5288 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5289 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5290 if (RegCount > DstSize) {
5291 ErrInfo = "Image instruction returns too many registers for dst "
5292 "register class";
5293 return false;
5294 }
5295 }
5296 }
5297 }
5298
5299 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5300 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5301 unsigned ConstantBusCount = 0;
5302 bool UsesLiteral = false;
5303 const MachineOperand *LiteralVal = nullptr;
5304
5305 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5306 if (ImmIdx != -1) {
5307 ++ConstantBusCount;
5308 UsesLiteral = true;
5309 LiteralVal = &MI.getOperand(ImmIdx);
5310 }
5311
5312 SmallVector<Register, 2> SGPRsUsed;
5313 Register SGPRUsed;
5314
5315 // Only look at the true operands. Only a real operand can use the constant
5316 // bus, and we don't want to check pseudo-operands like the source modifier
5317 // flags.
5318 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5319 if (OpIdx == -1)
5320 continue;
5321 const MachineOperand &MO = MI.getOperand(OpIdx);
5322 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5323 if (MO.isReg()) {
5324 SGPRUsed = MO.getReg();
5325 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5326 ++ConstantBusCount;
5327 SGPRsUsed.push_back(SGPRUsed);
5328 }
5329 } else if (!MO.isFI()) { // Treat FI like a register.
5330 if (!UsesLiteral) {
5331 ++ConstantBusCount;
5332 UsesLiteral = true;
5333 LiteralVal = &MO;
5334 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5335 assert(isVOP2(MI) || isVOP3(MI));
5336 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5337 return false;
5338 }
5339 }
5340 }
5341 }
5342
5343 SGPRUsed = findImplicitSGPRRead(MI);
5344 if (SGPRUsed) {
5345 // Implicit uses may safely overlap true operands
5346 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5347 return !RI.regsOverlap(SGPRUsed, SGPR);
5348 })) {
5349 ++ConstantBusCount;
5350 SGPRsUsed.push_back(SGPRUsed);
5351 }
5352 }
5353
5354 // v_writelane_b32 is an exception to the constant bus restriction:
5355 // vsrc0 may be an SGPR, a constant or m0, and the lane select an SGPR, m0 or an inline constant.
5356 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5357 Opcode != AMDGPU::V_WRITELANE_B32) {
5358 ErrInfo = "VOP* instruction violates constant bus restriction";
5359 return false;
5360 }
5361
5362 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5363 ErrInfo = "VOP3 instruction uses literal";
5364 return false;
5365 }
5366 }
5367
5368 // Special case for writelane - this can break the multiple constant bus rule,
5369 // but still can't use more than one SGPR register
5370 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5371 unsigned SGPRCount = 0;
5372 Register SGPRUsed;
5373
5374 for (int OpIdx : {Src0Idx, Src1Idx}) {
5375 if (OpIdx == -1)
5376 break;
5377
5378 const MachineOperand &MO = MI.getOperand(OpIdx);
5379
5380 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5381 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5382 if (MO.getReg() != SGPRUsed)
5383 ++SGPRCount;
5384 SGPRUsed = MO.getReg();
5385 }
5386 }
5387 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5388 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5389 return false;
5390 }
5391 }
5392 }
5393
5394 // Verify misc. restrictions on specific instructions.
5395 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5396 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5397 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5398 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5399 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5400 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5401 if (!compareMachineOp(Src0, Src1) &&
5402 !compareMachineOp(Src0, Src2)) {
5403 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5404 return false;
5405 }
5406 }
5407 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5408 SISrcMods::ABS) ||
5409 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5410 SISrcMods::ABS) ||
5411 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5412 SISrcMods::ABS)) {
5413 ErrInfo = "ABS not allowed in VOP3B instructions";
5414 return false;
5415 }
5416 }
5417
5418 if (isSOP2(MI) || isSOPC(MI)) {
5419 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5420 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5421
5422 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5423 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5424 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5425 !Src0.isIdenticalTo(Src1)) {
5426 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5427 return false;
5428 }
5429 }
5430
5431 if (isSOPK(MI)) {
5432 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5433 if (Desc.isBranch()) {
5434 if (!Op->isMBB()) {
5435 ErrInfo = "invalid branch target for SOPK instruction";
5436 return false;
5437 }
5438 } else {
5439 uint64_t Imm = Op->getImm();
5440 if (sopkIsZext(Opcode)) {
5441 if (!isUInt<16>(Imm)) {
5442 ErrInfo = "invalid immediate for SOPK instruction";
5443 return false;
5444 }
5445 } else {
5446 if (!isInt<16>(Imm)) {
5447 ErrInfo = "invalid immediate for SOPK instruction";
5448 return false;
5449 }
5450 }
5451 }
5452 }
5453
5454 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5455 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5456 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5457 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5458 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5459 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5460
5461 const unsigned StaticNumOps =
5462 Desc.getNumOperands() + Desc.implicit_uses().size();
5463 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5464
5465 // Allow additional implicit operands. This allows a fixup done by the post
5466 // RA scheduler where the main implicit operand is killed and implicit-defs
5467 // are added for sub-registers that remain live after this instruction.
5468 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5469 ErrInfo = "missing implicit register operands";
5470 return false;
5471 }
5472
5473 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5474 if (IsDst) {
5475 if (!Dst->isUse()) {
5476 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5477 return false;
5478 }
5479
5480 unsigned UseOpIdx;
5481 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5482 UseOpIdx != StaticNumOps + 1) {
5483 ErrInfo = "movrel implicit operands should be tied";
5484 return false;
5485 }
5486 }
5487
5488 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5489 const MachineOperand &ImpUse
5490 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5491 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5492 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5493 ErrInfo = "src0 should be subreg of implicit vector use";
5494 return false;
5495 }
5496 }
5497
5498 // Make sure we aren't losing exec uses in the td files. This mostly requires
5499 // being careful when using let Uses to try to add other use registers.
5500 if (shouldReadExec(MI)) {
5501 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5502 ErrInfo = "VALU instruction does not implicitly read exec mask";
5503 return false;
5504 }
5505 }
5506
5507 if (isSMRD(MI)) {
5508 if (MI.mayStore() &&
5509 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5510 // The register offset form of scalar stores may only use m0 as the
5511 // soffset register.
5512 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5513 if (Soff && Soff->getReg() != AMDGPU::M0) {
5514 ErrInfo = "scalar stores must use m0 as offset register";
5515 return false;
5516 }
5517 }
5518 }
5519
5520 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5521 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5522 if (Offset->getImm() != 0) {
5523 ErrInfo = "subtarget does not support offsets in flat instructions";
5524 return false;
5525 }
5526 }
5527
5528 if (isDS(MI) && !ST.hasGDS()) {
5529 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5530 if (GDSOp && GDSOp->getImm() != 0) {
5531 ErrInfo = "GDS is not supported on this subtarget";
5532 return false;
5533 }
5534 }
5535
5536 if (isImage(MI)) {
5537 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5538 if (DimOp) {
5539 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5540 AMDGPU::OpName::vaddr0);
5541 AMDGPU::OpName RSrcOpName =
5542 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5543 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5544 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5545 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5546 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5547 const AMDGPU::MIMGDimInfo *Dim =
5548 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5549
5550 if (!Dim) {
5551 ErrInfo = "dim is out of range";
5552 return false;
5553 }
5554
5555 bool IsA16 = false;
5556 if (ST.hasR128A16()) {
5557 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5558 IsA16 = R128A16->getImm() != 0;
5559 } else if (ST.hasA16()) {
5560 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5561 IsA16 = A16->getImm() != 0;
5562 }
5563
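// NSA (non-sequential address) encoding keeps each address component in its
// own operand, so a gap of more than one operand between vaddr0 and the
// resource descriptor implies the NSA form.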
5564 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5565
5566 unsigned AddrWords =
5567 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5568
5569 unsigned VAddrWords;
5570 if (IsNSA) {
5571 VAddrWords = RsrcIdx - VAddr0Idx;
5572 if (ST.hasPartialNSAEncoding() &&
5573 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5574 unsigned LastVAddrIdx = RsrcIdx - 1;
5575 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5576 }
5577 } else {
5578 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5579 if (AddrWords > 12)
5580 AddrWords = 16;
5581 }
5582
5583 if (VAddrWords != AddrWords) {
5584 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5585 << " but got " << VAddrWords << "\n");
5586 ErrInfo = "bad vaddr size";
5587 return false;
5588 }
5589 }
5590 }
5591
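// Verify that the dpp_ctrl immediate encodes a defined pattern and one that
// is supported by this subtarget's generation.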
5592 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5593 if (DppCt) {
5594 using namespace AMDGPU::DPP;
5595
5596 unsigned DC = DppCt->getImm();
5597 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5598 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5599 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5600 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5601 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5602 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5603 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5604 ErrInfo = "Invalid dpp_ctrl value";
5605 return false;
5606 }
5607 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5608 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5609 ErrInfo = "Invalid dpp_ctrl value: "
5610 "wavefront shifts are not supported on GFX10+";
5611 return false;
5612 }
5613 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5614 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5615 ErrInfo = "Invalid dpp_ctrl value: "
5616 "broadcasts are not supported on GFX10+";
5617 return false;
5618 }
5619 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5620 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5621 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5622 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5623 !ST.hasGFX90AInsts()) {
5624 ErrInfo = "Invalid dpp_ctrl value: "
5625 "row_newbroadcast/row_share is not supported before "
5626 "GFX90A/GFX10";
5627 return false;
5628 }
5629 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5630 ErrInfo = "Invalid dpp_ctrl value: "
5631 "row_share and row_xmask are not supported before GFX10";
5632 return false;
5633 }
5634 }
5635
5636 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5638 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5639 ErrInfo = "Invalid dpp_ctrl value: "
5640 "DP ALU dpp only support row_newbcast";
5641 return false;
5642 }
5643 }
5644
5645 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5646 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5647 AMDGPU::OpName DataName =
5648 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5649 const MachineOperand *Data = getNamedOperand(MI, DataName);
5650 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5651 if (Data && !Data->isReg())
5652 Data = nullptr;
5653
5654 if (ST.hasGFX90AInsts()) {
5655 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5656 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5657 ErrInfo = "Invalid register class: "
5658 "vdata and vdst should be both VGPR or AGPR";
5659 return false;
5660 }
5661 if (Data && Data2 &&
5662 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5663 ErrInfo = "Invalid register class: "
5664 "both data operands should be VGPR or AGPR";
5665 return false;
5666 }
5667 } else {
5668 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5669 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5670 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5671 ErrInfo = "Invalid register class: "
5672 "agpr loads and stores not supported on this GPU";
5673 return false;
5674 }
5675 }
5676 }
5677
5678 if (ST.needsAlignedVGPRs()) {
5679 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5680 const MachineOperand *Op = getNamedOperand(MI, OpName);
5681 if (!Op)
5682 return true;
5683 Register Reg = Op->getReg();
5684 if (Reg.isPhysical())
5685 return !(RI.getHWRegIndex(Reg) & 1);
5686 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5687 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5688 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5689 };
5690
5691 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5692 Opcode == AMDGPU::DS_GWS_BARRIER) {
5693
5694 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5695 ErrInfo = "Subtarget requires even aligned vector registers "
5696 "for DS_GWS instructions";
5697 return false;
5698 }
5699 }
5700
5701 if (isMIMG(MI)) {
5702 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5703 ErrInfo = "Subtarget requires even aligned vector registers "
5704 "for vaddr operand of image instructions";
5705 return false;
5706 }
5707 }
5708 }
5709
5710 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5711 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5712 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5713 ErrInfo = "Invalid register class: "
5714 "v_accvgpr_write with an SGPR is not supported on this GPU";
5715 return false;
5716 }
5717 }
5718
5719 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5720 const MachineOperand &SrcOp = MI.getOperand(1);
5721 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5722 ErrInfo = "pseudo expects only physical SGPRs";
5723 return false;
5724 }
5725 }
5726
5727 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5728 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5729 if (!ST.hasScaleOffset()) {
5730 ErrInfo = "Subtarget does not support offset scaling";
5731 return false;
5732 }
5733 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5734 ErrInfo = "Instruction does not support offset scaling";
5735 return false;
5736 }
5737 }
5738 }
5739
5740 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5741 // information.
5742 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5743 for (unsigned I = 0; I < 3; ++I) {
5744 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
5745 return false;
5746 }
5747 }
5748
5749 return true;
5750}
5751
5752// It is more readable to list mapped opcodes on the same line.
5753// clang-format off
5754
5755 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5756 switch (MI.getOpcode()) {
5757 default: return AMDGPU::INSTRUCTION_LIST_END;
5758 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5759 case AMDGPU::COPY: return AMDGPU::COPY;
5760 case AMDGPU::PHI: return AMDGPU::PHI;
5761 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5762 case AMDGPU::WQM: return AMDGPU::WQM;
5763 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5764 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5765 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5766 case AMDGPU::S_MOV_B32: {
5767 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5768 return MI.getOperand(1).isReg() ||
5769 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5770 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5771 }
5772 case AMDGPU::S_ADD_I32:
5773 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5774 case AMDGPU::S_ADDC_U32:
5775 return AMDGPU::V_ADDC_U32_e32;
5776 case AMDGPU::S_SUB_I32:
5777 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5778 // FIXME: These are not consistently handled, and selected when the carry is
5779 // used.
5780 case AMDGPU::S_ADD_U32:
5781 return AMDGPU::V_ADD_CO_U32_e32;
5782 case AMDGPU::S_SUB_U32:
5783 return AMDGPU::V_SUB_CO_U32_e32;
5784 case AMDGPU::S_ADD_U64_PSEUDO:
5785 return AMDGPU::V_ADD_U64_PSEUDO;
5786 case AMDGPU::S_SUB_U64_PSEUDO:
5787 return AMDGPU::V_SUB_U64_PSEUDO;
5788 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5789 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5790 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5791 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5792 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5793 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5794 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5795 case AMDGPU::S_XNOR_B32:
5796 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5797 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5798 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5799 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5800 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5801 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5802 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5803 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5804 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5805 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5806 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5807 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5808 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5809 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5810 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5811 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5812 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5813 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5814 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5815 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5816 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5817 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5818 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5819 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5820 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5821 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5822 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5823 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5824 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5825 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5826 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5827 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5828 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5829 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5830 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5831 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5832 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5833 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5834 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5835 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5836 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5837 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5838 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5839 case AMDGPU::S_CVT_F32_F16:
5840 case AMDGPU::S_CVT_HI_F32_F16:
5841 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5842 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5843 case AMDGPU::S_CVT_F16_F32:
5844 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5845 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5846 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5847 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5848 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5849 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5850 case AMDGPU::S_CEIL_F16:
5851 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5852 : AMDGPU::V_CEIL_F16_fake16_e64;
5853 case AMDGPU::S_FLOOR_F16:
5854 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5855 : AMDGPU::V_FLOOR_F16_fake16_e64;
5856 case AMDGPU::S_TRUNC_F16:
5857 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
5858 : AMDGPU::V_TRUNC_F16_fake16_e64;
5859 case AMDGPU::S_RNDNE_F16:
5860 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
5861 : AMDGPU::V_RNDNE_F16_fake16_e64;
5862 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5863 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5864 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5865 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5866 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5867 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5868 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5869 case AMDGPU::S_ADD_F16:
5870 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
5871 : AMDGPU::V_ADD_F16_fake16_e64;
5872 case AMDGPU::S_SUB_F16:
5873 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
5874 : AMDGPU::V_SUB_F16_fake16_e64;
5875 case AMDGPU::S_MIN_F16:
5876 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
5877 : AMDGPU::V_MIN_F16_fake16_e64;
5878 case AMDGPU::S_MAX_F16:
5879 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
5880 : AMDGPU::V_MAX_F16_fake16_e64;
5881 case AMDGPU::S_MINIMUM_F16:
5882 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
5883 : AMDGPU::V_MINIMUM_F16_fake16_e64;
5884 case AMDGPU::S_MAXIMUM_F16:
5885 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
5886 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
5887 case AMDGPU::S_MUL_F16:
5888 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
5889 : AMDGPU::V_MUL_F16_fake16_e64;
5890 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5891 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5892 case AMDGPU::S_FMAC_F16:
5893 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
5894 : AMDGPU::V_FMAC_F16_fake16_e64;
5895 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5896 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5897 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5898 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5899 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5900 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5901 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5902 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5903 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5904 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5905 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5906 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5907 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5908 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5909 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5910 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5911 case AMDGPU::S_CMP_LT_F16:
5912 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
5913 : AMDGPU::V_CMP_LT_F16_fake16_e64;
5914 case AMDGPU::S_CMP_EQ_F16:
5915 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
5916 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
5917 case AMDGPU::S_CMP_LE_F16:
5918 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
5919 : AMDGPU::V_CMP_LE_F16_fake16_e64;
5920 case AMDGPU::S_CMP_GT_F16:
5921 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
5922 : AMDGPU::V_CMP_GT_F16_fake16_e64;
5923 case AMDGPU::S_CMP_LG_F16:
5924 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
5925 : AMDGPU::V_CMP_LG_F16_fake16_e64;
5926 case AMDGPU::S_CMP_GE_F16:
5927 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
5928 : AMDGPU::V_CMP_GE_F16_fake16_e64;
5929 case AMDGPU::S_CMP_O_F16:
5930 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
5931 : AMDGPU::V_CMP_O_F16_fake16_e64;
5932 case AMDGPU::S_CMP_U_F16:
5933 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
5934 : AMDGPU::V_CMP_U_F16_fake16_e64;
5935 case AMDGPU::S_CMP_NGE_F16:
5936 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
5937 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
5938 case AMDGPU::S_CMP_NLG_F16:
5939 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
5940 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
5941 case AMDGPU::S_CMP_NGT_F16:
5942 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
5943 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
5944 case AMDGPU::S_CMP_NLE_F16:
5945 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
5946 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
5947 case AMDGPU::S_CMP_NEQ_F16:
5948 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
5949 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
5950 case AMDGPU::S_CMP_NLT_F16:
5951 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
5952 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
5953 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5954 case AMDGPU::V_S_EXP_F16_e64:
5955 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
5956 : AMDGPU::V_EXP_F16_fake16_e64;
5957 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5958 case AMDGPU::V_S_LOG_F16_e64:
5959 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
5960 : AMDGPU::V_LOG_F16_fake16_e64;
5961 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5962 case AMDGPU::V_S_RCP_F16_e64:
5963 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
5964 : AMDGPU::V_RCP_F16_fake16_e64;
5965 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5966 case AMDGPU::V_S_RSQ_F16_e64:
5967 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
5968 : AMDGPU::V_RSQ_F16_fake16_e64;
5969 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5970 case AMDGPU::V_S_SQRT_F16_e64:
5971 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
5972 : AMDGPU::V_SQRT_F16_fake16_e64;
5973 }
5975 "Unexpected scalar opcode without corresponding vector one!");
5976}
5977
5978// clang-format on
5979
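// Save the current value of EXEC into Reg and then enable all lanes; when
// SCC is live, two moves are used instead of S_OR_SAVEEXEC so that SCC is
// not clobbered.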
5980 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5981 MachineBasicBlock &MBB,
5982 MachineBasicBlock::iterator MBBI,
5983 const DebugLoc &DL, Register Reg,
5984 bool IsSCCLive,
5985 SlotIndexes *Indexes) const {
5986 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5987 const SIInstrInfo *TII = ST.getInstrInfo();
5989 if (IsSCCLive) {
5990 // Insert two move instructions, one to save the original value of EXEC and
5991 // the other to turn on all bits in EXEC. This is required as we can't use
5992 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
5993 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
5994 .addReg(LMC.ExecReg);
5995 auto FlipExecMI =
5996 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
5997 if (Indexes) {
5998 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5999 Indexes->insertMachineInstrInMaps(*FlipExecMI);
6000 }
6001 } else {
6002 auto SaveExec =
6003 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
6004 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
6005 if (Indexes)
6006 Indexes->insertMachineInstrInMaps(*SaveExec);
6007 }
6008}
6009
6010 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
6011 MachineBasicBlock::iterator MBBI,
6012 const DebugLoc &DL, Register Reg,
6013 SlotIndexes *Indexes) const {
6015 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6016 .addReg(Reg, RegState::Kill);
6017 if (Indexes)
6018 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6019}
6020
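// Find the SI_WHOLE_WAVE_FUNC_SETUP (or its GlobalISel equivalent) in the
// entry block of a whole wave function.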
6024 "Not a whole wave func");
6025 MachineBasicBlock &MBB = *MF.begin();
6026 for (MachineInstr &MI : MBB)
6027 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6028 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6029 return &MI;
6030
6031 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6032}
6033
6034// FIXME: This should not be an overridable function. All subtarget dependent
6035// operand modifications should go through isLookupRegClassByHwMode in the
6036// generic handling.
6037const TargetRegisterClass *
6038SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum,
6039 const TargetRegisterInfo *TRI) const {
6040 if (OpNum >= TID.getNumOperands())
6041 return nullptr;
6042 const MCOperandInfo &OpInfo = TID.operands()[OpNum];
6043 int16_t RegClass = getOpRegClassID(OpInfo);
6044 return RI.getRegClass(RegClass);
6045}
6046
6047 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6048 unsigned OpNo) const {
6049 const MCInstrDesc &Desc = get(MI.getOpcode());
6050 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6051 Desc.operands()[OpNo].RegClass == -1) {
6052 Register Reg = MI.getOperand(OpNo).getReg();
6053
6054 if (Reg.isVirtual()) {
6055 const MachineRegisterInfo &MRI =
6056 MI.getParent()->getParent()->getRegInfo();
6057 return MRI.getRegClass(Reg);
6058 }
6059 return RI.getPhysRegBaseClass(Reg);
6060 }
6061
6062 return RI.getRegClass(getOpRegClassID(Desc.operands()[OpNo]));
6063}
6064
6065 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
6066 MachineBasicBlock::iterator I = MI;
6067 MachineBasicBlock *MBB = MI.getParent();
6068 MachineOperand &MO = MI.getOperand(OpIdx);
6069 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6070 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6071 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6072 unsigned Size = RI.getRegSizeInBits(*RC);
6073 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6074 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6075 : AMDGPU::V_MOV_B32_e32;
6076 if (MO.isReg())
6077 Opcode = AMDGPU::COPY;
6078 else if (RI.isSGPRClass(RC))
6079 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6080
6081 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6082 Register Reg = MRI.createVirtualRegister(VRC);
6083 DebugLoc DL = MBB->findDebugLoc(I);
6084 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6085 MO.ChangeToRegister(Reg, false);
6086}
6087
6088 Register SIInstrInfo::buildExtractSubReg(
6089 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
6090 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6091 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6092 if (!SuperReg.getReg().isVirtual())
6093 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6094
6095 MachineBasicBlock *MBB = MI->getParent();
6096 const DebugLoc &DL = MI->getDebugLoc();
6097 Register SubReg = MRI.createVirtualRegister(SubRC);
6098
6099 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6100 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6101 .addReg(SuperReg.getReg(), 0, NewSubIdx);
6102 return SubReg;
6103}
6104
6105 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
6106 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
6107 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6108 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6109 if (Op.isImm()) {
6110 if (SubIdx == AMDGPU::sub0)
6111 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6112 if (SubIdx == AMDGPU::sub1)
6113 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6114
6115 llvm_unreachable("Unhandled register index for immediate");
6116 }
6117
6118 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6119 SubIdx, SubRC);
6120 return MachineOperand::CreateReg(SubReg, false);
6121}
6122
6123// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6124void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6125 assert(Inst.getNumExplicitOperands() == 3);
6126 MachineOperand Op1 = Inst.getOperand(1);
6127 Inst.removeOperand(1);
6128 Inst.addOperand(Op1);
6129}
6130
6131 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6132 const MCOperandInfo &OpInfo,
6133 const MachineOperand &MO) const {
6134 if (!MO.isReg())
6135 return false;
6136
6137 Register Reg = MO.getReg();
6138
6139 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6140 if (Reg.isPhysical())
6141 return DRC->contains(Reg);
6142
6143 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6144
6145 if (MO.getSubReg()) {
6146 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
6147 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6148 if (!SuperRC)
6149 return false;
6150 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6151 }
6152
6153 return RI.getCommonSubClass(DRC, RC) != nullptr;
6154}
6155
6156 bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6157 const MachineOperand &MO) const {
6158 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6159 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6160 unsigned Opc = MI.getOpcode();
6161
6162 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6163 // information.
6164 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6165 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6166 constexpr const AMDGPU::OpName OpNames[] = {
6167 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6168
6169 for (auto [I, OpName] : enumerate(OpNames)) {
6170 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6171 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6172 !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
6173 return false;
6174 }
6175 }
6176
6177 if (!isLegalRegOperand(MRI, OpInfo, MO))
6178 return false;
6179
6180 // check Accumulate GPR operand
6181 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6182 if (IsAGPR && !ST.hasMAIInsts())
6183 return false;
6184 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6185 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6186 return false;
6187 // Atomics should have both vdst and vdata either vgpr or agpr.
6188 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6189 const int DataIdx = AMDGPU::getNamedOperandIdx(
6190 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6191 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6192 MI.getOperand(DataIdx).isReg() &&
6193 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6194 return false;
6195 if ((int)OpIdx == DataIdx) {
6196 if (VDstIdx != -1 &&
6197 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6198 return false;
6199 // DS instructions with 2 src operands also must have tied RC.
6200 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6201 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6202 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6203 return false;
6204 }
6205
6206 // Check V_ACCVGPR_WRITE_B32_e64
6207 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6208 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6209 RI.isSGPRReg(MRI, MO.getReg()))
6210 return false;
6211 return true;
6212}
6213
6214 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6215 const MCOperandInfo &OpInfo,
6216 const MachineOperand &MO) const {
6217 if (MO.isReg())
6218 return isLegalRegOperand(MRI, OpInfo, MO);
6219
6220 // Handle non-register types that are treated like immediates.
6221 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6222 return true;
6223}
6224
6225 bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6226 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6227 const MachineOperand *MO) const {
6228 constexpr const unsigned NumOps = 3;
6229 constexpr const AMDGPU::OpName OpNames[NumOps * 2] = {
6230 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6231 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6232 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6233
6234 assert(SrcN < NumOps);
6235
6236 if (!MO) {
6237 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6238 if (SrcIdx == -1)
6239 return true;
6240 MO = &MI.getOperand(SrcIdx);
6241 }
6242
6243 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6244 return true;
6245
6246 int ModsIdx =
6247 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6248 if (ModsIdx == -1)
6249 return true;
6250
6251 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6252 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6253 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6254
6255 return !OpSel && !OpSelHi;
6256}
6257
6258 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6259 const MachineOperand *MO) const {
6260 const MachineFunction &MF = *MI.getParent()->getParent();
6261 const MachineRegisterInfo &MRI = MF.getRegInfo();
6262 const MCInstrDesc &InstDesc = MI.getDesc();
6263 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6264 int64_t RegClass = getOpRegClassID(OpInfo);
6265 const TargetRegisterClass *DefinedRC =
6266 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6267 if (!MO)
6268 MO = &MI.getOperand(OpIdx);
6269
6270 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6271
6272 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6273 const MachineOperand *UsedLiteral = nullptr;
6274
6275 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6276 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6277
6278 // TODO: Be more permissive with frame indexes.
6279 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6280 if (!LiteralLimit--)
6281 return false;
6282
6283 UsedLiteral = MO;
6284 }
6285
6286 SmallDenseSet<RegSubRegPair> SGPRsUsed;
6287 if (MO->isReg())
6288 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6289
6290 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6291 if (i == OpIdx)
6292 continue;
6293 const MachineOperand &Op = MI.getOperand(i);
6294 if (Op.isReg()) {
6295 if (Op.isUse()) {
6296 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6297 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6298 if (--ConstantBusLimit <= 0)
6299 return false;
6300 }
6301 }
6302 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6303 !isInlineConstant(Op, InstDesc.operands()[i])) {
6304 // The same literal may be used multiple times.
6305 if (!UsedLiteral)
6306 UsedLiteral = &Op;
6307 else if (UsedLiteral->isIdenticalTo(Op))
6308 continue;
6309
6310 if (!LiteralLimit--)
6311 return false;
6312 if (--ConstantBusLimit <= 0)
6313 return false;
6314 }
6315 }
6316 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6317 // There can be at most one literal operand, but it can be repeated.
6318 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6319 if (i == OpIdx)
6320 continue;
6321 const MachineOperand &Op = MI.getOperand(i);
6322 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6323 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6324 !Op.isIdenticalTo(*MO))
6325 return false;
6326
6327 // Do not fold a non-inlineable and non-register operand into an
6328 // instruction that already has a frame index. The frame index handling
6329 // code cannot cope with a frame index that co-exists with another
6330 // non-register operand, unless that operand is an inlineable immediate.
6331 if (Op.isFI())
6332 return false;
6333 }
6334 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6335 isF16PseudoScalarTrans(MI.getOpcode())) {
6336 return false;
6337 }
6338
6339 if (MO->isReg()) {
6340 if (!DefinedRC)
6341 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6342 return isLegalRegOperand(MI, OpIdx, *MO);
6343 }
6344
6345 if (MO->isImm()) {
6346 uint64_t Imm = MO->getImm();
6347 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6348 bool Is64BitOp = Is64BitFPOp ||
6349 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6350 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6351 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6352 if (Is64BitOp &&
6353 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6354 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6355 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6356 return false;
6357
6358 // FIXME: We can use sign extended 64-bit literals, but only for signed
6359 // operands. At the moment we do not know if an operand is signed.
6360 // Such operand will be encoded as its low 32 bits and then either
6361 // correctly sign extended or incorrectly zero extended by HW.
6362 // If 64-bit literals are supported and the literal will be encoded
6363 // as full 64 bit we still can use it.
6364 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6365 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6366 return false;
6367 }
6368 }
6369
6370 // Handle non-register types that are treated like immediates.
6371 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6372
6373 if (!DefinedRC) {
6374 // This operand expects an immediate.
6375 return true;
6376 }
6377
6378 return isImmOperandLegal(MI, OpIdx, *MO);
6379}
6380
6382 bool IsGFX950Only = ST.hasGFX950Insts();
6383 bool IsGFX940Only = ST.hasGFX940Insts();
6384
6385 if (!IsGFX950Only && !IsGFX940Only)
6386 return false;
6387
6388 if (!isVALU(MI))
6389 return false;
6390
6391 // V_COS, V_EXP, V_RCP, etc.
6392 if (isTRANS(MI))
6393 return true;
6394
6395 // DOT2, DOT2C, DOT4, etc.
6396 if (isDOT(MI))
6397 return true;
6398
6399 // MFMA, SMFMA
6400 if (isMFMA(MI))
6401 return true;
6402
6403 unsigned Opcode = MI.getOpcode();
6404 switch (Opcode) {
6405 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6406 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6407 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6408 case AMDGPU::V_MQSAD_U32_U8_e64:
6409 case AMDGPU::V_PK_ADD_F16:
6410 case AMDGPU::V_PK_ADD_F32:
6411 case AMDGPU::V_PK_ADD_I16:
6412 case AMDGPU::V_PK_ADD_U16:
6413 case AMDGPU::V_PK_ASHRREV_I16:
6414 case AMDGPU::V_PK_FMA_F16:
6415 case AMDGPU::V_PK_FMA_F32:
6416 case AMDGPU::V_PK_FMAC_F16_e32:
6417 case AMDGPU::V_PK_FMAC_F16_e64:
6418 case AMDGPU::V_PK_LSHLREV_B16:
6419 case AMDGPU::V_PK_LSHRREV_B16:
6420 case AMDGPU::V_PK_MAD_I16:
6421 case AMDGPU::V_PK_MAD_U16:
6422 case AMDGPU::V_PK_MAX_F16:
6423 case AMDGPU::V_PK_MAX_I16:
6424 case AMDGPU::V_PK_MAX_U16:
6425 case AMDGPU::V_PK_MIN_F16:
6426 case AMDGPU::V_PK_MIN_I16:
6427 case AMDGPU::V_PK_MIN_U16:
6428 case AMDGPU::V_PK_MOV_B32:
6429 case AMDGPU::V_PK_MUL_F16:
6430 case AMDGPU::V_PK_MUL_F32:
6431 case AMDGPU::V_PK_MUL_LO_U16:
6432 case AMDGPU::V_PK_SUB_I16:
6433 case AMDGPU::V_PK_SUB_U16:
6434 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6435 return true;
6436 default:
6437 return false;
6438 }
6439}
6440
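// Legalize VOP2 operands: insert moves, readfirstlanes or commute so that
// src1 and any implicit SGPR use satisfy the encoding's operand and
// constant bus restrictions.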
6441 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6442 MachineInstr &MI) const {
6443 unsigned Opc = MI.getOpcode();
6444 const MCInstrDesc &InstrDesc = get(Opc);
6445
6446 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6447 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6448
6449 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6450 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6451
6452 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6453 // we need to only have one constant bus use before GFX10.
6454 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6455 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6456 RI.isSGPRReg(MRI, Src0.getReg()))
6457 legalizeOpWithMove(MI, Src0Idx);
6458
6459 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6460 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6461 // src0/src1 with V_READFIRSTLANE.
6462 if (Opc == AMDGPU::V_WRITELANE_B32) {
6463 const DebugLoc &DL = MI.getDebugLoc();
6464 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6465 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6466 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6467 .add(Src0);
6468 Src0.ChangeToRegister(Reg, false);
6469 }
6470 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6471 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6472 const DebugLoc &DL = MI.getDebugLoc();
6473 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6474 .add(Src1);
6475 Src1.ChangeToRegister(Reg, false);
6476 }
6477 return;
6478 }
6479
6480 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6481 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6482 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6483 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6484 legalizeOpWithMove(MI, Src2Idx);
6485 }
6486
6487 // VOP2 src0 instructions support all operand types, so we don't need to check
6488 // their legality. If src1 is already legal, we don't need to do anything.
6489 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6490 return;
6491
6492 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6493 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6494 // select is uniform.
6495 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6496 RI.isVGPR(MRI, Src1.getReg())) {
6497 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6498 const DebugLoc &DL = MI.getDebugLoc();
6499 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6500 .add(Src1);
6501 Src1.ChangeToRegister(Reg, false);
6502 return;
6503 }
6504
6505 // We do not use commuteInstruction here because it is too aggressive and will
6506 // commute if it is possible. We only want to commute here if it improves
6507 // legality. This can be called a fairly large number of times so don't waste
6508 // compile time pointlessly swapping and checking legality again.
6509 if (HasImplicitSGPR || !MI.isCommutable()) {
6510 legalizeOpWithMove(MI, Src1Idx);
6511 return;
6512 }
6513
6514 // If src0 can be used as src1, commuting will make the operands legal.
6515 // Otherwise we have to give up and insert a move.
6516 //
6517 // TODO: Other immediate-like operand kinds could be commuted if there was a
6518 // MachineOperand::ChangeTo* for them.
6519 if ((!Src1.isImm() && !Src1.isReg()) ||
6520 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6521 legalizeOpWithMove(MI, Src1Idx);
6522 return;
6523 }
6524
6525 int CommutedOpc = commuteOpcode(MI);
6526 if (CommutedOpc == -1) {
6527 legalizeOpWithMove(MI, Src1Idx);
6528 return;
6529 }
6530
6531 MI.setDesc(get(CommutedOpc));
6532
6533 Register Src0Reg = Src0.getReg();
6534 unsigned Src0SubReg = Src0.getSubReg();
6535 bool Src0Kill = Src0.isKill();
6536
6537 if (Src1.isImm())
6538 Src0.ChangeToImmediate(Src1.getImm());
6539 else if (Src1.isReg()) {
6540 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6541 Src0.setSubReg(Src1.getSubReg());
6542 } else
6543 llvm_unreachable("Should only have register or immediate operands");
6544
6545 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6546 Src1.setSubReg(Src0SubReg);
6548}
6549
6550// Legalize VOP3 operands. All operand types are supported for any operand
6551// but only one literal constant and only starting from GFX10.
6552 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6553 MachineInstr &MI) const {
6554 unsigned Opc = MI.getOpcode();
6555
6556 int VOP3Idx[3] = {
6557 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6558 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6559 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6560 };
6561
6562 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6563 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6564 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6565 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6566 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6567 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6568 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6569 // src1 and src2 must be scalar
6570 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6571 const DebugLoc &DL = MI.getDebugLoc();
6572 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6573 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6574 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6575 .add(Src1);
6576 Src1.ChangeToRegister(Reg, false);
6577 }
6578 if (VOP3Idx[2] != -1) {
6579 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6580 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6581 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6582 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6583 .add(Src2);
6584 Src2.ChangeToRegister(Reg, false);
6585 }
6586 }
6587 }
6588
6589 // Find the one SGPR operand we are allowed to use.
6590 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6591 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6592 SmallDenseSet<unsigned> SGPRsUsed;
6593 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6594 if (SGPRReg) {
6595 SGPRsUsed.insert(SGPRReg);
6596 --ConstantBusLimit;
6597 }
6598
6599 for (int Idx : VOP3Idx) {
6600 if (Idx == -1)
6601 break;
6602 MachineOperand &MO = MI.getOperand(Idx);
6603
6604 if (!MO.isReg()) {
6605 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6606 continue;
6607
6608 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6609 --LiteralLimit;
6610 --ConstantBusLimit;
6611 continue;
6612 }
6613
6614 --LiteralLimit;
6615 --ConstantBusLimit;
6616 legalizeOpWithMove(MI, Idx);
6617 continue;
6618 }
6619
6620 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6621 continue; // VGPRs are legal
6622
6623 // We can use one SGPR in each VOP3 instruction prior to GFX10
6624 // and two starting from GFX10.
6625 if (SGPRsUsed.count(MO.getReg()))
6626 continue;
6627 if (ConstantBusLimit > 0) {
6628 SGPRsUsed.insert(MO.getReg());
6629 --ConstantBusLimit;
6630 continue;
6631 }
6632
6633 // If we make it this far, then the operand is not legal and we must
6634 // legalize it.
6635 legalizeOpWithMove(MI, Idx);
6636 }
6637
6638 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6639 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6640 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6641 legalizeOpWithMove(MI, VOP3Idx[2]);
6642
6643 // Fix the register class of packed FP32 instructions on gfx12+. See
6644 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6645 if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
6646 for (unsigned I = 0; I < 3; ++I) {
6647 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
6648 legalizeOpWithMove(MI, VOP3Idx[I]);
6649 }
6650 }
6651}
6652
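// Produce an SGPR-class copy of SrcReg by reading the first lane of each
// 32-bit component with V_READFIRSTLANE_B32 and recombining the results
// with a REG_SEQUENCE.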
6653 Register SIInstrInfo::readlaneVGPRToSGPR(
6654 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6655 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6656 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6657 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6658 if (DstRC)
6659 SRC = RI.getCommonSubClass(SRC, DstRC);
6660
6661 Register DstReg = MRI.createVirtualRegister(SRC);
6662 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6663
6664 if (RI.hasAGPRs(VRC)) {
6665 VRC = RI.getEquivalentVGPRClass(VRC);
6666 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6667 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6668 get(TargetOpcode::COPY), NewSrcReg)
6669 .addReg(SrcReg);
6670 SrcReg = NewSrcReg;
6671 }
6672
6673 if (SubRegs == 1) {
6674 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6675 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6676 .addReg(SrcReg);
6677 return DstReg;
6678 }
6679
6680 SmallVector<Register, 8> SRegs;
6681 for (unsigned i = 0; i < SubRegs; ++i) {
6682 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6683 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6684 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6685 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6686 SRegs.push_back(SGPR);
6687 }
6688
6689 MachineInstrBuilder MIB =
6690 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6691 get(AMDGPU::REG_SEQUENCE), DstReg);
6692 for (unsigned i = 0; i < SubRegs; ++i) {
6693 MIB.addReg(SRegs[i]);
6694 MIB.addImm(RI.getSubRegFromChannel(i));
6695 }
6696 return DstReg;
6697}
6698
6699 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6700 MachineInstr &MI) const {
6701
6702 // If the pointer is stored in VGPRs, then we need to move them to
6703 // SGPRs using v_readfirstlane. This is safe because we only select
6704 // loads with uniform pointers to SMRD instruction so we know the
6705 // pointer value is uniform.
6706 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6707 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6708 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6709 SBase->setReg(SGPR);
6710 }
6711 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6712 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6713 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6714 SOff->setReg(SGPR);
6715 }
6716}
6717
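// Try to rewrite a FLAT instruction whose saddr operand actually holds a
// VGPR into the corresponding vaddr form, moving the address from saddr to
// vaddr. Returns true if the instruction was rewritten.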
6718 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6719 unsigned Opc = Inst.getOpcode();
6720 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6721 if (OldSAddrIdx < 0)
6722 return false;
6723
6724 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6725
6726 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6727 if (NewOpc < 0)
6728 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6729 if (NewOpc < 0)
6730 return false;
6731
6732 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6733 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6734 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6735 return false;
6736
6737 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6738 if (NewVAddrIdx < 0)
6739 return false;
6740
6741 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6742
6743 // Check vaddr, it shall be zero or absent.
6744 MachineInstr *VAddrDef = nullptr;
6745 if (OldVAddrIdx >= 0) {
6746 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6747 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6748 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6749 !VAddrDef->getOperand(1).isImm() ||
6750 VAddrDef->getOperand(1).getImm() != 0)
6751 return false;
6752 }
6753
6754 const MCInstrDesc &NewDesc = get(NewOpc);
6755 Inst.setDesc(NewDesc);
6756
6757 // Callers expect iterator to be valid after this call, so modify the
6758 // instruction in place.
6759 if (OldVAddrIdx == NewVAddrIdx) {
6760 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6761 // Clear use list from the old vaddr holding a zero register.
6762 MRI.removeRegOperandFromUseList(&NewVAddr);
6763 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6764 Inst.removeOperand(OldSAddrIdx);
6765 // Update the use list with the pointer we have just moved from vaddr to
6766 // saddr position. Otherwise new vaddr will be missing from the use list.
6767 MRI.removeRegOperandFromUseList(&NewVAddr);
6768 MRI.addRegOperandToUseList(&NewVAddr);
6769 } else {
6770 assert(OldSAddrIdx == NewVAddrIdx);
6771
6772 if (OldVAddrIdx >= 0) {
6773 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6774 AMDGPU::OpName::vdst_in);
6775
6776 // removeOperand doesn't try to fixup tied operand indexes as it goes, so
6777 // it asserts. Untie the operands for now and retie them afterwards.
6778 if (NewVDstIn != -1) {
6779 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6780 Inst.untieRegOperand(OldVDstIn);
6781 }
6782
6783 Inst.removeOperand(OldVAddrIdx);
6784
6785 if (NewVDstIn != -1) {
6786 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6787 Inst.tieOperands(NewVDst, NewVDstIn);
6788 }
6789 }
6790 }
6791
6792 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6793 VAddrDef->eraseFromParent();
6794
6795 return true;
6796}
6797
6798// FIXME: Remove this when SelectionDAG is obsoleted.
6799 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6800 MachineInstr &MI) const {
6801 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
6802 return;
6803
6804 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
6805 // thinks they are uniform, so a readfirstlane should be valid.
6806 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6807 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6808 return;
6809
6810 if (moveFlatAddrToVGPR(MI))
6811 return;
6812
6813 const TargetRegisterClass *DeclaredRC =
6814 getRegClass(MI.getDesc(), SAddr->getOperandNo(), &RI);
6815
6816 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6817 SAddr->setReg(ToSGPR);
6818}
6819
6820 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6821 MachineBasicBlock::iterator I,
6822 const TargetRegisterClass *DstRC,
6823 MachineOperand &Op,
6824 MachineRegisterInfo &MRI,
6825 const DebugLoc &DL) const {
6826 Register OpReg = Op.getReg();
6827 unsigned OpSubReg = Op.getSubReg();
6828
6829 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6830 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6831
6832 // Check if operand is already the correct register class.
6833 if (DstRC == OpRC)
6834 return;
6835
6836 Register DstReg = MRI.createVirtualRegister(DstRC);
6837 auto Copy =
6838 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6839 Op.setReg(DstReg);
6840
6841 MachineInstr *Def = MRI.getVRegDef(OpReg);
6842 if (!Def)
6843 return;
6844
6845 // Try to eliminate the copy if it is copying an immediate value.
6846 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6847 foldImmediate(*Copy, *Def, OpReg, &MRI);
6848
6849 bool ImpDef = Def->isImplicitDef();
6850 while (!ImpDef && Def && Def->isCopy()) {
6851 if (Def->getOperand(1).getReg().isPhysical())
6852 break;
6853 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6854 ImpDef = Def && Def->isImplicitDef();
6855 }
6856 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6857 !ImpDef)
6858 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6859}
6860
6861 // Emit the actual waterfall loop, executing the wrapped instruction for each
6862 // unique value of \p ScalarOps across all lanes. In the best case we execute
6863 // one iteration; in the worst case once per active lane (up to the wave size).
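// Roughly, for each scalar operand the generated sequence looks like this
// (illustrative wave64 sketch, not the exact emitted MIR):
//
//   LoopBB:
//     s_val = v_readfirstlane_b32 v_op        ; pick one lane's value
//     cond  = v_cmp_eq_*          s_val, v_op ; lanes that have that value
//     exec, save = s_and_saveexec_b64 cond    ; restrict EXEC to those lanes
//   BodyBB:
//     <wrapped instruction, now using s_val>
//     exec = s_xor_b64 exec, save             ; clear the lanes just handled
//     SI_WATERFALL_LOOP LoopBB                ; loop until no lanes remain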
6864static void
6865 emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
6866                               MachineRegisterInfo &MRI,
6867                               MachineBasicBlock &LoopBB,
6868 MachineBasicBlock &BodyBB,
6869 const DebugLoc &DL,
6870 ArrayRef<MachineOperand *> ScalarOps) {
6871 MachineFunction &MF = *LoopBB.getParent();
6872 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6873 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6875 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6876
6877 MachineBasicBlock::iterator I = LoopBB.begin();
6878 Register CondReg;
6879
6880 for (MachineOperand *ScalarOp : ScalarOps) {
6881 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6882 unsigned NumSubRegs = RegSize / 32;
6883 Register VScalarOp = ScalarOp->getReg();
6884
6885 if (NumSubRegs == 1) {
6886 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6887
6888 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6889 .addReg(VScalarOp);
6890
6891 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6892
6893 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6894 .addReg(CurReg)
6895 .addReg(VScalarOp);
6896
6897 // Combine the comparison results with AND.
6898 if (!CondReg) // First.
6899 CondReg = NewCondReg;
6900 else { // If not the first, we create an AND.
6901 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6902 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
6903 .addReg(CondReg)
6904 .addReg(NewCondReg);
6905 CondReg = AndReg;
6906 }
6907
6908 // Update ScalarOp operand to use the SGPR ScalarOp.
6909 ScalarOp->setReg(CurReg);
6910 ScalarOp->setIsKill();
6911 } else {
6912 SmallVector<Register, 8> ReadlanePieces;
6913 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6914 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6915 "Unhandled register size");
6916
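// Scalar operands wider than 32 bits are read back 64 bits at a time: each
// pair of readfirstlanes is packed into an SGPR pair and compared against the
// matching 64-bit slice of the VGPR operand; the per-slice results are ANDed
// into CondReg.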
6917 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6918 Register CurRegLo =
6919 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6920 Register CurRegHi =
6921 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6922
6923 // Read the next variant <- also loop target.
6924 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6925 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6926
6927 // Read the next variant <- also loop target.
6928 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6929 .addReg(VScalarOp, VScalarOpUndef,
6930 TRI->getSubRegFromChannel(Idx + 1));
6931
6932 ReadlanePieces.push_back(CurRegLo);
6933 ReadlanePieces.push_back(CurRegHi);
6934
6935 // Comparison is to be done as 64-bit.
6936 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6937 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6938 .addReg(CurRegLo)
6939 .addImm(AMDGPU::sub0)
6940 .addReg(CurRegHi)
6941 .addImm(AMDGPU::sub1);
6942
6943 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6944 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6945 NewCondReg)
6946 .addReg(CurReg);
6947 if (NumSubRegs <= 2)
6948 Cmp.addReg(VScalarOp);
6949 else
6950 Cmp.addReg(VScalarOp, VScalarOpUndef,
6951 TRI->getSubRegFromChannel(Idx, 2));
6952
6953 // Combine the comparison results with AND.
6954 if (!CondReg) // First.
6955 CondReg = NewCondReg;
6956 else { // If not the first, we create an AND.
6957 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6958 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
6959 .addReg(CondReg)
6960 .addReg(NewCondReg);
6961 CondReg = AndReg;
6962 }
6963 } // End for loop.
6964
6965 const auto *SScalarOpRC =
6966 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6967 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6968
6969 // Build scalar ScalarOp.
6970 auto Merge =
6971 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6972 unsigned Channel = 0;
6973 for (Register Piece : ReadlanePieces) {
6974 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6975 }
6976
6977 // Update ScalarOp operand to use the SGPR ScalarOp.
6978 ScalarOp->setReg(SScalarOp);
6979 ScalarOp->setIsKill();
6980 }
6981 }
6982
6983 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6984 MRI.setSimpleHint(SaveExec, CondReg);
6985
6986 // Update EXEC to matching lanes, saving original to SaveExec.
6987 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
6988 .addReg(CondReg, RegState::Kill);
6989
6990 // The original instruction is here; we insert the terminators after it.
6991 I = BodyBB.end();
6992
6993 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6994 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
6995 .addReg(LMC.ExecReg)
6996 .addReg(SaveExec);
6997
6998 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6999}
7000
7001// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
7002// with SGPRs by iterating over all unique values across all lanes.
7003// Returns the loop basic block that now contains \p MI.
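// The original block is split: LoopBB re-reads the scalar operands, BodyBB
// receives the range [\p Begin, \p End) containing \p MI, and a remainder
// block receives everything after it. EXEC, and SCC if it is live, are saved
// before the loop and restored in the remainder block.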
7004static MachineBasicBlock *
7005 loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
7006                                ArrayRef<MachineOperand *> ScalarOps,
7007                                MachineDominatorTree *MDT,
7008                                MachineBasicBlock::iterator Begin = nullptr,
7009 MachineBasicBlock::iterator End = nullptr) {
7010 MachineBasicBlock &MBB = *MI.getParent();
7011 MachineFunction &MF = *MBB.getParent();
7012 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7013 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7014 MachineRegisterInfo &MRI = MF.getRegInfo();
7015 if (!Begin.isValid())
7016 Begin = &MI;
7017 if (!End.isValid()) {
7018 End = &MI;
7019 ++End;
7020 }
7021 const DebugLoc &DL = MI.getDebugLoc();
7023 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7024
7025 // Save SCC. Waterfall Loop may overwrite SCC.
7026 Register SaveSCCReg;
7027
7028 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7029 // rather than doing an unbounded scan everywhere.
7030 bool SCCNotDead =
7031 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7032 std::numeric_limits<unsigned>::max()) !=
7033 MachineBasicBlock::LQR_Dead;
7034 if (SCCNotDead) {
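// S_CSELECT_B32 1, 0 materializes the current SCC value into an SGPR
// (1 if SCC is set, 0 otherwise); it is turned back into SCC after the
// loop with an S_CMP_LG_U32 against 0.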
7035 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7036 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7037 .addImm(1)
7038 .addImm(0);
7039 }
7040
7041 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7042
7043 // Save the EXEC mask
7044 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7045
7046 // Killed uses in the instruction we are waterfalling around will be
7047 // incorrect due to the added control-flow.
7048 MachineBasicBlock::iterator AfterMI = MI;
7049 ++AfterMI;
7050 for (auto I = Begin; I != AfterMI; I++) {
7051 for (auto &MO : I->all_uses())
7052 MRI.clearKillFlags(MO.getReg());
7053 }
7054
7055 // To insert the loop we need to split the block. Move everything after this
7056 // point to a new block, and insert a new empty block between the two.
7057 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
7058 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
7059 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7060 MachineFunction::iterator MBBI(MBB);
7061 ++MBBI;
7062
7063 MF.insert(MBBI, LoopBB);
7064 MF.insert(MBBI, BodyBB);
7065 MF.insert(MBBI, RemainderBB);
7066
7067 LoopBB->addSuccessor(BodyBB);
7068 BodyBB->addSuccessor(LoopBB);
7069 BodyBB->addSuccessor(RemainderBB);
7070
7071 // Move the range [Begin, End), which contains MI, into BodyBB, and the
7072 // remainder of the block into RemainderBB.
7073 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7074 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7075 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7076
7077 MBB.addSuccessor(LoopBB);
7078
7079 // Update dominators. We know that MBB immediately dominates LoopBB, that
7080 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7081 // RemainderBB. RemainderBB immediately dominates all of the successors
7082 // transferred to it from MBB that MBB used to properly dominate.
7083 if (MDT) {
7084 MDT->addNewBlock(LoopBB, &MBB);
7085 MDT->addNewBlock(BodyBB, LoopBB);
7086 MDT->addNewBlock(RemainderBB, BodyBB);
7087 for (auto &Succ : RemainderBB->successors()) {
7088 if (MDT->properlyDominates(&MBB, Succ)) {
7089 MDT->changeImmediateDominator(Succ, RemainderBB);
7090 }
7091 }
7092 }
7093
7094 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
7095
7096 MachineBasicBlock::iterator First = RemainderBB->begin();
7097 // Restore SCC
7098 if (SCCNotDead) {
7099 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7100 .addReg(SaveSCCReg, RegState::Kill)
7101 .addImm(0);
7102 }
7103
7104 // Restore the EXEC mask
7105 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7106 .addReg(SaveExec);
7107 return BodyBB;
7108}
7109
7110// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
7111static std::tuple<unsigned, unsigned>
7113 MachineBasicBlock &MBB = *MI.getParent();
7114 MachineFunction &MF = *MBB.getParent();
7115 MachineRegisterInfo &MRI = MF.getRegInfo();
7116
7117 // Extract the ptr from the resource descriptor.
7118 unsigned RsrcPtr =
7119 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7120 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7121
7122 // Create an empty resource descriptor
7123 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7124 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7125 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7126 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7127 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7128
7129 // Zero64 = 0
7130 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7131 .addImm(0);
7132
7133 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7134 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7135 .addImm(Lo_32(RsrcDataFormat));
7136
7137 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7138 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7139 .addImm(Hi_32(RsrcDataFormat));
7140
7141 // NewSRsrc = {Zero64, SRsrcFormat}
7142 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7143 .addReg(Zero64)
7144 .addImm(AMDGPU::sub0_sub1)
7145 .addReg(SRsrcFormatLo)
7146 .addImm(AMDGPU::sub2)
7147 .addReg(SRsrcFormatHi)
7148 .addImm(AMDGPU::sub3);
7149
7150 return std::tuple(RsrcPtr, NewSRsrc);
7151}
7152
7153 MachineBasicBlock *
7154 SIInstrInfo::legalizeOperands(MachineInstr &MI,
7155                               MachineDominatorTree *MDT) const {
7156 MachineFunction &MF = *MI.getParent()->getParent();
7157 MachineRegisterInfo &MRI = MF.getRegInfo();
7158 MachineBasicBlock *CreatedBB = nullptr;
7159
7160 // Legalize VOP2
7161 if (isVOP2(MI) || isVOPC(MI)) {
7162 legalizeOperandsVOP2(MRI, MI);
7163 return CreatedBB;
7164 }
7165
7166 // Legalize VOP3
7167 if (isVOP3(MI)) {
7168 legalizeOperandsVOP3(MRI, MI);
7169 return CreatedBB;
7170 }
7171
7172 // Legalize SMRD
7173 if (isSMRD(MI)) {
7174 legalizeOperandsSMRD(MRI, MI);
7175 return CreatedBB;
7176 }
7177
7178 // Legalize FLAT
7179 if (isFLAT(MI)) {
7180 legalizeOperandsFLAT(MRI, MI);
7181 return CreatedBB;
7182 }
7183
7184 // Legalize REG_SEQUENCE and PHI
7185 // The register class of the operands must be the same type as the register
7186 // class of the output.
7187 if (MI.getOpcode() == AMDGPU::PHI) {
7188 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
7189 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
7190 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
7191 continue;
7192 const TargetRegisterClass *OpRC =
7193 MRI.getRegClass(MI.getOperand(i).getReg());
7194 if (RI.hasVectorRegisters(OpRC)) {
7195 VRC = OpRC;
7196 } else {
7197 SRC = OpRC;
7198 }
7199 }
7200
7201 // If any of the operands are VGPR registers, then they all must be VGPRs;
7202 // otherwise we will create illegal VGPR->SGPR copies when legalizing
7203 // them.
7204 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
7205 if (!VRC) {
7206 assert(SRC);
7207 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
7208 VRC = &AMDGPU::VReg_1RegClass;
7209 } else
7210 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7211 ? RI.getEquivalentAGPRClass(SRC)
7212 : RI.getEquivalentVGPRClass(SRC);
7213 } else {
7214 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7215 ? RI.getEquivalentAGPRClass(VRC)
7216 : RI.getEquivalentVGPRClass(VRC);
7217 }
7218 RC = VRC;
7219 } else {
7220 RC = SRC;
7221 }
7222
7223 // Update all the operands so they have the same type.
7224 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7225 MachineOperand &Op = MI.getOperand(I);
7226 if (!Op.isReg() || !Op.getReg().isVirtual())
7227 continue;
7228
7229 // MI is a PHI instruction.
7230 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7231 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
7232
7233 // Avoid creating no-op copies with the same src and dst reg class. These
7234 // confuse some of the machine passes.
7235 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
7236 }
7237 }
7238
7239 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7240 // VGPR dest type and SGPR sources, insert copies so all operands are
7241 // VGPRs. This seems to help operand folding / the register coalescer.
7242 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7243 MachineBasicBlock *MBB = MI.getParent();
7244 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7245 if (RI.hasVGPRs(DstRC)) {
7246 // Update all the operands so they are VGPR register classes. These may
7247 // not be the same register class because REG_SEQUENCE supports mixing
7248 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7249 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7250 MachineOperand &Op = MI.getOperand(I);
7251 if (!Op.isReg() || !Op.getReg().isVirtual())
7252 continue;
7253
7254 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7255 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7256 if (VRC == OpRC)
7257 continue;
7258
7259 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7260 Op.setIsKill();
7261 }
7262 }
7263
7264 return CreatedBB;
7265 }
7266
7267 // Legalize INSERT_SUBREG
7268 // src0 must have the same register class as dst
7269 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7270 Register Dst = MI.getOperand(0).getReg();
7271 Register Src0 = MI.getOperand(1).getReg();
7272 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7273 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7274 if (DstRC != Src0RC) {
7275 MachineBasicBlock *MBB = MI.getParent();
7276 MachineOperand &Op = MI.getOperand(1);
7277 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7278 }
7279 return CreatedBB;
7280 }
7281
7282 // Legalize SI_INIT_M0
7283 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7284 MachineOperand &Src = MI.getOperand(0);
7285 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7286 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7287 return CreatedBB;
7288 }
7289
7290 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7291 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7292 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7293 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7294 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7295 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7296 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7297 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7298 MachineOperand &Src = MI.getOperand(1);
7299 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7300 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7301 return CreatedBB;
7302 }
7303
7304 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7305 //
7306 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7307 // scratch memory access. In both cases, the legalization never involves
7308 // conversion to the addr64 form.
7309 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
7310                     (isMUBUF(MI) || isMTBUF(MI)))) {
7311 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7312 ? AMDGPU::OpName::rsrc
7313 : AMDGPU::OpName::srsrc;
7314 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7315 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7316 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7317
7318 AMDGPU::OpName SampOpName =
7319 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7320 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7321 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7322 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7323
7324 return CreatedBB;
7325 }
7326
7327 // Legalize SI_CALL
7328 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7329 MachineOperand *Dest = &MI.getOperand(0);
7330 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7331 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and the
7332 // following copies into the loop block; copies from and to physical
7333 // registers also need to be moved into it.
7334 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7335 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7336
7337 // Also move the copies to physical registers into the loop block
7338 MachineBasicBlock &MBB = *MI.getParent();
7339 MachineBasicBlock::iterator Start(&MI);
7340 while (Start->getOpcode() != FrameSetupOpcode)
7341 --Start;
7342 MachineBasicBlock::iterator End(&MI);
7343 while (End->getOpcode() != FrameDestroyOpcode)
7344 ++End;
7345 // Also include following copies of the return value
7346 ++End;
7347 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7348 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7349 ++End;
7350 CreatedBB =
7351 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7352 }
7353 }
7354
7355 // Legalize s_sleep_var.
7356 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7357 const DebugLoc &DL = MI.getDebugLoc();
7358 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7359 int Src0Idx =
7360 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7361 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7362 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7363 .add(Src0);
7364 Src0.ChangeToRegister(Reg, false);
7365 return nullptr;
7366 }
7367
7368 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7369 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7370 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7371 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7372 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7373 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7374 for (MachineOperand &Src : MI.explicit_operands()) {
7375 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7376 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7377 }
7378 return CreatedBB;
7379 }
7380
7381 // Legalize MUBUF instructions.
7382 bool isSoffsetLegal = true;
7383 int SoffsetIdx =
7384 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7385 if (SoffsetIdx != -1) {
7386 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7387 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7388 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7389 isSoffsetLegal = false;
7390 }
7391 }
7392
7393 bool isRsrcLegal = true;
7394 int RsrcIdx =
7395 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7396 if (RsrcIdx != -1) {
7397 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7398 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7399 isRsrcLegal = false;
7400 }
7401
7402 // The operands are legal.
7403 if (isRsrcLegal && isSoffsetLegal)
7404 return CreatedBB;
7405
7406 if (!isRsrcLegal) {
7407 // Legalize a VGPR Rsrc
7408 //
7409 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7410 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7411 // a zero-value SRsrc.
7412 //
7413 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7414 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7415 // above.
7416 //
7417 // Otherwise we are on non-ADDR64 hardware, and/or we have
7418 // idxen/offen/bothen and we fall back to a waterfall loop.
7419
7420 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7421 MachineBasicBlock &MBB = *MI.getParent();
7422
7423 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7424 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7425 // This is already an ADDR64 instruction so we need to add the pointer
7426 // extracted from the resource descriptor to the current value of VAddr.
7427 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7428 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7429 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7430
7431 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7432 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7433 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7434
7435 unsigned RsrcPtr, NewSRsrc;
7436 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7437
7438 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7439 const DebugLoc &DL = MI.getDebugLoc();
7440 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7441 .addDef(CondReg0)
7442 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7443 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
7444 .addImm(0);
7445
7446 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7447 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7448 .addDef(CondReg1, RegState::Dead)
7449 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7450 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
7451 .addReg(CondReg0, RegState::Kill)
7452 .addImm(0);
7453
7454 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7455 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7456 .addReg(NewVAddrLo)
7457 .addImm(AMDGPU::sub0)
7458 .addReg(NewVAddrHi)
7459 .addImm(AMDGPU::sub1);
7460
7461 VAddr->setReg(NewVAddr);
7462 Rsrc->setReg(NewSRsrc);
7463 } else if (!VAddr && ST.hasAddr64()) {
7464 // This instruction is the _OFFSET variant, so we need to convert it to
7465 // ADDR64.
7466 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7467 "FIXME: Need to emit flat atomics here");
7468
7469 unsigned RsrcPtr, NewSRsrc;
7470 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7471
7472 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7473 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7474 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7475 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7476 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7477
7478 // Atomics with return have an additional tied operand and are
7479 // missing some of the special bits.
7480 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7481 MachineInstr *Addr64;
7482
7483 if (!VDataIn) {
7484 // Regular buffer load / store.
7485 MachineInstrBuilder MIB =
7486     BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7487 .add(*VData)
7488 .addReg(NewVAddr)
7489 .addReg(NewSRsrc)
7490 .add(*SOffset)
7491 .add(*Offset);
7492
7493 if (const MachineOperand *CPol =
7494 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7495 MIB.addImm(CPol->getImm());
7496 }
7497
7498 if (const MachineOperand *TFE =
7499 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7500 MIB.addImm(TFE->getImm());
7501 }
7502
7503 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7504
7505 MIB.cloneMemRefs(MI);
7506 Addr64 = MIB;
7507 } else {
7508 // Atomics with return.
7509 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7510 .add(*VData)
7511 .add(*VDataIn)
7512 .addReg(NewVAddr)
7513 .addReg(NewSRsrc)
7514 .add(*SOffset)
7515 .add(*Offset)
7516 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7517 .cloneMemRefs(MI);
7518 }
7519
7520 MI.removeFromParent();
7521
7522 // NewVaddr = {RsrcPtr:sub0, RsrcPtr:sub1}
7523 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7524 NewVAddr)
7525 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7526 .addImm(AMDGPU::sub0)
7527 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7528 .addImm(AMDGPU::sub1);
7529 } else {
7530 // Legalize a VGPR Rsrc and soffset together.
7531 if (!isSoffsetLegal) {
7532 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7533 CreatedBB =
7534 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7535 return CreatedBB;
7536 }
7537 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7538 return CreatedBB;
7539 }
7540 }
7541
7542 // Legalize a VGPR soffset.
7543 if (!isSoffsetLegal) {
7544 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7545 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7546 return CreatedBB;
7547 }
7548 return CreatedBB;
7549}
7550
7551 void SIInstrWorklist::insert(MachineInstr *MI) {
7552 InstrList.insert(MI);
7553 // Add MBUF instructions to the deferred list.
7554 int RsrcIdx =
7555 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7556 if (RsrcIdx != -1) {
7557 DeferredList.insert(MI);
7558 }
7559}
7560
7561 bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7562 return DeferredList.contains(MI);
7563}
7564
7565 // Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7566 // lowering (changing sgpr to vgpr).
7567 // This is mainly caused by 16-bit SALU and 16-bit VALU using registers of
7568 // different sizes. We need to legalize the operand sizes during the vgpr
7569 // lowering chain. This can be removed once sgpr16 is in place.
7570 void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
7571                                           MachineRegisterInfo &MRI) const {
7572 if (!ST.useRealTrue16Insts())
7573 return;
7574
7575 unsigned Opcode = MI.getOpcode();
7576 MachineBasicBlock *MBB = MI.getParent();
7577 // Legalize operands and check for size mismatch
7578 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7579 OpIdx >= get(Opcode).getNumOperands() ||
7580 get(Opcode).operands()[OpIdx].RegClass == -1)
7581 return;
7582
7583 MachineOperand &Op = MI.getOperand(OpIdx);
7584 if (!Op.isReg() || !Op.getReg().isVirtual())
7585 return;
7586
7587 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7588 if (!RI.isVGPRClass(CurrRC))
7589 return;
7590
7591 int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
7592 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7593 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7594 Op.setSubReg(AMDGPU::lo16);
7595 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7596 const DebugLoc &DL = MI.getDebugLoc();
7597 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7598 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7599 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7600 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7601 .addReg(Op.getReg())
7602 .addImm(AMDGPU::lo16)
7603 .addReg(Undef)
7604 .addImm(AMDGPU::hi16);
7605 Op.setReg(NewDstReg);
7606 }
7607}
7608 void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7609                                           MachineRegisterInfo &MRI) const {
7610 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7612}
7613
7614 void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7615                              MachineDominatorTree *MDT) const {
7616
7617 while (!Worklist.empty()) {
7618 MachineInstr &Inst = *Worklist.top();
7619 Worklist.erase_top();
7620 // Skip MachineInstr in the deferred list.
7621 if (Worklist.isDeferred(&Inst))
7622 continue;
7623 moveToVALUImpl(Worklist, MDT, Inst);
7624 }
7625
7626 // The deferred list of instructions will be processed once all the
7627 // MachineInstrs in the worklist are done.
7628 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7629 moveToVALUImpl(Worklist, MDT, *Inst);
7630 assert(Worklist.empty() &&
7631 "Deferred MachineInstr are not supposed to re-populate worklist");
7632 }
7633}
7634
7635 void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7636                                  MachineDominatorTree *MDT,
7637                                  MachineInstr &Inst) const {
7638
7639 MachineBasicBlock *MBB = Inst.getParent();
7640 if (!MBB)
7641 return;
7642 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7643 unsigned Opcode = Inst.getOpcode();
7644 unsigned NewOpcode = getVALUOp(Inst);
7645 // Handle some special cases
7646 switch (Opcode) {
7647 default:
7648 break;
7649 case AMDGPU::S_ADD_I32:
7650 case AMDGPU::S_SUB_I32: {
7651 // FIXME: The u32 versions currently selected use the carry.
7652 bool Changed;
7653 MachineBasicBlock *CreatedBBTmp = nullptr;
7654 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7655 if (Changed)
7656 return;
7657
7658 // Default handling
7659 break;
7660 }
7661
7662 case AMDGPU::S_MUL_U64:
7663 if (ST.hasVectorMulU64()) {
7664 NewOpcode = AMDGPU::V_MUL_U64_e64;
7665 break;
7666 }
7667 // Split s_mul_u64 into 32-bit vector multiplications.
7668 splitScalarSMulU64(Worklist, Inst, MDT);
7669 Inst.eraseFromParent();
7670 return;
7671
7672 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7673 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7674 // This is a special case of s_mul_u64 where all the operands are either
7675 // zero extended or sign extended.
7676 splitScalarSMulPseudo(Worklist, Inst, MDT);
7677 Inst.eraseFromParent();
7678 return;
7679
7680 case AMDGPU::S_AND_B64:
7681 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7682 Inst.eraseFromParent();
7683 return;
7684
7685 case AMDGPU::S_OR_B64:
7686 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7687 Inst.eraseFromParent();
7688 return;
7689
7690 case AMDGPU::S_XOR_B64:
7691 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7692 Inst.eraseFromParent();
7693 return;
7694
7695 case AMDGPU::S_NAND_B64:
7696 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7697 Inst.eraseFromParent();
7698 return;
7699
7700 case AMDGPU::S_NOR_B64:
7701 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7702 Inst.eraseFromParent();
7703 return;
7704
7705 case AMDGPU::S_XNOR_B64:
7706 if (ST.hasDLInsts())
7707 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7708 else
7709 splitScalar64BitXnor(Worklist, Inst, MDT);
7710 Inst.eraseFromParent();
7711 return;
7712
7713 case AMDGPU::S_ANDN2_B64:
7714 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7715 Inst.eraseFromParent();
7716 return;
7717
7718 case AMDGPU::S_ORN2_B64:
7719 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7720 Inst.eraseFromParent();
7721 return;
7722
7723 case AMDGPU::S_BREV_B64:
7724 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7725 Inst.eraseFromParent();
7726 return;
7727
7728 case AMDGPU::S_NOT_B64:
7729 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7730 Inst.eraseFromParent();
7731 return;
7732
7733 case AMDGPU::S_BCNT1_I32_B64:
7734 splitScalar64BitBCNT(Worklist, Inst);
7735 Inst.eraseFromParent();
7736 return;
7737
7738 case AMDGPU::S_BFE_I64:
7739 splitScalar64BitBFE(Worklist, Inst);
7740 Inst.eraseFromParent();
7741 return;
7742
7743 case AMDGPU::S_FLBIT_I32_B64:
7744 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7745 Inst.eraseFromParent();
7746 return;
7747 case AMDGPU::S_FF1_I32_B64:
7748 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7749 Inst.eraseFromParent();
7750 return;
7751
7752 case AMDGPU::S_LSHL_B32:
7753 if (ST.hasOnlyRevVALUShifts()) {
7754 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7755 swapOperands(Inst);
7756 }
7757 break;
7758 case AMDGPU::S_ASHR_I32:
7759 if (ST.hasOnlyRevVALUShifts()) {
7760 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7761 swapOperands(Inst);
7762 }
7763 break;
7764 case AMDGPU::S_LSHR_B32:
7765 if (ST.hasOnlyRevVALUShifts()) {
7766 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7767 swapOperands(Inst);
7768 }
7769 break;
7770 case AMDGPU::S_LSHL_B64:
7771 if (ST.hasOnlyRevVALUShifts()) {
7772 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7773 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7774 : AMDGPU::V_LSHLREV_B64_e64;
7775 swapOperands(Inst);
7776 }
7777 break;
7778 case AMDGPU::S_ASHR_I64:
7779 if (ST.hasOnlyRevVALUShifts()) {
7780 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7781 swapOperands(Inst);
7782 }
7783 break;
7784 case AMDGPU::S_LSHR_B64:
7785 if (ST.hasOnlyRevVALUShifts()) {
7786 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7787 swapOperands(Inst);
7788 }
7789 break;
7790
7791 case AMDGPU::S_ABS_I32:
7792 lowerScalarAbs(Worklist, Inst);
7793 Inst.eraseFromParent();
7794 return;
7795
7796 case AMDGPU::S_CBRANCH_SCC0:
7797 case AMDGPU::S_CBRANCH_SCC1: {
7798 // Clear unused bits of vcc
7799 Register CondReg = Inst.getOperand(1).getReg();
7800 bool IsSCC = CondReg == AMDGPU::SCC;
7802 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
7803 .addReg(LMC.ExecReg)
7804 .addReg(IsSCC ? LMC.VccReg : CondReg);
7805 Inst.removeOperand(1);
7806 } break;
7807
7808 case AMDGPU::S_BFE_U64:
7809 case AMDGPU::S_BFM_B64:
7810 llvm_unreachable("Moving this op to VALU not implemented");
7811
7812 case AMDGPU::S_PACK_LL_B32_B16:
7813 case AMDGPU::S_PACK_LH_B32_B16:
7814 case AMDGPU::S_PACK_HL_B32_B16:
7815 case AMDGPU::S_PACK_HH_B32_B16:
7816 movePackToVALU(Worklist, MRI, Inst);
7817 Inst.eraseFromParent();
7818 return;
7819
7820 case AMDGPU::S_XNOR_B32:
7821 lowerScalarXnor(Worklist, Inst);
7822 Inst.eraseFromParent();
7823 return;
7824
7825 case AMDGPU::S_NAND_B32:
7826 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7827 Inst.eraseFromParent();
7828 return;
7829
7830 case AMDGPU::S_NOR_B32:
7831 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7832 Inst.eraseFromParent();
7833 return;
7834
7835 case AMDGPU::S_ANDN2_B32:
7836 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7837 Inst.eraseFromParent();
7838 return;
7839
7840 case AMDGPU::S_ORN2_B32:
7841 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7842 Inst.eraseFromParent();
7843 return;
7844
7845 // TODO: remove as soon as everything is ready
7846 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7847 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7848 // can only be selected from the uniform SDNode.
7849 case AMDGPU::S_ADD_CO_PSEUDO:
7850 case AMDGPU::S_SUB_CO_PSEUDO: {
7851 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7852 ? AMDGPU::V_ADDC_U32_e64
7853 : AMDGPU::V_SUBB_U32_e64;
7854 const auto *CarryRC = RI.getWaveMaskRegClass();
7855
7856 Register CarryInReg = Inst.getOperand(4).getReg();
7857 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7858 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7859 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7860 .addReg(CarryInReg);
7861 }
7862
7863 Register CarryOutReg = Inst.getOperand(1).getReg();
7864
7865 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7866 MRI.getRegClass(Inst.getOperand(0).getReg())));
7867 MachineInstr *CarryOp =
7868 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7869 .addReg(CarryOutReg, RegState::Define)
7870 .add(Inst.getOperand(2))
7871 .add(Inst.getOperand(3))
7872 .addReg(CarryInReg)
7873 .addImm(0);
7874 legalizeOperands(*CarryOp);
7875 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7876 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7877 Inst.eraseFromParent();
7878 }
7879 return;
7880 case AMDGPU::S_UADDO_PSEUDO:
7881 case AMDGPU::S_USUBO_PSEUDO: {
7882 const DebugLoc &DL = Inst.getDebugLoc();
7883 MachineOperand &Dest0 = Inst.getOperand(0);
7884 MachineOperand &Dest1 = Inst.getOperand(1);
7885 MachineOperand &Src0 = Inst.getOperand(2);
7886 MachineOperand &Src1 = Inst.getOperand(3);
7887
7888 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7889 ? AMDGPU::V_ADD_CO_U32_e64
7890 : AMDGPU::V_SUB_CO_U32_e64;
7891 const TargetRegisterClass *NewRC =
7892 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7893 Register DestReg = MRI.createVirtualRegister(NewRC);
7894 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7895 .addReg(Dest1.getReg(), RegState::Define)
7896 .add(Src0)
7897 .add(Src1)
7898 .addImm(0); // clamp bit
7899
7900 legalizeOperands(*NewInstr, MDT);
7901 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7902 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7903 Worklist);
7904 Inst.eraseFromParent();
7905 }
7906 return;
7907
7908 case AMDGPU::S_CSELECT_B32:
7909 case AMDGPU::S_CSELECT_B64:
7910 lowerSelect(Worklist, Inst, MDT);
7911 Inst.eraseFromParent();
7912 return;
7913 case AMDGPU::S_CMP_EQ_I32:
7914 case AMDGPU::S_CMP_LG_I32:
7915 case AMDGPU::S_CMP_GT_I32:
7916 case AMDGPU::S_CMP_GE_I32:
7917 case AMDGPU::S_CMP_LT_I32:
7918 case AMDGPU::S_CMP_LE_I32:
7919 case AMDGPU::S_CMP_EQ_U32:
7920 case AMDGPU::S_CMP_LG_U32:
7921 case AMDGPU::S_CMP_GT_U32:
7922 case AMDGPU::S_CMP_GE_U32:
7923 case AMDGPU::S_CMP_LT_U32:
7924 case AMDGPU::S_CMP_LE_U32:
7925 case AMDGPU::S_CMP_EQ_U64:
7926 case AMDGPU::S_CMP_LG_U64:
7927 case AMDGPU::S_CMP_LT_F32:
7928 case AMDGPU::S_CMP_EQ_F32:
7929 case AMDGPU::S_CMP_LE_F32:
7930 case AMDGPU::S_CMP_GT_F32:
7931 case AMDGPU::S_CMP_LG_F32:
7932 case AMDGPU::S_CMP_GE_F32:
7933 case AMDGPU::S_CMP_O_F32:
7934 case AMDGPU::S_CMP_U_F32:
7935 case AMDGPU::S_CMP_NGE_F32:
7936 case AMDGPU::S_CMP_NLG_F32:
7937 case AMDGPU::S_CMP_NGT_F32:
7938 case AMDGPU::S_CMP_NLE_F32:
7939 case AMDGPU::S_CMP_NEQ_F32:
7940 case AMDGPU::S_CMP_NLT_F32: {
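// Rewrite the scalar compare as a VALU compare that defines a lane-mask
// CondReg instead of SCC; users of the SCC def are redirected to CondReg
// via addSCCDefUsersToVALUWorklist below.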
7941 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7942 auto NewInstr =
7943 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7944 .setMIFlags(Inst.getFlags());
7945 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
7946 0) {
7947 NewInstr
7948 .addImm(0) // src0_modifiers
7949 .add(Inst.getOperand(0)) // src0
7950 .addImm(0) // src1_modifiers
7951 .add(Inst.getOperand(1)) // src1
7952 .addImm(0); // clamp
7953 } else {
7954 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
7955 }
7956 legalizeOperands(*NewInstr, MDT);
7957 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7958 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7959 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7960 Inst.eraseFromParent();
7961 return;
7962 }
7963 case AMDGPU::S_CMP_LT_F16:
7964 case AMDGPU::S_CMP_EQ_F16:
7965 case AMDGPU::S_CMP_LE_F16:
7966 case AMDGPU::S_CMP_GT_F16:
7967 case AMDGPU::S_CMP_LG_F16:
7968 case AMDGPU::S_CMP_GE_F16:
7969 case AMDGPU::S_CMP_O_F16:
7970 case AMDGPU::S_CMP_U_F16:
7971 case AMDGPU::S_CMP_NGE_F16:
7972 case AMDGPU::S_CMP_NLG_F16:
7973 case AMDGPU::S_CMP_NGT_F16:
7974 case AMDGPU::S_CMP_NLE_F16:
7975 case AMDGPU::S_CMP_NEQ_F16:
7976 case AMDGPU::S_CMP_NLT_F16: {
7977 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7978 auto NewInstr =
7979 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7980 .setMIFlags(Inst.getFlags());
7981 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
7982 NewInstr
7983 .addImm(0) // src0_modifiers
7984 .add(Inst.getOperand(0)) // src0
7985 .addImm(0) // src1_modifiers
7986 .add(Inst.getOperand(1)) // src1
7987 .addImm(0); // clamp
7988 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
7989 NewInstr.addImm(0); // op_sel0
7990 } else {
7991 NewInstr
7992 .add(Inst.getOperand(0))
7993 .add(Inst.getOperand(1));
7994 }
7995 legalizeOperandsVALUt16(*NewInstr, MRI);
7996 legalizeOperands(*NewInstr, MDT);
7997 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7998 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7999 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8000 Inst.eraseFromParent();
8001 return;
8002 }
8003 case AMDGPU::S_CVT_HI_F32_F16: {
8004 const DebugLoc &DL = Inst.getDebugLoc();
8005 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8006 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
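// The scalar op converts the high f16 half of the 32-bit source. With
// true16 instructions we can read the hi16 subregister directly; otherwise
// shift the source right by 16 and convert the low half.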
8007 if (ST.useRealTrue16Insts()) {
8008 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
8009 .add(Inst.getOperand(1));
8010 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8011 .addImm(0) // src0_modifiers
8012 .addReg(TmpReg, 0, AMDGPU::hi16)
8013 .addImm(0) // clamp
8014 .addImm(0) // omod
8015 .addImm(0); // op_sel0
8016 } else {
8017 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8018 .addImm(16)
8019 .add(Inst.getOperand(1));
8020 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8021 .addImm(0) // src0_modifiers
8022 .addReg(TmpReg)
8023 .addImm(0) // clamp
8024 .addImm(0); // omod
8025 }
8026
8027 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8028 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8029 Inst.eraseFromParent();
8030 return;
8031 }
8032 case AMDGPU::S_MINIMUM_F32:
8033 case AMDGPU::S_MAXIMUM_F32: {
8034 const DebugLoc &DL = Inst.getDebugLoc();
8035 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8036 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8037 .addImm(0) // src0_modifiers
8038 .add(Inst.getOperand(1))
8039 .addImm(0) // src1_modifiers
8040 .add(Inst.getOperand(2))
8041 .addImm(0) // clamp
8042 .addImm(0); // omod
8043 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8044
8045 legalizeOperands(*NewInstr, MDT);
8046 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8047 Inst.eraseFromParent();
8048 return;
8049 }
8050 case AMDGPU::S_MINIMUM_F16:
8051 case AMDGPU::S_MAXIMUM_F16: {
8052 const DebugLoc &DL = Inst.getDebugLoc();
8053 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8054 ? &AMDGPU::VGPR_16RegClass
8055 : &AMDGPU::VGPR_32RegClass);
8056 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8057 .addImm(0) // src0_modifiers
8058 .add(Inst.getOperand(1))
8059 .addImm(0) // src1_modifiers
8060 .add(Inst.getOperand(2))
8061 .addImm(0) // clamp
8062 .addImm(0) // omod
8063 .addImm(0); // opsel0
8064 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8065 legalizeOperandsVALUt16(*NewInstr, MRI);
8066 legalizeOperands(*NewInstr, MDT);
8067 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8068 Inst.eraseFromParent();
8069 return;
8070 }
8071 case AMDGPU::V_S_EXP_F16_e64:
8072 case AMDGPU::V_S_LOG_F16_e64:
8073 case AMDGPU::V_S_RCP_F16_e64:
8074 case AMDGPU::V_S_RSQ_F16_e64:
8075 case AMDGPU::V_S_SQRT_F16_e64: {
8076 const DebugLoc &DL = Inst.getDebugLoc();
8077 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8078 ? &AMDGPU::VGPR_16RegClass
8079 : &AMDGPU::VGPR_32RegClass);
8080 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8081 .add(Inst.getOperand(1)) // src0_modifiers
8082 .add(Inst.getOperand(2))
8083 .add(Inst.getOperand(3)) // clamp
8084 .add(Inst.getOperand(4)) // omod
8085 .setMIFlags(Inst.getFlags());
8086 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8087 NewInstr.addImm(0); // opsel0
8088 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8089 legalizeOperandsVALUt16(*NewInstr, MRI);
8090 legalizeOperands(*NewInstr, MDT);
8091 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8092 Inst.eraseFromParent();
8093 return;
8094 }
8095 }
8096
8097 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8098 // We cannot move this instruction to the VALU, so we should try to
8099 // legalize its operands instead.
8100 legalizeOperands(Inst, MDT);
8101 return;
8102 }
8103 // Handle converting generic instructions like COPY-to-SGPR into
8104 // COPY-to-VGPR.
8105 if (NewOpcode == Opcode) {
8106 Register DstReg = Inst.getOperand(0).getReg();
8107 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8108
8109 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8110 // hope for the best.
8111 if (Inst.isCopy() && DstReg.isPhysical() &&
8112 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8113 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8114 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8115 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
8116 .add(Inst.getOperand(1));
8117 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
8118 DstReg)
8119 .addReg(NewDst);
8120
8121 Inst.eraseFromParent();
8122 return;
8123 }
8124
8125 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
8126 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
8127 // Instead of creating a copy where src and dst are the same register
8128 // class, we just replace all uses of dst with src. These kinds of
8129 // copies interfere with the heuristics MachineSink uses to decide
8130 // whether or not to split a critical edge, since the pass assumes
8131 // that copies will end up as machine instructions and not be
8132 // eliminated.
8133 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8134 Register NewDstReg = Inst.getOperand(1).getReg();
8135 MRI.replaceRegWith(DstReg, NewDstReg);
8136 MRI.clearKillFlags(NewDstReg);
8137 Inst.getOperand(0).setReg(DstReg);
8138 Inst.eraseFromParent();
8139 // Legalize t16 operand since replaceReg is called after addUsersToVALU
8140 for (MachineOperand &MO :
8141 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8142 legalizeOperandsVALUt16(*MO.getParent(), MRI);
8143 }
8144 return;
8145 }
8146
8147 // If this is a v2s copy between a 16-bit and a 32-bit register, replace
8148 // the vgpr copy with a reg_sequence/extract_subreg.
8149 // This can be removed once sgpr16 is in place.
8150 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8151 Inst.getOperand(1).getReg().isVirtual() &&
8152 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8153 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8154 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8155 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8156 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8157 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8158 get(AMDGPU::IMPLICIT_DEF), Undef);
8159 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8160 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8161 .addReg(Inst.getOperand(1).getReg())
8162 .addImm(AMDGPU::lo16)
8163 .addReg(Undef)
8164 .addImm(AMDGPU::hi16);
8165 Inst.eraseFromParent();
8166 MRI.replaceRegWith(DstReg, NewDstReg);
8167 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8168 return;
8169 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8170 AMDGPU::lo16)) {
8171 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8172 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8173 MRI.replaceRegWith(DstReg, NewDstReg);
8174 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8175 return;
8176 }
8177 }
8178
8179 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8180 MRI.replaceRegWith(DstReg, NewDstReg);
8181 legalizeOperands(Inst, MDT);
8182 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8183 return;
8184 }
8185
8186 // Use the new VALU Opcode.
8187 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8188 .setMIFlags(Inst.getFlags());
8189 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8190 // Intersperse VOP3 modifiers among the SALU operands.
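// For example, an SALU (dst, src0, src1) typically becomes
// (dst, src0_modifiers, src0, src1_modifiers, src1, clamp, omod, ...), with
// zero immediates filling the modifier/clamp/omod slots the VALU form needs.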
8191 NewInstr->addOperand(Inst.getOperand(0));
8192 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8193 AMDGPU::OpName::src0_modifiers) >= 0)
8194 NewInstr.addImm(0);
8195 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8196 MachineOperand Src = Inst.getOperand(1);
8197 NewInstr->addOperand(Src);
8198 }
8199
8200 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8201 // We are converting these to a BFE, so we need to add the missing
8202 // operands for the size and offset.
8203 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8204 NewInstr.addImm(0);
8205 NewInstr.addImm(Size);
8206 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8207 // The VALU version adds the second operand to the result, so insert an
8208 // extra 0 operand.
8209 NewInstr.addImm(0);
8210 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8211 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8212 // If we need to move this to VGPRs, we need to unpack the second
8213 // operand back into the 2 separate ones for bit offset and width.
8214 assert(OffsetWidthOp.isImm() &&
8215 "Scalar BFE is only implemented for constant width and offset");
8216 uint32_t Imm = OffsetWidthOp.getImm();
8217
8218 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8219 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8220 NewInstr.addImm(Offset);
8221 NewInstr.addImm(BitWidth);
8222 } else {
8223 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8224 AMDGPU::OpName::src1_modifiers) >= 0)
8225 NewInstr.addImm(0);
8226 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8227 NewInstr->addOperand(Inst.getOperand(2));
8228 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8229 AMDGPU::OpName::src2_modifiers) >= 0)
8230 NewInstr.addImm(0);
8231 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8232 NewInstr->addOperand(Inst.getOperand(3));
8233 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8234 NewInstr.addImm(0);
8235 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8236 NewInstr.addImm(0);
8237 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8238 NewInstr.addImm(0);
8239 }
8240 } else {
8241 // Just copy the SALU operands.
8242 for (const MachineOperand &Op : Inst.explicit_operands())
8243 NewInstr->addOperand(Op);
8244 }
8245
8246 // Remove any references to SCC. Vector instructions can't read from it, and
8247 // we're just about to add the implicit use / defs of VCC, and we don't want
8248 // both.
8249 for (MachineOperand &Op : Inst.implicit_operands()) {
8250 if (Op.getReg() == AMDGPU::SCC) {
8251 // Only propagate through live-def of SCC.
8252 if (Op.isDef() && !Op.isDead())
8253 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8254 if (Op.isUse())
8255 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8256 }
8257 }
8258 Inst.eraseFromParent();
8259 Register NewDstReg;
8260 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8261 Register DstReg = NewInstr->getOperand(0).getReg();
8262 assert(DstReg.isVirtual());
8263 // Update the destination register class.
8264 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8265 assert(NewDstRC);
8266 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8267 MRI.replaceRegWith(DstReg, NewDstReg);
8268 }
8269 fixImplicitOperands(*NewInstr);
8270
8271 legalizeOperandsVALUt16(*NewInstr, MRI);
8272
8273 // Legalize the operands
8274 legalizeOperands(*NewInstr, MDT);
8275 if (NewDstReg)
8276 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8277}
8278
8279// Add/sub require special handling to deal with carry outs.
8280std::pair<bool, MachineBasicBlock *>
8281SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8282 MachineDominatorTree *MDT) const {
8283 if (ST.hasAddNoCarry()) {
8284 // Assume there is no user of scc since we don't select this in that case.
8285 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8286 // is used.
8287
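// On targets with no-carry adds, rewrite the instruction in place to
// V_ADD_U32_e64 / V_SUB_U32_e64: drop the (assumed dead) SCC def and
// append the clamp operand the VALU encoding expects.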
8288 MachineBasicBlock &MBB = *Inst.getParent();
8289 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8290
8291 Register OldDstReg = Inst.getOperand(0).getReg();
8292 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8293
8294 unsigned Opc = Inst.getOpcode();
8295 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8296
8297 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8298 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8299
8300 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8301 Inst.removeOperand(3);
8302
8303 Inst.setDesc(get(NewOpc));
8304 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8305 Inst.addImplicitDefUseOperands(*MBB.getParent());
8306 MRI.replaceRegWith(OldDstReg, ResultReg);
8307 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8308
8309 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8310 return std::pair(true, NewBB);
8311 }
8312
8313 return std::pair(false, nullptr);
8314}
8315
8316void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8317 MachineDominatorTree *MDT) const {
8318
8319 MachineBasicBlock &MBB = *Inst.getParent();
8320 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8321 MachineBasicBlock::iterator MII = Inst;
8322 DebugLoc DL = Inst.getDebugLoc();
8323
8324 MachineOperand &Dest = Inst.getOperand(0);
8325 MachineOperand &Src0 = Inst.getOperand(1);
8326 MachineOperand &Src1 = Inst.getOperand(2);
8327 MachineOperand &Cond = Inst.getOperand(3);
8328
8329 Register CondReg = Cond.getReg();
8330 bool IsSCC = (CondReg == AMDGPU::SCC);
8331
8332 // If this is a trivial select where the condition is effectively not SCC
8333 // (CondReg is a source of copy to SCC), then the select is semantically
8334 // equivalent to copying CondReg. Hence, there is no need to create a
8335 // V_CNDMASK; we can just use CondReg and bail out.
8336 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8337 (Src1.getImm() == 0)) {
8338 MRI.replaceRegWith(Dest.getReg(), CondReg);
8339 return;
8340 }
8341
8342 Register NewCondReg = CondReg;
8343 if (IsSCC) {
8344 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8345 NewCondReg = MRI.createVirtualRegister(TC);
8346
8347 // Now look for the closest SCC def; if it is a copy, replace CondReg
8348 // with the COPY's source register.
8349 bool CopyFound = false;
8350 for (MachineInstr &CandI :
8351 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
8352            Inst.getParent()->rend())) {
8353 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8354 -1) {
8355 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8356 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8357 .addReg(CandI.getOperand(1).getReg());
8358 CopyFound = true;
8359 }
8360 break;
8361 }
8362 }
8363 if (!CopyFound) {
8364 // SCC def is not a copy
8365 // Insert a trivial select instead of creating a copy, because a copy from
8366 // SCC would semantically mean just copying a single bit, but we may need
8367 // the result to be a vector condition mask that needs preserving.
8368 unsigned Opcode =
8369 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8370 auto NewSelect =
8371 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8372 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8373 }
8374 }
8375
8376 Register NewDestReg = MRI.createVirtualRegister(
8377 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8378 MachineInstr *NewInst;
8379 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8380 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8381 .addImm(0)
8382 .add(Src1) // False
8383 .addImm(0)
8384 .add(Src0) // True
8385 .addReg(NewCondReg);
8386 } else {
8387 NewInst =
8388 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8389 .add(Src1) // False
8390 .add(Src0) // True
8391 .addReg(NewCondReg);
8392 }
8393 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8394 legalizeOperands(*NewInst, MDT);
8395 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8396}
8397
8398void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8399 MachineInstr &Inst) const {
8400 MachineBasicBlock &MBB = *Inst.getParent();
8401 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8402 MachineBasicBlock::iterator MII = Inst;
8403 DebugLoc DL = Inst.getDebugLoc();
8404
8405 MachineOperand &Dest = Inst.getOperand(0);
8406 MachineOperand &Src = Inst.getOperand(1);
8407 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8408 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8409
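// abs(x) is lowered as max(x, 0 - x): a VALU subtract from zero followed by
// V_MAX_I32.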
8410 unsigned SubOp = ST.hasAddNoCarry() ?
8411 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8412
8413 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8414 .addImm(0)
8415 .addReg(Src.getReg());
8416
8417 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8418 .addReg(Src.getReg())
8419 .addReg(TmpReg);
8420
8421 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8422 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8423}
8424
8425void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8426 MachineInstr &Inst) const {
8427 MachineBasicBlock &MBB = *Inst.getParent();
8428 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8429 MachineBasicBlock::iterator MII = Inst;
8430 const DebugLoc &DL = Inst.getDebugLoc();
8431
8432 MachineOperand &Dest = Inst.getOperand(0);
8433 MachineOperand &Src0 = Inst.getOperand(1);
8434 MachineOperand &Src1 = Inst.getOperand(2);
8435
8436 if (ST.hasDLInsts()) {
8437 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8438 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8439 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8440
8441 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8442 .add(Src0)
8443 .add(Src1);
8444
8445 MRI.replaceRegWith(Dest.getReg(), NewDest);
8446 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8447 } else {
8448 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8449 // invert either source and then perform the XOR. If either source is a
8450 // scalar register, then we can leave the inversion on the scalar unit to
8451 // achieve a better distribution of scalar and vector instructions.
8452 bool Src0IsSGPR = Src0.isReg() &&
8453 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8454 bool Src1IsSGPR = Src1.isReg() &&
8455 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8456 MachineInstr *Xor;
8457 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8458 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8459
8460 // Build a pair of scalar instructions and add them to the work list.
8461 // The next iteration over the work list will lower these to the vector
8462 // unit as necessary.
8463 if (Src0IsSGPR) {
8464 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8465 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8466 .addReg(Temp)
8467 .add(Src1);
8468 } else if (Src1IsSGPR) {
8469 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8470 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8471 .add(Src0)
8472 .addReg(Temp);
8473 } else {
8474 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8475 .add(Src0)
8476 .add(Src1);
8477 MachineInstr *Not =
8478 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8479 Worklist.insert(Not);
8480 }
8481
8482 MRI.replaceRegWith(Dest.getReg(), NewDest);
8483
8484 Worklist.insert(Xor);
8485
8486 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8487 }
8488}
8489
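// Expand a scalar pseudo of the form not(src0 <op> src1): emit the binary
// <op> into an intermediate SGPR followed by an S_NOT_B32, and queue both new
// instructions so a later worklist pass can move them to the VALU if needed.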
8490void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8491 MachineInstr &Inst,
8492 unsigned Opcode) const {
8493 MachineBasicBlock &MBB = *Inst.getParent();
8494 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8495 MachineBasicBlock::iterator MII = Inst;
8496 const DebugLoc &DL = Inst.getDebugLoc();
8497
8498 MachineOperand &Dest = Inst.getOperand(0);
8499 MachineOperand &Src0 = Inst.getOperand(1);
8500 MachineOperand &Src1 = Inst.getOperand(2);
8501
8502 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8503 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8504
8505 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8506 .add(Src0)
8507 .add(Src1);
8508
8509 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8510 .addReg(Interm);
8511
8512 Worklist.insert(&Op);
8513 Worklist.insert(&Not);
8514
8515 MRI.replaceRegWith(Dest.getReg(), NewDest);
8516 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8517}
8518
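// Expand a scalar pseudo of the form src0 <op> not(src1) (the "N2" refers to
// the NOT applied to the second operand): emit S_NOT_B32 on src1 into an
// intermediate register, then the binary <op>, and queue both instructions
// for possible later VALU lowering.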
8519void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8520 MachineInstr &Inst,
8521 unsigned Opcode) const {
8522 MachineBasicBlock &MBB = *Inst.getParent();
8523 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8524 MachineBasicBlock::iterator MII = Inst;
8525 const DebugLoc &DL = Inst.getDebugLoc();
8526
8527 MachineOperand &Dest = Inst.getOperand(0);
8528 MachineOperand &Src0 = Inst.getOperand(1);
8529 MachineOperand &Src1 = Inst.getOperand(2);
8530
8531 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8532 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8533
8534 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8535 .add(Src1);
8536
8537 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8538 .add(Src0)
8539 .addReg(Interm);
8540
8541 Worklist.insert(&Not);
8542 Worklist.insert(&Op);
8543
8544 MRI.replaceRegWith(Dest.getReg(), NewDest);
8545 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8546}
8547
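// Split a 64-bit scalar unary operation into two 32-bit VALU operations:
// apply Opcode to the sub0 and sub1 halves of the source independently and
// recombine the results with a REG_SEQUENCE. If Swap is set, the two result
// halves are exchanged before being recombined.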
8548void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8549 MachineInstr &Inst, unsigned Opcode,
8550 bool Swap) const {
8551 MachineBasicBlock &MBB = *Inst.getParent();
8552 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8553
8554 MachineOperand &Dest = Inst.getOperand(0);
8555 MachineOperand &Src0 = Inst.getOperand(1);
8556 DebugLoc DL = Inst.getDebugLoc();
8557
8558 MachineBasicBlock::iterator MII = Inst;
8559
8560 const MCInstrDesc &InstDesc = get(Opcode);
8561 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8562 MRI.getRegClass(Src0.getReg()) :
8563 &AMDGPU::SGPR_32RegClass;
8564
8565 const TargetRegisterClass *Src0SubRC =
8566 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8567
8568 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8569 AMDGPU::sub0, Src0SubRC);
8570
8571 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8572 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8573 const TargetRegisterClass *NewDestSubRC =
8574 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8575
8576 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8577 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8578
8579 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8580 AMDGPU::sub1, Src0SubRC);
8581
8582 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8583 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8584
8585 if (Swap)
8586 std::swap(DestSub0, DestSub1);
8587
8588 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8589 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8590 .addReg(DestSub0)
8591 .addImm(AMDGPU::sub0)
8592 .addReg(DestSub1)
8593 .addImm(AMDGPU::sub1);
8594
8595 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8596
8597 Worklist.insert(&LoHalf);
8598 Worklist.insert(&HiHalf);
8599
8600 // We don't need to legalizeOperands here because for a single operand, src0
8601 // will support any kind of input.
8602
8603 // Move all users of this moved value.
8604 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8605}
8606
8607// There is no vector equivalent of s_mul_u64. For this reason, we need to
8608// split the s_mul_u64 into 32-bit vector multiplications.
8609void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8610 MachineInstr &Inst,
8611 MachineDominatorTree *MDT) const {
8612 MachineBasicBlock &MBB = *Inst.getParent();
8613 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8614
8615 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8616 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8617 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8618
8619 MachineOperand &Dest = Inst.getOperand(0);
8620 MachineOperand &Src0 = Inst.getOperand(1);
8621 MachineOperand &Src1 = Inst.getOperand(2);
8622 const DebugLoc &DL = Inst.getDebugLoc();
8623 MachineBasicBlock::iterator MII = Inst;
8624
8625 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8626 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8627 const TargetRegisterClass *Src0SubRC =
8628 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8629 if (RI.isSGPRClass(Src0SubRC))
8630 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8631 const TargetRegisterClass *Src1SubRC =
8632 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8633 if (RI.isSGPRClass(Src1SubRC))
8634 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8635
8636 // First, we extract the low 32-bit and high 32-bit values from each of the
8637 // operands.
8638 MachineOperand Op0L =
8639 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8640 MachineOperand Op1L =
8641 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8642 MachineOperand Op0H =
8643 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8644 MachineOperand Op1H =
8645 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8646
8647 // The multiplication is done as follows:
8648 //
8649 // Op1H Op1L
8650 // * Op0H Op0L
8651 // --------------------
8652 // Op1H*Op0L Op1L*Op0L
8653 // + Op1H*Op0H Op1L*Op0H
8654 // -----------------------------------------
8655 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8656 //
8657 // We drop Op1H*Op0H because it only contributes to bits 64 and above,
8658 // which lie outside the 64-bit result.
8659 // The low 32-bit value is Op1L*Op0L.
8660 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
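  // A small worked example: 0x0000000100000002 * 0x0000000300000004 gives
  // Op0L = 2, Op0H = 1, Op1L = 4, Op1H = 3, so the low word is Op1L*Op0L = 8
  // (with no carry) and the high word is Op1H*Op0L + Op1L*Op0H + 0 = 6 + 4 = 10,
  // i.e. 0x0000000a00000008, which is the full product truncated to 64 bits.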
8661
8662 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8663 MachineInstr *Op1L_Op0H =
8664 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8665 .add(Op1L)
8666 .add(Op0H);
8667
8668 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8669 MachineInstr *Op1H_Op0L =
8670 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8671 .add(Op1H)
8672 .add(Op0L);
8673
8674 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8675 MachineInstr *Carry =
8676 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8677 .add(Op1L)
8678 .add(Op0L);
8679
8680 MachineInstr *LoHalf =
8681 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8682 .add(Op1L)
8683 .add(Op0L);
8684
8685 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8686 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8687 .addReg(Op1L_Op0H_Reg)
8688 .addReg(Op1H_Op0L_Reg);
8689
8690 MachineInstr *HiHalf =
8691 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8692 .addReg(AddReg)
8693 .addReg(CarryReg);
8694
8695 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8696 .addReg(DestSub0)
8697 .addImm(AMDGPU::sub0)
8698 .addReg(DestSub1)
8699 .addImm(AMDGPU::sub1);
8700
8701 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8702
8703 // Try to legalize the operands in case we need to swap the order to keep it
8704 // valid.
8705 legalizeOperands(*Op1L_Op0H, MDT);
8706 legalizeOperands(*Op1H_Op0L, MDT);
8707 legalizeOperands(*Carry, MDT);
8708 legalizeOperands(*LoHalf, MDT);
8709 legalizeOperands(*Add, MDT);
8710 legalizeOperands(*HiHalf, MDT);
8711
8712 // Move all users of this moved value.
8713 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8714}
8715
8716// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8717// multiplications.
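// These pseudos encode a 64-bit multiply whose operands are already known to
// be zero- or sign-extended 32-bit values (hence the U32/I32 suffix), so the
// product can be formed from just the low halves: V_MUL_LO for the low word
// and V_MUL_HI (unsigned or signed) for the high word, with no cross terms.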
8718void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8719 MachineInstr &Inst,
8720 MachineDominatorTree *MDT) const {
8721 MachineBasicBlock &MBB = *Inst.getParent();
8722 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8723
8724 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8725 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8726 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8727
8728 MachineOperand &Dest = Inst.getOperand(0);
8729 MachineOperand &Src0 = Inst.getOperand(1);
8730 MachineOperand &Src1 = Inst.getOperand(2);
8731 const DebugLoc &DL = Inst.getDebugLoc();
8732 MachineBasicBlock::iterator MII = Inst;
8733
8734 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8735 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8736 const TargetRegisterClass *Src0SubRC =
8737 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8738 if (RI.isSGPRClass(Src0SubRC))
8739 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8740 const TargetRegisterClass *Src1SubRC =
8741 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8742 if (RI.isSGPRClass(Src1SubRC))
8743 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8744
8745 // First, we extract the low 32-bit and high 32-bit values from each of the
8746 // operands.
8747 MachineOperand Op0L =
8748 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8749 MachineOperand Op1L =
8750 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8751
8752 unsigned Opc = Inst.getOpcode();
8753 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8754 ? AMDGPU::V_MUL_HI_U32_e64
8755 : AMDGPU::V_MUL_HI_I32_e64;
8756 MachineInstr *HiHalf =
8757 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8758
8759 MachineInstr *LoHalf =
8760 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8761 .add(Op1L)
8762 .add(Op0L);
8763
8764 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8765 .addReg(DestSub0)
8766 .addImm(AMDGPU::sub0)
8767 .addReg(DestSub1)
8768 .addImm(AMDGPU::sub1);
8769
8770 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8771
8772 // Try to legalize the operands in case we need to swap the order to keep it
8773 // valid.
8774 legalizeOperands(*HiHalf, MDT);
8775 legalizeOperands(*LoHalf, MDT);
8776
8777 // Move all users of this moved value.
8778 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8779}
8780
8781void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8782 MachineInstr &Inst, unsigned Opcode,
8783 MachineDominatorTree *MDT) const {
8784 MachineBasicBlock &MBB = *Inst.getParent();
8785 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8786
8787 MachineOperand &Dest = Inst.getOperand(0);
8788 MachineOperand &Src0 = Inst.getOperand(1);
8789 MachineOperand &Src1 = Inst.getOperand(2);
8790 DebugLoc DL = Inst.getDebugLoc();
8791
8792 MachineBasicBlock::iterator MII = Inst;
8793
8794 const MCInstrDesc &InstDesc = get(Opcode);
8795 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8796 MRI.getRegClass(Src0.getReg()) :
8797 &AMDGPU::SGPR_32RegClass;
8798
8799 const TargetRegisterClass *Src0SubRC =
8800 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8801 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8802 MRI.getRegClass(Src1.getReg()) :
8803 &AMDGPU::SGPR_32RegClass;
8804
8805 const TargetRegisterClass *Src1SubRC =
8806 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8807
8808 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8809 AMDGPU::sub0, Src0SubRC);
8810 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8811 AMDGPU::sub0, Src1SubRC);
8812 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8813 AMDGPU::sub1, Src0SubRC);
8814 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8815 AMDGPU::sub1, Src1SubRC);
8816
8817 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8818 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8819 const TargetRegisterClass *NewDestSubRC =
8820 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8821
8822 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8823 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8824 .add(SrcReg0Sub0)
8825 .add(SrcReg1Sub0);
8826
8827 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8828 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8829 .add(SrcReg0Sub1)
8830 .add(SrcReg1Sub1);
8831
8832 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8833 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8834 .addReg(DestSub0)
8835 .addImm(AMDGPU::sub0)
8836 .addReg(DestSub1)
8837 .addImm(AMDGPU::sub1);
8838
8839 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8840
8841 Worklist.insert(&LoHalf);
8842 Worklist.insert(&HiHalf);
8843
8844 // Move all users of this moved value.
8845 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8846}
8847
8848void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8849 MachineInstr &Inst,
8850 MachineDominatorTree *MDT) const {
8851 MachineBasicBlock &MBB = *Inst.getParent();
8852 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8853
8854 MachineOperand &Dest = Inst.getOperand(0);
8855 MachineOperand &Src0 = Inst.getOperand(1);
8856 MachineOperand &Src1 = Inst.getOperand(2);
8857 const DebugLoc &DL = Inst.getDebugLoc();
8858
8859 MachineBasicBlock::iterator MII = Inst;
8860
8861 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8862
8863 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8864
8865 MachineOperand* Op0;
8866 MachineOperand* Op1;
8867
8868 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8869 Op0 = &Src0;
8870 Op1 = &Src1;
8871 } else {
8872 Op0 = &Src1;
8873 Op1 = &Src0;
8874 }
8875
8876 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8877 .add(*Op0);
8878
8879 Register NewDest = MRI.createVirtualRegister(DestRC);
8880
8881 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8882 .addReg(Interm)
8883 .add(*Op1);
8884
8885 MRI.replaceRegWith(Dest.getReg(), NewDest);
8886
8887 Worklist.insert(&Xor);
8888}
8889
8890void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8891 MachineInstr &Inst) const {
8892 MachineBasicBlock &MBB = *Inst.getParent();
8893 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8894
8895 MachineBasicBlock::iterator MII = Inst;
8896 const DebugLoc &DL = Inst.getDebugLoc();
8897
8898 MachineOperand &Dest = Inst.getOperand(0);
8899 MachineOperand &Src = Inst.getOperand(1);
8900
8901 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8902 const TargetRegisterClass *SrcRC = Src.isReg() ?
8903 MRI.getRegClass(Src.getReg()) :
8904 &AMDGPU::SGPR_32RegClass;
8905
8906 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8907 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8908
8909 const TargetRegisterClass *SrcSubRC =
8910 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8911
8912 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8913 AMDGPU::sub0, SrcSubRC);
8914 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8915 AMDGPU::sub1, SrcSubRC);
8916
8917 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8918
8919 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8920
8921 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8922
8923 // We don't need to legalize operands here. src0 for either instruction can be
8924 // an SGPR, and the second input is unused or determined here.
8925 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8926}
8927
8928void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8929 MachineInstr &Inst) const {
8930 MachineBasicBlock &MBB = *Inst.getParent();
8931 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8932 MachineBasicBlock::iterator MII = Inst;
8933 const DebugLoc &DL = Inst.getDebugLoc();
8934
8935 MachineOperand &Dest = Inst.getOperand(0);
8936 uint32_t Imm = Inst.getOperand(2).getImm();
8937 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8938 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
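  // For example, an encoded immediate of 0x100000 yields Offset = 0 and
  // BitWidth = 16, i.e. sign-extend the low 16 bits of the 64-bit source.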
8939
8940 (void) Offset;
8941
8942 // Only sext_inreg cases handled.
8943 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8944 Offset == 0 && "Not implemented");
8945
8946 if (BitWidth < 32) {
8947 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8948 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8949 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8950
8951 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8952 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8953 .addImm(0)
8954 .addImm(BitWidth);
8955
8956 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8957 .addImm(31)
8958 .addReg(MidRegLo);
8959
8960 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8961 .addReg(MidRegLo)
8962 .addImm(AMDGPU::sub0)
8963 .addReg(MidRegHi)
8964 .addImm(AMDGPU::sub1);
8965
8966 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8967 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8968 return;
8969 }
8970
8971 MachineOperand &Src = Inst.getOperand(1);
8972 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8973 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8974
8975 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8976 .addImm(31)
8977 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8978
8979 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8980 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8981 .addImm(AMDGPU::sub0)
8982 .addReg(TmpReg)
8983 .addImm(AMDGPU::sub1);
8984
8985 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8986 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8987}
8988
8989void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8990 MachineInstr &Inst, unsigned Opcode,
8991 MachineDominatorTree *MDT) const {
8992 // (S_FLBIT_I32_B64 hi:lo) ->
8993 // (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8994 // (S_FF1_I32_B64 hi:lo) ->
8995 // (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
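  // The saturating add of 32 matters when the examined half is zero: FFBH/FFBL
  // then return -1 (all ones), and the clamp keeps the sum at 0xffffffff so the
  // final V_MIN picks the other half's count, or yields -1 if both halves are
  // zero.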
8996
8997 MachineBasicBlock &MBB = *Inst.getParent();
8998 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8999 MachineBasicBlock::iterator MII = Inst;
9000 const DebugLoc &DL = Inst.getDebugLoc();
9001
9002 MachineOperand &Dest = Inst.getOperand(0);
9003 MachineOperand &Src = Inst.getOperand(1);
9004
9005 const MCInstrDesc &InstDesc = get(Opcode);
9006
9007 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9008 unsigned OpcodeAdd =
9009 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
9010
9011 const TargetRegisterClass *SrcRC =
9012 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9013 const TargetRegisterClass *SrcSubRC =
9014 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9015
9016 MachineOperand SrcRegSub0 =
9017 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
9018 MachineOperand SrcRegSub1 =
9019 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
9020
9021 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9022 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9023 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9024 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9025
9026 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
9027
9028 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
9029
9030 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
9031 .addReg(IsCtlz ? MidReg1 : MidReg2)
9032 .addImm(32)
9033 .addImm(1); // enable clamp
9034
9035 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9036 .addReg(MidReg3)
9037 .addReg(IsCtlz ? MidReg2 : MidReg1);
9038
9039 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9040
9041 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9042}
9043
9044void SIInstrInfo::addUsersToMoveToVALUWorklist(
9045 Register DstReg, MachineRegisterInfo &MRI,
9046 SIInstrWorklist &Worklist) const {
9047 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9048 MachineInstr &UseMI = *MO.getParent();
9049
9050 unsigned OpNo = 0;
9051
9052 switch (UseMI.getOpcode()) {
9053 case AMDGPU::COPY:
9054 case AMDGPU::WQM:
9055 case AMDGPU::SOFT_WQM:
9056 case AMDGPU::STRICT_WWM:
9057 case AMDGPU::STRICT_WQM:
9058 case AMDGPU::REG_SEQUENCE:
9059 case AMDGPU::PHI:
9060 case AMDGPU::INSERT_SUBREG:
9061 break;
9062 default:
9063 OpNo = MO.getOperandNo();
9064 break;
9065 }
9066
9067 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9068 MRI.constrainRegClass(DstReg, OpRC);
9069
9070 if (!RI.hasVectorRegisters(OpRC))
9071 Worklist.insert(&UseMI);
9072 else
9073 // Legalization could change user list.
9075 }
9076}
9077
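// Expand the scalar S_PACK_* pseudos into VALU code. With lo16/hi16 denoting
// the 16-bit halves of a 32-bit source:
//   S_PACK_LL_B32_B16 d, s0, s1:  d = lo16(s0) | (lo16(s1) << 16)
//   S_PACK_LH_B32_B16 d, s0, s1:  d = lo16(s0) | (hi16(s1) << 16)
//   S_PACK_HL_B32_B16 d, s0, s1:  d = hi16(s0) | (lo16(s1) << 16)
//   S_PACK_HH_B32_B16 d, s0, s1:  d = hi16(s0) | (hi16(s1) << 16)
// The cases below materialize these with combinations of V_AND/V_LSHRREV and
// V_LSHL_OR, V_BFI or V_AND_OR.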
9078void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9079 MachineRegisterInfo &MRI,
9080 MachineInstr &Inst) const {
9081 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9082 MachineBasicBlock *MBB = Inst.getParent();
9083 MachineOperand &Src0 = Inst.getOperand(1);
9084 MachineOperand &Src1 = Inst.getOperand(2);
9085 const DebugLoc &DL = Inst.getDebugLoc();
9086
9087 switch (Inst.getOpcode()) {
9088 case AMDGPU::S_PACK_LL_B32_B16: {
9089 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9090 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9091
9092 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9093 // 0.
9094 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9095 .addImm(0xffff);
9096
9097 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9098 .addReg(ImmReg, RegState::Kill)
9099 .add(Src0);
9100
9101 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9102 .add(Src1)
9103 .addImm(16)
9104 .addReg(TmpReg, RegState::Kill);
9105 break;
9106 }
9107 case AMDGPU::S_PACK_LH_B32_B16: {
9108 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9109 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9110 .addImm(0xffff);
9111 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9112 .addReg(ImmReg, RegState::Kill)
9113 .add(Src0)
9114 .add(Src1);
9115 break;
9116 }
9117 case AMDGPU::S_PACK_HL_B32_B16: {
9118 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9119 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9120 .addImm(16)
9121 .add(Src0);
9122 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9123 .add(Src1)
9124 .addImm(16)
9125 .addReg(TmpReg, RegState::Kill);
9126 break;
9127 }
9128 case AMDGPU::S_PACK_HH_B32_B16: {
9129 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9130 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9131 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9132 .addImm(16)
9133 .add(Src0);
9134 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9135 .addImm(0xffff0000);
9136 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9137 .add(Src1)
9138 .addReg(ImmReg, RegState::Kill)
9139 .addReg(TmpReg, RegState::Kill);
9140 break;
9141 }
9142 default:
9143 llvm_unreachable("unhandled s_pack_* instruction");
9144 }
9145
9146 MachineOperand &Dest = Inst.getOperand(0);
9147 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9148 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9149}
9150
9151void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
9152 MachineInstr &SCCDefInst,
9153 SIInstrWorklist &Worklist,
9154 Register NewCond) const {
9155
9156 // Ensure that def inst defines SCC, which is still live.
9157 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9158 !Op.isDead() && Op.getParent() == &SCCDefInst);
9159 SmallVector<MachineInstr *, 4> CopyToDelete;
9160 // This assumes that all the users of SCC are in the same block
9161 // as the SCC def.
9162 for (MachineInstr &MI : // Skip the def inst itself.
9163 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9164 SCCDefInst.getParent()->end())) {
9165 // Check if SCC is used first.
9166 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9167 if (SCCIdx != -1) {
9168 if (MI.isCopy()) {
9169 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9170 Register DestReg = MI.getOperand(0).getReg();
9171
9172 MRI.replaceRegWith(DestReg, NewCond);
9173 CopyToDelete.push_back(&MI);
9174 } else {
9175
9176 if (NewCond.isValid())
9177 MI.getOperand(SCCIdx).setReg(NewCond);
9178
9179 Worklist.insert(&MI);
9180 }
9181 }
9182 // Exit if we find another SCC def.
9183 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9184 break;
9185 }
9186 for (auto &Copy : CopyToDelete)
9187 Copy->eraseFromParent();
9188}
9189
9190// Instructions that use SCC may be converted to VALU instructions. When that
9191// happens, the SCC register is changed to VCC_LO. The instruction that defines
9192// SCC must be changed to an instruction that defines VCC. This function makes
9193// sure that the instruction that defines SCC is added to the moveToVALU
9194// worklist.
9195void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9196 SIInstrWorklist &Worklist) const {
9197 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9198 // then there is nothing to do because the defining instruction has been
9199 // converted to a VALU already. If SCC then that instruction needs to be
9200 // converted to a VALU.
9201 for (MachineInstr &MI :
9202 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9203 SCCUseInst->getParent()->rend())) {
9204 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9205 break;
9206 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9207 Worklist.insert(&MI);
9208 break;
9209 }
9210 }
9211}
9212
9213const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9214 const MachineInstr &Inst) const {
9215 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9216
9217 switch (Inst.getOpcode()) {
9218 // For target instructions, getOpRegClass just returns the virtual register
9219 // class associated with the operand, so we need to find an equivalent VGPR
9220 // register class in order to move the instruction to the VALU.
9221 case AMDGPU::COPY:
9222 case AMDGPU::PHI:
9223 case AMDGPU::REG_SEQUENCE:
9224 case AMDGPU::INSERT_SUBREG:
9225 case AMDGPU::WQM:
9226 case AMDGPU::SOFT_WQM:
9227 case AMDGPU::STRICT_WWM:
9228 case AMDGPU::STRICT_WQM: {
9229 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9230 if (RI.isAGPRClass(SrcRC)) {
9231 if (RI.isAGPRClass(NewDstRC))
9232 return nullptr;
9233
9234 switch (Inst.getOpcode()) {
9235 case AMDGPU::PHI:
9236 case AMDGPU::REG_SEQUENCE:
9237 case AMDGPU::INSERT_SUBREG:
9238 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9239 break;
9240 default:
9241 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9242 }
9243
9244 if (!NewDstRC)
9245 return nullptr;
9246 } else {
9247 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9248 return nullptr;
9249
9250 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9251 if (!NewDstRC)
9252 return nullptr;
9253 }
9254
9255 return NewDstRC;
9256 }
9257 default:
9258 return NewDstRC;
9259 }
9260}
9261
9262// Find the one SGPR operand we are allowed to use.
9263Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9264 int OpIndices[3]) const {
9265 const MCInstrDesc &Desc = MI.getDesc();
9266
9267 // Find the one SGPR operand we are allowed to use.
9268 //
9269 // First we need to consider the instruction's operand requirements before
9270 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9271 // of VCC, but we are still bound by the constant bus requirement to only use
9272 // one.
9273 //
9274 // If the operand's class is an SGPR, we can never move it.
9275
9276 Register SGPRReg = findImplicitSGPRRead(MI);
9277 if (SGPRReg)
9278 return SGPRReg;
9279
9280 Register UsedSGPRs[3] = {Register()};
9281 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9282
9283 for (unsigned i = 0; i < 3; ++i) {
9284 int Idx = OpIndices[i];
9285 if (Idx == -1)
9286 break;
9287
9288 const MachineOperand &MO = MI.getOperand(Idx);
9289 if (!MO.isReg())
9290 continue;
9291
9292 // Is this operand statically required to be an SGPR based on the operand
9293 // constraints?
9294 const TargetRegisterClass *OpRC =
9295 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9296 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9297 if (IsRequiredSGPR)
9298 return MO.getReg();
9299
9300 // If this could be a VGPR or an SGPR, Check the dynamic register class.
9301 Register Reg = MO.getReg();
9302 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9303 if (RI.isSGPRClass(RegRC))
9304 UsedSGPRs[i] = Reg;
9305 }
9306
9307 // We don't have a required SGPR operand, so we have a bit more freedom in
9308 // selecting operands to move.
9309
9310 // Try to select the most used SGPR. If an SGPR is equal to one of the
9311 // others, we choose that.
9312 //
9313 // e.g.
9314 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9315 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9316
9317 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9318 // prefer those.
9319
9320 if (UsedSGPRs[0]) {
9321 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9322 SGPRReg = UsedSGPRs[0];
9323 }
9324
9325 if (!SGPRReg && UsedSGPRs[1]) {
9326 if (UsedSGPRs[1] == UsedSGPRs[2])
9327 SGPRReg = UsedSGPRs[1];
9328 }
9329
9330 return SGPRReg;
9331}
9332
9333MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9334 AMDGPU::OpName OperandName) const {
9335 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9336 return nullptr;
9337
9338 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9339 if (Idx == -1)
9340 return nullptr;
9341
9342 return &MI.getOperand(Idx);
9343}
9344
9345uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9346 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9347 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9348 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9349 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9350 return (Format << 44) |
9351 (1ULL << 56) | // RESOURCE_LEVEL = 1
9352 (3ULL << 60); // OOB_SELECT = 3
9353 }
9354
9355 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9356 if (ST.isAmdHsaOS()) {
9357 // Set ATC = 1. GFX9 doesn't have this bit.
9358 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9359 RsrcDataFormat |= (1ULL << 56);
9360
9361 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9362 // BTW, it disables TC L2 and therefore decreases performance.
9363 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9364 RsrcDataFormat |= (2ULL << 59);
9365 }
9366
9367 return RsrcDataFormat;
9368}
9369
9370uint64_t SIInstrInfo::getScratchRsrcWords23() const {
9371 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
9372 AMDGPU::RSRC_TID_ENABLE |
9373 0xffffffff; // Size;
9374
9375 // GFX9 doesn't have ELEMENT_SIZE.
9376 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9377 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9378 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9379 }
9380
9381 // IndexStride = 64 / 32.
9382 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9383 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9384
9385 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9386 // Clear them unless we want a huge stride.
9387 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9388 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9389 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9390
9391 return Rsrc23;
9392}
9393
9395 unsigned Opc = MI.getOpcode();
9396
9397 return isSMRD(Opc);
9398}
9399
9401 return get(Opc).mayLoad() &&
9402 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9403}
9404
9405Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
9406 int &FrameIndex) const {
9407 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9408 if (!Addr || !Addr->isFI())
9409 return Register();
9410
9411 assert(!MI.memoperands_empty() &&
9412 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9413
9414 FrameIndex = Addr->getIndex();
9415 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
9416}
9417
9418Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
9419 int &FrameIndex) const {
9420 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9421 assert(Addr && Addr->isFI());
9422 FrameIndex = Addr->getIndex();
9423 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
9424}
9425
9426Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9427 int &FrameIndex) const {
9428 if (!MI.mayLoad())
9429 return Register();
9430
9431 if (isMUBUF(MI) || isVGPRSpill(MI))
9432 return isStackAccess(MI, FrameIndex);
9433
9434 if (isSGPRSpill(MI))
9435 return isSGPRStackAccess(MI, FrameIndex);
9436
9437 return Register();
9438}
9439
9440Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9441 int &FrameIndex) const {
9442 if (!MI.mayStore())
9443 return Register();
9444
9445 if (isMUBUF(MI) || isVGPRSpill(MI))
9446 return isStackAccess(MI, FrameIndex);
9447
9448 if (isSGPRSpill(MI))
9449 return isSGPRStackAccess(MI, FrameIndex);
9450
9451 return Register();
9452}
9453
9454unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9455 unsigned Size = 0;
9456 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9457 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9458 while (++I != E && I->isInsideBundle()) {
9459 assert(!I->isBundle() && "No nested bundle!");
9460 Size += getInstSizeInBytes(*I);
9461 }
9462
9463 return Size;
9464}
9465
9465unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9466 unsigned Opc = MI.getOpcode();
9467 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
9468 unsigned DescSize = Desc.getSize();
9470
9471 // If we have a definitive size, we can use it. Otherwise we need to inspect
9472 // the operands to know the size.
9473 if (isFixedSize(MI)) {
9474 unsigned Size = DescSize;
9475
9476 // If we hit the buggy offset, an extra nop will be inserted in MC so
9477 // estimate the worst case.
9478 if (MI.isBranch() && ST.hasOffset3fBug())
9479 Size += 4;
9480
9481 return Size;
9482 }
9483
9484 // Instructions may have a 32-bit literal encoded after them. Check
9485 // operands that could ever be literals.
9486 if (isVALU(MI) || isSALU(MI)) {
9487 if (isDPP(MI))
9488 return DescSize;
9489 bool HasLiteral = false;
9490 unsigned LiteralSize = 4;
9491 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9492 const MachineOperand &Op = MI.getOperand(I);
9493 const MCOperandInfo &OpInfo = Desc.operands()[I];
9494 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9495 HasLiteral = true;
9496 if (ST.has64BitLiterals()) {
9497 switch (OpInfo.OperandType) {
9498 default:
9499 break;
9501 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9502 LiteralSize = 8;
9503 break;
9505 if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
9506 LiteralSize = 8;
9507 break;
9508 }
9509 }
9510 break;
9511 }
9512 }
9513 return HasLiteral ? DescSize + LiteralSize : DescSize;
9514 }
9515
9516 // Check whether we have extra NSA words.
9517 if (isMIMG(MI)) {
9518 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9519 if (VAddr0Idx < 0)
9520 return 8;
9521
9522 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
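  // Operands vaddr0 .. (srsrc - 1) hold the NSA address registers. The first
  // address is part of the base 8-byte encoding and each extra NSA dword packs
  // up to four more, so this adds 4 * ceil((NumVAddrs - 1) / 4) bytes.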
9523 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9524 }
9525
9526 switch (Opc) {
9527 case TargetOpcode::BUNDLE:
9528 return getInstBundleSize(MI);
9529 case TargetOpcode::INLINEASM:
9530 case TargetOpcode::INLINEASM_BR: {
9531 const MachineFunction *MF = MI.getParent()->getParent();
9532 const char *AsmStr = MI.getOperand(0).getSymbolName();
9533 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9534 }
9535 default:
9536 if (MI.isMetaInstruction())
9537 return 0;
9538
9539 // If this is a D16 pseudo instruction, get the correct MC code size.
9540 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9541 if (D16Info) {
9542 // Assume the d16_lo/hi variants are always the same size.
9543 unsigned LoInstOpcode = D16Info->LoOp;
9544 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9545 DescSize = Desc.getSize();
9546 }
9547
9548 // If this is an FMA mix pseudo instruction, get the correct MC code size.
9549 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9550 // All potential lowerings are the same size; arbitrarily pick one.
9551 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
9552 DescSize = Desc.getSize();
9553 }
9554
9555 return DescSize;
9556 }
9557}
9558
9560 if (!isFLAT(MI))
9561 return false;
9562
9563 if (MI.memoperands_empty())
9564 return true;
9565
9566 for (const MachineMemOperand *MMO : MI.memoperands()) {
9567 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9568 return true;
9569 }
9570 return false;
9571}
9572
9573ArrayRef<std::pair<int, const char *>>
9574SIInstrInfo::getSerializableTargetIndices() const {
9575 static const std::pair<int, const char *> TargetIndices[] = {
9576 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9577 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9578 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9579 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9580 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9581 return ArrayRef(TargetIndices);
9582}
9583
9584/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
9585/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9591
9592/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9593/// pass.
9598
9599// Called during:
9600// - pre-RA scheduling and post-RA scheduling
9603 const ScheduleDAGMI *DAG) const {
9604 // Borrowed from the ARM target.
9605 // We would like to restrict this hazard recognizer to only
9606 // post-RA scheduling; we can tell that we're post-RA because we don't
9607 // track VRegLiveness.
9608 if (!DAG->hasVRegLiveness())
9609 return new GCNHazardRecognizer(DAG->MF);
9611}
9612
9613std::pair<unsigned, unsigned>
9614SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9615 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9616}
9617
9618ArrayRef<std::pair<unsigned, const char *>>
9619SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9620 static const std::pair<unsigned, const char *> TargetFlags[] = {
9621 {MO_GOTPCREL, "amdgpu-gotprel"},
9622 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9623 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9624 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9625 {MO_REL32_LO, "amdgpu-rel32-lo"},
9626 {MO_REL32_HI, "amdgpu-rel32-hi"},
9627 {MO_REL64, "amdgpu-rel64"},
9628 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9629 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9630 {MO_ABS64, "amdgpu-abs64"},
9631 };
9632
9633 return ArrayRef(TargetFlags);
9634}
9635
9636ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9637SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9638 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9639 {
9640 {MONoClobber, "amdgpu-noclobber"},
9641 {MOLastUse, "amdgpu-last-use"},
9642 {MOCooperative, "amdgpu-cooperative"},
9643 };
9644
9645 return ArrayRef(TargetFlags);
9646}
9647
9648unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
9649 const MachineFunction &MF) const {
9650 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9651 assert(SrcReg.isVirtual());
9652 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9653 return AMDGPU::WWM_COPY;
9654
9655 return AMDGPU::COPY;
9656}
9657
9658bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
9659 Register Reg) const {
9660 // We need to handle instructions which may be inserted during register
9661 // allocation to handle the prolog. The initial prolog instruction may have
9662 // been separated from the start of the block by spills and copies inserted
9663 // for the prolog. However, the insertions for scalar registers can
9664 // always be placed at the BB top as they are independent of the exec mask
9665 // value.
9666 const MachineFunction *MF = MI.getParent()->getParent();
9667 bool IsNullOrVectorRegister = true;
9668 if (Reg) {
9669 const MachineRegisterInfo &MRI = MF->getRegInfo();
9670 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9671 }
9672
9673 uint16_t Opcode = MI.getOpcode();
9674 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
9675 return IsNullOrVectorRegister &&
9676 (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
9677 (Opcode == AMDGPU::IMPLICIT_DEF &&
9678 MFI->isWWMReg(MI.getOperand(0).getReg())) ||
9679 (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
9680 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9681}
9682
9686 const DebugLoc &DL,
9687 Register DestReg) const {
9688 if (ST.hasAddNoCarry())
9689 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9690
9691 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9692 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9693 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9694
9695 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9696 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9697}
9698
9701 const DebugLoc &DL,
9702 Register DestReg,
9703 RegScavenger &RS) const {
9704 if (ST.hasAddNoCarry())
9705 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9706
9707 // If available, prefer to use vcc.
9708 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9709 ? Register(RI.getVCC())
9710 : RS.scavengeRegisterBackwards(
9711 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9712 0, /* AllowSpill */ false);
9713
9714 // TODO: Users need to deal with this.
9715 if (!UnusedCarry.isValid())
9716 return MachineInstrBuilder();
9717
9718 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9719 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9720}
9721
9722bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9723 switch (Opcode) {
9724 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9725 case AMDGPU::SI_KILL_I1_TERMINATOR:
9726 return true;
9727 default:
9728 return false;
9729 }
9730}
9731
9733 switch (Opcode) {
9734 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9735 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9736 case AMDGPU::SI_KILL_I1_PSEUDO:
9737 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9738 default:
9739 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9740 }
9741}
9742
9743bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9744 return Imm <= getMaxMUBUFImmOffset(ST);
9745}
9746
9747unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
9748 // The GFX12 field is a 24-bit signed byte offset; only its non-negative range is used.
9749 const unsigned OffsetBits =
9750 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9751 return (1 << OffsetBits) - 1;
9752}
9753
9754void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
9755 if (!ST.isWave32())
9756 return;
9757
9758 if (MI.isInlineAsm())
9759 return;
9760
9761 for (auto &Op : MI.implicit_operands()) {
9762 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9763 Op.setReg(AMDGPU::VCC_LO);
9764 }
9765}
9766
9767bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
9768 if (!isSMRD(MI))
9769 return false;
9770
9771 // Check that it is using a buffer resource.
9772 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9773 if (Idx == -1) // e.g. s_memtime
9774 return false;
9775
9776 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
9777 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9778}
9779
9780// Given Imm, split it into the values to put into the SOffset and ImmOffset
9781// fields in an MUBUF instruction. Return false if it is not possible (due to a
9782// hardware bug needing a workaround).
9783//
9784// The required alignment ensures that individual address components remain
9785// aligned if they are aligned to begin with. It also ensures that additional
9786// offsets within the given alignment can be added to the resulting ImmOffset.
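// An illustrative example, assuming a 12-bit immediate field (maximum 4095)
// and 4-byte alignment:
//   Imm = 4100 -> ImmOffset = 4092, SOffset = 8 (fits an inline constant)
//   Imm = 5000 -> ImmOffset = 908, SOffset = 4092 (low bits set except the
//                 alignment bits, so nearby offsets can share the SOffset)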
9787bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
9788 uint32_t &ImmOffset, Align Alignment) const {
9789 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9790 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9791 uint32_t Overflow = 0;
9792
9793 if (Imm > MaxImm) {
9794 if (Imm <= MaxImm + 64) {
9795 // Use an SOffset inline constant for 4..64
9796 Overflow = Imm - MaxImm;
9797 Imm = MaxImm;
9798 } else {
9799 // Try to keep the same value in SOffset for adjacent loads, so that
9800 // the corresponding register contents can be re-used.
9801 //
9802 // Load values with all low-bits (except for alignment bits) set into
9803 // SOffset, so that a larger range of values can be covered using
9804 // s_movk_i32.
9805 //
9806 // Atomic operations fail to work correctly when individual address
9807 // components are unaligned, even if their sum is aligned.
9808 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9809 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9810 Imm = Low;
9811 Overflow = High - Alignment.value();
9812 }
9813 }
9814
9815 if (Overflow > 0) {
9816 // There is a hardware bug in SI and CI which prevents address clamping in
9817 // MUBUF instructions from working correctly with SOffsets. The immediate
9818 // offset is unaffected.
9819 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
9820 return false;
9821
9822 // It is not possible to set immediate in SOffset field on some targets.
9823 if (ST.hasRestrictedSOffset())
9824 return false;
9825 }
9826
9827 ImmOffset = Imm;
9828 SOffset = Overflow;
9829 return true;
9830}
9831
9832// Depending on the used address space and instructions, some immediate offsets
9833// are allowed and some are not.
9834// Pre-GFX12, flat instruction offsets can only be non-negative, global and
9835// scratch instruction offsets can also be negative. On GFX12, offsets can be
9836// negative for all variants.
9837//
9838// There are several bugs related to these offsets:
9839// On gfx10.1, flat instructions that go into the global address space cannot
9840// use an offset.
9841//
9842// For scratch instructions, the address can be either an SGPR or a VGPR.
9843// The following offsets can be used, depending on the architecture (x means
9844// cannot be used):
9845// +----------------------------+------+------+
9846// | Address-Mode | SGPR | VGPR |
9847// +----------------------------+------+------+
9848// | gfx9 | | |
9849// | negative, 4-aligned offset | x | ok |
9850// | negative, unaligned offset | x | ok |
9851// +----------------------------+------+------+
9852// | gfx10 | | |
9853// | negative, 4-aligned offset | ok | ok |
9854// | negative, unaligned offset | ok | x |
9855// +----------------------------+------+------+
9856// | gfx10.3 | | |
9857// | negative, 4-aligned offset | ok | ok |
9858// | negative, unaligned offset | ok | ok |
9859// +----------------------------+------+------+
9860//
9861// This function ignores the addressing mode, so if an offset cannot be used in
9862// one addressing mode, it is considered illegal.
9863bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9864 uint64_t FlatVariant) const {
9865 // TODO: Should 0 be special cased?
9866 if (!ST.hasFlatInstOffsets())
9867 return false;
9868
9869 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9870 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9871 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9872 return false;
9873
9874 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9875 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9876 (Offset % 4) != 0) {
9877 return false;
9878 }
9879
9880 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9881 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9882 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9883}
9884
9885// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
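// An illustrative example, assuming a 13-bit signed immediate field (NumBits
// = 12 below): COffsetVal = 10000 splits into ImmField = 1808 and
// RemainderOffset = 8192; COffsetVal = -10000 splits into ImmField = -1808
// and RemainderOffset = -8192, because the signed division truncates toward 0.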
9886std::pair<int64_t, int64_t>
9887SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9888 uint64_t FlatVariant) const {
9889 int64_t RemainderOffset = COffsetVal;
9890 int64_t ImmField = 0;
9891
9892 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9893 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9894
9895 if (AllowNegative) {
9896 // Use signed division by a power of two to truncate towards 0.
9897 int64_t D = 1LL << NumBits;
9898 RemainderOffset = (COffsetVal / D) * D;
9899 ImmField = COffsetVal - RemainderOffset;
9900
9901 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9902 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9903 (ImmField % 4) != 0) {
9904 // Make ImmField a multiple of 4
9905 RemainderOffset += ImmField % 4;
9906 ImmField -= ImmField % 4;
9907 }
9908 } else if (COffsetVal >= 0) {
9909 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9910 RemainderOffset = COffsetVal - ImmField;
9911 }
9912
9913 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9914 assert(RemainderOffset + ImmField == COffsetVal);
9915 return {ImmField, RemainderOffset};
9916}
9917
9918bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
9919 if (ST.hasNegativeScratchOffsetBug() &&
9920 FlatVariant == SIInstrFlags::FlatScratch)
9921 return false;
9922
9923 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9924}
9925
9926static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9927 switch (ST.getGeneration()) {
9928 default:
9929 break;
9930 case AMDGPUSubtarget::SOUTHERN_ISLANDS:
9931 case AMDGPUSubtarget::SEA_ISLANDS:
9932 return SIEncodingFamily::SI;
9933 case AMDGPUSubtarget::VOLCANIC_ISLANDS:
9934 case AMDGPUSubtarget::GFX9:
9935 return SIEncodingFamily::VI;
9936 case AMDGPUSubtarget::GFX10:
9937 return SIEncodingFamily::GFX10;
9938 case AMDGPUSubtarget::GFX11:
9939 return SIEncodingFamily::GFX11;
9940 case AMDGPUSubtarget::GFX12:
9941 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
9942 : SIEncodingFamily::GFX12;
9943 }
9944 llvm_unreachable("Unknown subtarget generation!");
9945}
9946
9947bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9948 switch(MCOp) {
9949 // These opcodes use indirect register addressing so
9950 // they need special handling by codegen (currently missing).
9951 // Therefore it is too risky to allow these opcodes
9952 // to be selected by the DPP combiner or the SDWA peephole pass.
9953 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9954 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9955 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9956 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9957 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9958 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9959 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9960 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9961 return true;
9962 default:
9963 return false;
9964 }
9965}
9966
9967#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
9968 case OPCODE##_dpp: \
9969 case OPCODE##_e32: \
9970 case OPCODE##_e64: \
9971 case OPCODE##_e64_dpp: \
9972 case OPCODE##_sdwa:
9973
9974static bool isRenamedInGFX9(int Opcode) {
9975 switch (Opcode) {
9976 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
9977 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
9978 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
9979 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
9980 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
9981 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
9982 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
9983 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
9984 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
9985 //
9986 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
9987 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
9988 case AMDGPU::V_FMA_F16_gfx9_e64:
9989 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
9990 case AMDGPU::V_INTERP_P2_F16:
9991 case AMDGPU::V_MAD_F16_e64:
9992 case AMDGPU::V_MAD_U16_e64:
9993 case AMDGPU::V_MAD_I16_e64:
9994 return true;
9995 default:
9996 return false;
9997 }
9998}
9999
10000int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10001 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
10002
10003 unsigned Gen = subtargetEncodingFamily(ST);
10004
10005 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10006 Gen = SIEncodingFamily::GFX9;
10007
10008 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10009 // subtarget has UnpackedD16VMem feature.
10010 // TODO: remove this when we discard GFX80 encoding.
10011 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10012 Gen = SIEncodingFamily::GFX80;
10013
10014 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10015 switch (ST.getGeneration()) {
10016 default:
10017 Gen = SIEncodingFamily::SDWA;
10018 break;
10019 case AMDGPUSubtarget::GFX9:
10020 Gen = SIEncodingFamily::SDWA9;
10021 break;
10022 case AMDGPUSubtarget::GFX10:
10023 Gen = SIEncodingFamily::SDWA10;
10024 break;
10025 }
10026 }
10027
10028 if (isMAI(Opcode)) {
10029 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10030 if (MFMAOp != -1)
10031 Opcode = MFMAOp;
10032 }
10033
10034 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10035
10036 if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
10038
10039 // -1 means that Opcode is already a native instruction.
10040 if (MCOp == -1)
10041 return Opcode;
10042
10043 if (ST.hasGFX90AInsts()) {
10044 uint16_t NMCOp = (uint16_t)-1;
10045 if (ST.hasGFX940Insts())
10046 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
10047 if (NMCOp == (uint16_t)-1)
10048 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
10049 if (NMCOp == (uint16_t)-1)
10050 NMCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10051 if (NMCOp != (uint16_t)-1)
10052 MCOp = NMCOp;
10053 }
10054
10055 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
10056 // no encoding in the given subtarget generation.
10057 if (MCOp == (uint16_t)-1)
10058 return -1;
10059
10060 if (isAsmOnlyOpcode(MCOp))
10061 return -1;
10062
10063 return MCOp;
10064}
10065
10066static
10067TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
10068 assert(RegOpnd.isReg());
10069 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10070 getRegSubRegPair(RegOpnd);
10071}
10072
10073TargetInstrInfo::RegSubRegPair
10074llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
10075 assert(MI.isRegSequence());
10076 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10077 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10078 auto &RegOp = MI.getOperand(1 + 2 * I);
10079 return getRegOrUndef(RegOp);
10080 }
10081 return TargetInstrInfo::RegSubRegPair();
10082}
10083
10084// Try to find the definition of reg:subreg in subreg-manipulation pseudos.
10085// Following a subreg of reg:subreg isn't supported.
10086static bool followSubRegDef(MachineInstr &MI,
10087 TargetInstrInfo::RegSubRegPair &RSR) {
10088 if (!RSR.SubReg)
10089 return false;
10090 switch (MI.getOpcode()) {
10091 default: break;
10092 case AMDGPU::REG_SEQUENCE:
10093 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10094 return true;
10095 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg.
10096 case AMDGPU::INSERT_SUBREG:
10097 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10098 // inserted the subreg we're looking for
10099 RSR = getRegOrUndef(MI.getOperand(2));
10100 else { // the subreg in the rest of the reg
10101 auto R1 = getRegOrUndef(MI.getOperand(1));
10102 if (R1.SubReg) // subreg of subreg isn't supported
10103 return false;
10104 RSR.Reg = R1.Reg;
10105 }
10106 return true;
10107 }
10108 return false;
10109}
10110
10111MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
10112 MachineRegisterInfo &MRI) {
10113 assert(MRI.isSSA());
10114 if (!P.Reg.isVirtual())
10115 return nullptr;
10116
10117 auto RSR = P;
10118 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10119 while (auto *MI = DefInst) {
10120 DefInst = nullptr;
10121 switch (MI->getOpcode()) {
10122 case AMDGPU::COPY:
10123 case AMDGPU::V_MOV_B32_e32: {
10124 auto &Op1 = MI->getOperand(1);
10125 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10126 if (Op1.isUndef())
10127 return nullptr;
10128 RSR = getRegSubRegPair(Op1);
10129 DefInst = MRI.getVRegDef(RSR.Reg);
10130 }
10131 break;
10132 }
10133 default:
10134 if (followSubRegDef(*MI, RSR)) {
10135 if (!RSR.Reg)
10136 return nullptr;
10137 DefInst = MRI.getVRegDef(RSR.Reg);
10138 }
10139 }
10140 if (!DefInst)
10141 return MI;
10142 }
10143 return nullptr;
10144}
10145
10146bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
10147 Register VReg,
10148 const MachineInstr &DefMI,
10149 const MachineInstr &UseMI) {
10150 assert(MRI.isSSA() && "Must be run on SSA");
10151
10152 auto *TRI = MRI.getTargetRegisterInfo();
10153 auto *DefBB = DefMI.getParent();
10154
10155 // Don't bother searching between blocks, although it is possible this block
10156 // doesn't modify exec.
10157 if (UseMI.getParent() != DefBB)
10158 return true;
10159
10160 const int MaxInstScan = 20;
10161 int NumInst = 0;
10162
10163 // Stop scan at the use.
10164 auto E = UseMI.getIterator();
10165 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10166 if (I->isDebugInstr())
10167 continue;
10168
10169 if (++NumInst > MaxInstScan)
10170 return true;
10171
10172 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10173 return true;
10174 }
10175
10176 return false;
10177}
10178
10179bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
10180 Register VReg,
10181 const MachineInstr &DefMI) {
10182 assert(MRI.isSSA() && "Must be run on SSA");
10183
10184 auto *TRI = MRI.getTargetRegisterInfo();
10185 auto *DefBB = DefMI.getParent();
10186
10187 const int MaxUseScan = 10;
10188 int NumUse = 0;
10189
10190 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10191 auto &UseInst = *Use.getParent();
10192 // Don't bother searching between blocks, although it is possible this block
10193 // doesn't modify exec.
10194 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10195 return true;
10196
10197 if (++NumUse > MaxUseScan)
10198 return true;
10199 }
10200
10201 if (NumUse == 0)
10202 return false;
10203
10204 const int MaxInstScan = 20;
10205 int NumInst = 0;
10206
10207 // Stop scan when we have seen all the uses.
10208 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10209 assert(I != DefBB->end());
10210
10211 if (I->isDebugInstr())
10212 continue;
10213
10214 if (++NumInst > MaxInstScan)
10215 return true;
10216
10217 for (const MachineOperand &Op : I->operands()) {
10218 // We don't check reg masks here as they're used only on calls:
10219 // 1. EXEC is only considered const within one BB
10220 // 2. Call should be a terminator instruction if present in a BB
10221
10222 if (!Op.isReg())
10223 continue;
10224
10225 Register Reg = Op.getReg();
10226 if (Op.isUse()) {
10227 if (Reg == VReg && --NumUse == 0)
10228 return false;
10229 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10230 return true;
10231 }
10232 }
10233}
10234
10235MachineInstr *SIInstrInfo::createPHIDestinationCopy(
10236 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
10237 const DebugLoc &DL, Register Src, Register Dst) const {
10238 auto Cur = MBB.begin();
10239 if (Cur != MBB.end())
10240 do {
10241 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10242 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10243 ++Cur;
10244 } while (Cur != MBB.end() && Cur != LastPHIIt);
10245
10246 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10247 Dst);
10248}
10249
10250MachineInstr *SIInstrInfo::createPHISourceCopy(
10251 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
10252 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10253 if (InsPt != MBB.end() &&
10254 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10255 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10256 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10257 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10258 InsPt++;
10259 return BuildMI(MBB, InsPt, DL,
10260 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10261 .addReg(Src, 0, SrcSubReg)
10262 .addReg(AMDGPU::EXEC, RegState::Implicit);
10263 }
10264 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10265 Dst);
10266}
10267
10268bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10269
10270MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
10271 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
10272 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10273 VirtRegMap *VRM) const {
10274 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10275 //
10276 // %0:sreg_32 = COPY $m0
10277 //
10278 // We explicitly chose SReg_32 for the virtual register so such a copy might
10279 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10280 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10281 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10282 // TargetInstrInfo::foldMemoryOperand() is going to try.
10283 // A similar issue also exists with spilling and reloading $exec registers.
10284 //
10285 // To prevent that, constrain the %0 register class here.
10286 if (isFullCopyInstr(MI)) {
10287 Register DstReg = MI.getOperand(0).getReg();
10288 Register SrcReg = MI.getOperand(1).getReg();
10289 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10290 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10291 MachineRegisterInfo &MRI = MF.getRegInfo();
10292 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10293 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10294 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10295 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10296 return nullptr;
10297 }
10298 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10299 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10300 return nullptr;
10301 }
10302 }
10303 }
10304
10305 return nullptr;
10306}
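// Net effect (an illustrative note, not in the upstream source): for
//   %0:sreg_32 = COPY $m0
// the constraint moves %0 into SReg_32_XM0_XEXEC, so %0 can no longer be
// allocated to $m0 or $exec and the problematic fold of the copy into a spill
// is never attempted; returning nullptr leaves the copy itself untouched.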
10307
10308unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
10309 const MachineInstr &MI,
10310 unsigned *PredCost) const {
10311 if (MI.isBundle()) {
10312 MachineBasicBlock::const_instr_iterator I(MI.getIterator());
10313 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10314 unsigned Lat = 0, Count = 0;
10315 for (++I; I != E && I->isBundledWithPred(); ++I) {
10316 ++Count;
10317 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10318 }
10319 return Lat + Count - 1;
10320 }
10321
10322 return SchedModel.computeInstrLatency(&MI);
10323}
10324
10325InstructionUniformity
10326SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
10327 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10328 unsigned Opcode = MI.getOpcode();
10329
10330 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10331 Register Dst = MI.getOperand(0).getReg();
10332 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10333 : MI.getOperand(1).getReg();
10334 LLT DstTy = MRI.getType(Dst);
10335 LLT SrcTy = MRI.getType(Src);
10336 unsigned DstAS = DstTy.getAddressSpace();
10337 unsigned SrcAS = SrcTy.getAddressSpace();
10338 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10339 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10340 ST.hasGloballyAddressableScratch()
10341 ? InstructionUniformity::NeverUniform
10342 : InstructionUniformity::Default;
10343 };
10344
10345 // If the target supports globally addressable scratch, the mapping from
10346 // scratch memory to the flat aperture changes, so an address space cast
10347 // is no longer uniform.
10348 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10349 return HandleAddrSpaceCast(MI);
10350
10351 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10352 auto IID = GI->getIntrinsicID();
10353 if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
10354 return InstructionUniformity::NeverUniform;
10355 if (AMDGPU::isIntrinsicAlwaysUniform(IID))
10356 return InstructionUniformity::AlwaysUniform;
10357
10358 switch (IID) {
10359 case Intrinsic::amdgcn_addrspacecast_nonnull:
10360 return HandleAddrSpaceCast(MI);
10361 case Intrinsic::amdgcn_if:
10362 case Intrinsic::amdgcn_else:
10363 // FIXME: Uniform if second result
10364 break;
10365 }
10366
10367 return InstructionUniformity::Default;
10368 }
10369
10370 // Loads from the private and flat address spaces are divergent, because
10371 // threads can execute the load instruction with the same inputs and get
10372 // different results.
10373 //
10374 // All other loads are not divergent, because if threads issue loads with the
10375 // same arguments, they will always get the same result.
10376 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10377 Opcode == AMDGPU::G_SEXTLOAD) {
10378 if (MI.memoperands_empty())
10379 return InstructionUniformity::NeverUniform; // conservative assumption
10380
10381 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10382 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10383 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10384 })) {
10385 // At least one MMO in a non-global address space.
10386 return InstructionUniformity::NeverUniform;
10387 }
10388 return InstructionUniformity::Default;
10389 }
10390
10391 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10392 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10393 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10394 AMDGPU::isGenericAtomic(Opcode)) {
10395 return InstructionUniformity::NeverUniform;
10396 }
10397 return InstructionUniformity::Default;
10398}
10399
10400InstructionUniformity
10401SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
10402
10403 if (isNeverUniform(MI))
10404 return InstructionUniformity::NeverUniform;
10405
10406 unsigned opcode = MI.getOpcode();
10407 if (opcode == AMDGPU::V_READLANE_B32 ||
10408 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10409 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10410 return InstructionUniformity::AlwaysUniform;
10411
10412 if (isCopyInstr(MI)) {
10413 const MachineOperand &srcOp = MI.getOperand(1);
10414 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10415 const TargetRegisterClass *regClass =
10416 RI.getPhysRegBaseClass(srcOp.getReg());
10417 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10418 : InstructionUniformity::NeverUniform;
10419 }
10420 return InstructionUniformity::Default;
10421 }
10422
10423 // GMIR handling
10424 if (MI.isPreISelOpcode())
10425 return getGenericInstructionUniformity(MI);
10426
10427 // Atomics are divergent because they are executed sequentially: when an
10428 // atomic operation refers to the same address in each thread, each thread
10429 // after the first sees the value written by the previous thread as its
10430 // original value.
10431
10432 if (isAtomic(MI))
10433 return InstructionUniformity::NeverUniform;
10434
10435 // Loads from the private and flat address spaces are divergent, because
10436 // threads can execute the load instruction with the same inputs and get
10437 // different results.
10438 if (isFLAT(MI) && MI.mayLoad()) {
10439 if (MI.memoperands_empty())
10440 return InstructionUniformity::NeverUniform; // conservative assumption
10441
10442 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10443 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10444 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10445 })) {
10446 // At least one MMO in a non-global address space.
10447 return InstructionUniformity::NeverUniform;
10448 }
10449
10450 return InstructionUniformity::Default;
10451 }
10452
10453 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
10454 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10455
10456 // FIXME: It's conceptually broken to report this for an instruction, and not
10457 // a specific def operand. For inline asm in particular, there could be mixed
10458 // uniform and divergent results.
10459 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10460 const MachineOperand &SrcOp = MI.getOperand(I);
10461 if (!SrcOp.isReg())
10462 continue;
10463
10464 Register Reg = SrcOp.getReg();
10465 if (!Reg || !SrcOp.readsReg())
10466 continue;
10467
10468 // If RegBank is null, this is unassigned or an unallocatable special
10469 // register, which are all scalars.
10470 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10471 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10472 return InstructionUniformity::NeverUniform;
10473 }
10474
10475 // TODO: Uniformity check conditions above can be rearranged for more
10476 // readability
10477
10478 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10479 // currently turned into no-op COPYs by SelectionDAG ISel and are
10480 // therefore no longer recognizable.
10481
10482 return InstructionUniformity::Default;
10483}
10484
10485unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
10486 switch (MF.getFunction().getCallingConv()) {
10487 case CallingConv::AMDGPU_PS:
10488 return 1;
10489 case CallingConv::AMDGPU_VS:
10490 return 2;
10491 case CallingConv::AMDGPU_GS:
10492 return 3;
10493 case CallingConv::AMDGPU_HS:
10494 case CallingConv::AMDGPU_LS:
10495 case CallingConv::AMDGPU_ES: {
10496 const Function &F = MF.getFunction();
10497 F.getContext().diagnose(DiagnosticInfoUnsupported(
10498 F, "ds_ordered_count unsupported for this calling conv"));
10499 [[fallthrough]];
10500 }
10501 case CallingConv::AMDGPU_CS:
10502 case CallingConv::AMDGPU_KERNEL:
10503 case CallingConv::C:
10504 case CallingConv::Fast:
10505 default:
10506 // Assume other calling conventions are various compute callable functions
10507 return 0;
10508 }
10509}
10510
10511bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
10512 Register &SrcReg2, int64_t &CmpMask,
10513 int64_t &CmpValue) const {
10514 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10515 return false;
10516
10517 switch (MI.getOpcode()) {
10518 default:
10519 break;
10520 case AMDGPU::S_CMP_EQ_U32:
10521 case AMDGPU::S_CMP_EQ_I32:
10522 case AMDGPU::S_CMP_LG_U32:
10523 case AMDGPU::S_CMP_LG_I32:
10524 case AMDGPU::S_CMP_LT_U32:
10525 case AMDGPU::S_CMP_LT_I32:
10526 case AMDGPU::S_CMP_GT_U32:
10527 case AMDGPU::S_CMP_GT_I32:
10528 case AMDGPU::S_CMP_LE_U32:
10529 case AMDGPU::S_CMP_LE_I32:
10530 case AMDGPU::S_CMP_GE_U32:
10531 case AMDGPU::S_CMP_GE_I32:
10532 case AMDGPU::S_CMP_EQ_U64:
10533 case AMDGPU::S_CMP_LG_U64:
10534 SrcReg = MI.getOperand(0).getReg();
10535 if (MI.getOperand(1).isReg()) {
10536 if (MI.getOperand(1).getSubReg())
10537 return false;
10538 SrcReg2 = MI.getOperand(1).getReg();
10539 CmpValue = 0;
10540 } else if (MI.getOperand(1).isImm()) {
10541 SrcReg2 = Register();
10542 CmpValue = MI.getOperand(1).getImm();
10543 } else {
10544 return false;
10545 }
10546 CmpMask = ~0;
10547 return true;
10548 case AMDGPU::S_CMPK_EQ_U32:
10549 case AMDGPU::S_CMPK_EQ_I32:
10550 case AMDGPU::S_CMPK_LG_U32:
10551 case AMDGPU::S_CMPK_LG_I32:
10552 case AMDGPU::S_CMPK_LT_U32:
10553 case AMDGPU::S_CMPK_LT_I32:
10554 case AMDGPU::S_CMPK_GT_U32:
10555 case AMDGPU::S_CMPK_GT_I32:
10556 case AMDGPU::S_CMPK_LE_U32:
10557 case AMDGPU::S_CMPK_LE_I32:
10558 case AMDGPU::S_CMPK_GE_U32:
10559 case AMDGPU::S_CMPK_GE_I32:
10560 SrcReg = MI.getOperand(0).getReg();
10561 SrcReg2 = Register();
10562 CmpValue = MI.getOperand(1).getImm();
10563 CmpMask = ~0;
10564 return true;
10565 }
10566
10567 return false;
10568}
10569
10571 Register SrcReg2, int64_t CmpMask,
10572 int64_t CmpValue,
10573 const MachineRegisterInfo *MRI) const {
10574 if (!SrcReg || SrcReg.isPhysical())
10575 return false;
10576
10577 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
10578 return false;
10579
10580 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
10581 this](int64_t ExpectedValue, unsigned SrcSize,
10582 bool IsReversible, bool IsSigned) -> bool {
10583 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10584 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10585 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10586 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10587 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
10588 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10589 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10590 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10591 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10592 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
10593 //
10594 // Signed ge/gt are not used for the sign bit.
10595 //
10596 // If result of the AND is unused except in the compare:
10597 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
10598 //
10599 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10600 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10601 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
10602 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10603 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10604 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
10605
10606 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
10607 if (!Def || Def->getParent() != CmpInstr.getParent())
10608 return false;
10609
10610 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
10611 Def->getOpcode() != AMDGPU::S_AND_B64)
10612 return false;
10613
10614 int64_t Mask;
10615 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
10616 if (MO->isImm())
10617 Mask = MO->getImm();
10618 else if (!getFoldableImm(MO, Mask))
10619 return false;
10620 Mask &= maxUIntN(SrcSize);
10621 return isPowerOf2_64(Mask);
10622 };
10623
10624 MachineOperand *SrcOp = &Def->getOperand(1);
10625 if (isMask(SrcOp))
10626 SrcOp = &Def->getOperand(2);
10627 else if (isMask(&Def->getOperand(2)))
10628 SrcOp = &Def->getOperand(1);
10629 else
10630 return false;
10631
10632 // A valid Mask is required to have a single bit set, hence a non-zero and
10633 // power-of-two value. This verifies that we will not do 64-bit shift below.
10634 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
10635 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
10636 if (IsSigned && BitNo == SrcSize - 1)
10637 return false;
10638
10639 ExpectedValue <<= BitNo;
10640
10641 bool IsReversedCC = false;
10642 if (CmpValue != ExpectedValue) {
10643 if (!IsReversible)
10644 return false;
10645 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
10646 if (!IsReversedCC)
10647 return false;
10648 }
10649
10650 Register DefReg = Def->getOperand(0).getReg();
10651 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
10652 return false;
10653
10654 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
10655 I != E; ++I) {
10656 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
10657 I->killsRegister(AMDGPU::SCC, &RI))
10658 return false;
10659 }
10660
10661 MachineOperand *SccDef =
10662 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
10663 SccDef->setIsDead(false);
10664 CmpInstr.eraseFromParent();
10665
10666 if (!MRI->use_nodbg_empty(DefReg)) {
10667 assert(!IsReversedCC);
10668 return true;
10669 }
10670
10671 // Replace AND with unused result with a S_BITCMP.
10672 MachineBasicBlock *MBB = Def->getParent();
10673
10674 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
10675 : AMDGPU::S_BITCMP1_B32
10676 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
10677 : AMDGPU::S_BITCMP1_B64;
10678
10679 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
10680 .add(*SrcOp)
10681 .addImm(BitNo);
10682 Def->eraseFromParent();
10683
10684 return true;
10685 };
10686
10687 switch (CmpInstr.getOpcode()) {
10688 default:
10689 break;
10690 case AMDGPU::S_CMP_EQ_U32:
10691 case AMDGPU::S_CMP_EQ_I32:
10692 case AMDGPU::S_CMPK_EQ_U32:
10693 case AMDGPU::S_CMPK_EQ_I32:
10694 return optimizeCmpAnd(1, 32, true, false);
10695 case AMDGPU::S_CMP_GE_U32:
10696 case AMDGPU::S_CMPK_GE_U32:
10697 return optimizeCmpAnd(1, 32, false, false);
10698 case AMDGPU::S_CMP_GE_I32:
10699 case AMDGPU::S_CMPK_GE_I32:
10700 return optimizeCmpAnd(1, 32, false, true);
10701 case AMDGPU::S_CMP_EQ_U64:
10702 return optimizeCmpAnd(1, 64, true, false);
10703 case AMDGPU::S_CMP_LG_U32:
10704 case AMDGPU::S_CMP_LG_I32:
10705 case AMDGPU::S_CMPK_LG_U32:
10706 case AMDGPU::S_CMPK_LG_I32:
10707 return optimizeCmpAnd(0, 32, true, false);
10708 case AMDGPU::S_CMP_GT_U32:
10709 case AMDGPU::S_CMPK_GT_U32:
10710 return optimizeCmpAnd(0, 32, false, false);
10711 case AMDGPU::S_CMP_GT_I32:
10712 case AMDGPU::S_CMPK_GT_I32:
10713 return optimizeCmpAnd(0, 32, false, true);
10714 case AMDGPU::S_CMP_LG_U64:
10715 return optimizeCmpAnd(0, 64, true, false);
10716 }
10717
10718 return false;
10719}
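// Worked example (a sketch based on the folds listed above): with SrcSize = 32
// and a mask of 1 << 4, the pair
//   %d = S_AND_B32 %src, 16
//   S_CMP_LG_U32 %d, 0
// is reduced to the S_AND_B32 alone, with its SCC def marked live; if %d has
// no other uses, the AND is further rewritten to S_BITCMP1_B32 %src, 4.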
10720
10721void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
10722 AMDGPU::OpName OpName) const {
10723 if (!ST.needsAlignedVGPRs())
10724 return;
10725
10726 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
10727 if (OpNo < 0)
10728 return;
10729 MachineOperand &Op = MI.getOperand(OpNo);
10730 if (getOpSize(MI, OpNo) > 4)
10731 return;
10732
10733 // Add implicit aligned super-reg to force alignment on the data operand.
10734 const DebugLoc &DL = MI.getDebugLoc();
10735 MachineBasicBlock *BB = MI.getParent();
10736 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
10737 Register DataReg = Op.getReg();
10738 bool IsAGPR = RI.isAGPR(MRI, DataReg);
10739 Register Undef = MRI.createVirtualRegister(
10740 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
10741 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
10742 Register NewVR =
10743 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
10744 : &AMDGPU::VReg_64_Align2RegClass);
10745 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
10746 .addReg(DataReg, 0, Op.getSubReg())
10747 .addImm(AMDGPU::sub0)
10748 .addReg(Undef)
10749 .addImm(AMDGPU::sub1);
10750 Op.setReg(NewVR);
10751 Op.setSubReg(AMDGPU::sub0);
10752 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
10753}
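// Shape of the rewrite (illustrative, with hypothetical vregs): a 32-bit VGPR
// data operand %v of an instruction that requires even-aligned registers is
// wrapped as
//   %u:vgpr_32 = IMPLICIT_DEF
//   %p:vreg_64_align2 = REG_SEQUENCE %v, %subreg.sub0, %u, %subreg.sub1
// and the operand is rewritten to %p.sub0, with %p also added as an implicit
// use so the aligned pair stays live across the instruction.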
10754
10755bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
10756 if (isIGLP(*MI))
10757 return false;
10758
10759 return TargetInstrInfo::isGlobalMemoryObject(MI);
10760}
10761
10762bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
10763 if (!isWMMA(MI) && !isSWMMAC(MI))
10764 return false;
10765
10766 if (AMDGPU::isGFX1250(ST))
10767 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
10768
10769 return true;
10770}
10771
10772bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
10773 unsigned Opcode = MI.getOpcode();
10774
10775 if (AMDGPU::isGFX12Plus(ST))
10776 return isDOT(MI) || isXDLWMMA(MI);
10777
10778 if (!isMAI(MI) || isDGEMM(Opcode) ||
10779 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
10780 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
10781 return false;
10782
10783 if (!ST.hasGFX940Insts())
10784 return true;
10785
10786 return AMDGPU::getMAIIsGFX940XDL(Opcode);
10787}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
#define LLVM_DEBUG(...)
Definition Debug.h:114
static const LaneMaskConstants & get(const GCNSubtarget &ST)
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1562
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition ArrayRef.h:150
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasAddNoCarry() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:87
uint8_t OperandType
Information about the type of the operand.
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:96
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:102
constexpr bool isValid() const
Definition Register.h:107
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:74
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given.
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instructions opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
bool mayAccessScratchThroughFlat(const MachineInstr &MI) const
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of a s_trap 2 instructions for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI) const override
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is a instruction that moves/copies value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
static bool usesLGKM_CNT(const MachineInstr &MI)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst to fix 16bit SALU to VALU lowering.
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
static bool isWWMRegSpillOpcode(uint16_t Opcode)
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
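A minimal sketch of the usual lowering-time query, assuming TII points at the SIInstrInfo for the current subtarget and that -1 signals the absence of a real encoding (an assumption based on common usage of this hook, not quoted from this page); reportUnsupportedPseudo is hypothetical:
// Map a pseudo opcode to the MC opcode for the selected subtarget.
int MCOp = TII->pseudoToMCOpcode(MI.getOpcode());
if (MCOp == -1)
  reportUnsupportedPseudo(MI); // hypothetical error path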
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
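A hedged sketch of typical use, assuming TII is a const SIInstrInfo pointer, MI is a MachineInstr, and handleImmediate is a hypothetical helper:
// Look up the src0 operand by name; the lookup yields nullptr when the
// instruction has no operand with that name.
if (const MachineOperand *Src0 =
        TII->getNamedOperand(MI, AMDGPU::OpName::src0))
  if (Src0->isImm())
    handleImmediate(Src0->getImm()); // hypothetical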
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were operand OpIdx of MI.
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
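As a hedged illustration (the specific constants are examples, not quoted from this page): 1.0f, whose bit pattern is 0x3F800000, is one of the hardware inline constants, while the plain integer 100 falls outside the -16..64 inline range:
#include "Utils/AMDGPUBaseInfo.h"
bool exampleInlineChecks() {
  bool FloatOne = llvm::AMDGPU::isInlinableLiteral32(0x3F800000,
                                                     /*HasInv2Pi=*/true);
  bool Hundred = llvm::AMDGPU::isInlinableLiteral32(100, /*HasInv2Pi=*/true);
  return FloatOne && !Hundred; // expected: true
}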
const uint64_t RSRC_TID_ENABLE
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU-specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating-point constants.
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:231
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:202
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:209
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:219
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:224
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:206
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:201
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:218
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:216
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:205
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:236
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:237
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:212
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:248
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:204
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:217
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:223
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:242
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:213
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:238
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:220
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:203
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:228
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:569
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:571
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:568
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:570
@ TI_CONSTDATA_START
Definition AMDGPU.h:567
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool isGFX1250(const MCSubtargetInfo &STI)
int getMCOpcode(uint16_t Opcode, unsigned Gen)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:72
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:70
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:71
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:62
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:73
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:477
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1727
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:216
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
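Most expansion code uses the MachineBasicBlock-based overloads rather than the MachineFunction-based one listed above; a hedged sketch where MBB, I, DL, TII and DestReg stand in for values available at a typical call site:
// Materialize an immediate zero into DestReg at insertion point I.
BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), DestReg)
    .addImm(0);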
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2474
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
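The early-increment adaptor is what makes erase-while-iterating loops safe; a small hedged sketch over a MachineBasicBlock named MBB:
#include "llvm/ADT/STLExtras.h"
// Delete every DBG_VALUE in the block without invalidating the iteration.
for (llvm::MachineInstr &MI : llvm::make_early_inc_range(MBB))
  if (MI.isDebugValue())
    MI.eraseFromParent();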
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:557
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
Op::Description Desc
int countr_zero(T Val)
Count the number of 0s from the least significant bit upward, stopping at the first 1.
Definition bit.h:186
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1734
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
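A brief illustration of the floor semantics (values chosen only for the example; the zero-input case is the documented wrap to -1, so callers check for zero first):
#include "llvm/Support/MathExtras.h"
unsigned exampleFloorLog2() {
  // Both 64 and 65 report 6 because the result is floored.
  return llvm::Log2_32(64) + llvm::Log2_32(65); // 6 + 6 == 12
}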
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
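A self-contained sketch of the rounding-up behaviour, with bit counts picked purely for illustration:
#include "llvm/Support/MathExtras.h"
// A 96-bit value needs three 32-bit registers; exact multiples are unchanged.
static_assert(llvm::divideCeil(96, 32) == 3, "ceil(96/32) == 3");
static_assert(llvm::divideCeil(64, 32) == 2, "ceil(64/32) == 2");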
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:257
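Both the static (isInt&lt;N&gt;) and dynamic (isIntN) forms answer the same question; a small hedged sketch with a 16-bit field chosen only as an example:
#include "llvm/Support/MathExtras.h"
static_assert(llvm::isInt<16>(32767), "largest positive 16-bit value fits");
static_assert(!llvm::isInt<16>(32768), "one past the range does not");
// The dynamic form takes the width as a runtime argument.
inline bool fitsIn(unsigned Bits, int64_t V) { return llvm::isIntN(Bits, V); }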
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:127
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1899
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:583
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:86
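SignExtend64 and maskTrailingOnes (the two entries just above) are the usual building blocks for decoding immediate fields; a small self-contained sketch with field widths chosen for illustration:
#include "llvm/Support/MathExtras.h"
#include <cstdint>
// The low 12 bits set, i.e. 0xFFF.
static_assert(llvm::maskTrailingOnes<uint32_t>(12) == 0xFFFu, "12-bit mask");
// Reinterpret an all-ones 13-bit field as a signed value: it is -1.
static_assert(llvm::SignExtend64<13>(0x1FFF) == -1, "sign-extended field");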
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
GenericCycleInfo< MachineSSAContext > MachineCycleInfo
MachineCycleInfo::CycleT MachineCycle
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:219
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:56
MachineInstr * top() const
Definition SIInstrInfo.h:61
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:80
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.