1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
33#include "llvm/IR/IntrinsicsAMDGPU.h"
34#include "llvm/MC/MCContext.h"
37
38using namespace llvm;
39
40#define DEBUG_TYPE "si-instr-info"
41
42#define GET_INSTRINFO_CTOR_DTOR
43#include "AMDGPUGenInstrInfo.inc"
44
45namespace llvm::AMDGPU {
46#define GET_D16ImageDimIntrinsics_IMPL
47#define GET_ImageDimIntrinsicTable_IMPL
48#define GET_RsrcIntrinsics_IMPL
49#include "AMDGPUGenSearchableTables.inc"
50} // namespace llvm::AMDGPU
51
52// Must be at least 4 to be able to branch over minimum unconditional branch
53// code. This is only for making it possible to write reasonably small tests for
54// long branches.
55static cl::opt<unsigned>
56BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
57 cl::desc("Restrict range of branch instructions (DEBUG)"));
58
60 "amdgpu-fix-16-bit-physreg-copies",
61 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
62 cl::init(true),
64
65SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
66 : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
67 RI(ST), ST(ST) {
68 SchedModel.init(&ST);
69}
70
71//===----------------------------------------------------------------------===//
72// TargetInstrInfo callbacks
73//===----------------------------------------------------------------------===//
74
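// Count the operands of \p Node, ignoring any trailing glue operands.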
75static unsigned getNumOperandsNoGlue(SDNode *Node) {
76 unsigned N = Node->getNumOperands();
77 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
78 --N;
79 return N;
80}
81
82/// Returns true if both nodes have the same value for the given
83/// operand \p Op, or if both nodes do not have this operand.
84static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
85 AMDGPU::OpName OpName) {
86 unsigned Opc0 = N0->getMachineOpcode();
87 unsigned Opc1 = N1->getMachineOpcode();
88
89 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
90 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
91
92 if (Op0Idx == -1 && Op1Idx == -1)
93 return true;
94
95
96 if ((Op0Idx == -1 && Op1Idx != -1) ||
97 (Op1Idx == -1 && Op0Idx != -1))
98 return false;
99
100 // getNamedOperandIdx returns the index for the MachineInstr's operands,
101 // which includes the result as the first operand. We are indexing into the
102 // MachineSDNode's operands, so we need to skip the result operand to get
103 // the real index.
104 --Op0Idx;
105 --Op1Idx;
106
107 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
108}
109
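// Opcode-based check for rematerialization candidates: SMRD loads qualify only
// when every memory operand is an invariant load.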
110static bool canRemat(const MachineInstr &MI) {
111
112 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
113 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
114 SIInstrInfo::isSALU(MI))
115 return true;
116
117 if (SIInstrInfo::isSMRD(MI)) {
118 return !MI.memoperands_empty() &&
119 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
120 return MMO->isLoad() && MMO->isInvariant();
121 });
122 }
123
124 return false;
125}
126
127bool SIInstrInfo::isReallyTriviallyReMaterializable(
128 const MachineInstr &MI) const {
129
130 if (canRemat(MI)) {
131 // Normally VALU use of exec would block the rematerialization, but that
132 // is OK in this case to have an implicit exec read as all VALU do.
133 // We really want all of the generic logic for this except for this.
134
135 // Another potential implicit use is mode register. The core logic of
136 // the RA will not attempt rematerialization if mode is set anywhere
137 // in the function, otherwise it is safe since mode is not changed.
138
139 // There is difference to generic method which does not allow
140 // rematerialization if there are virtual register uses. We allow this,
141 // therefore this method includes SOP instructions as well.
142 if (!MI.hasImplicitDef() &&
143 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
144 !MI.mayRaiseFPException())
145 return true;
146 }
147
148 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
149}
150
151// Returns true if the scalar result of a VALU instruction depends on exec.
152bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
153 // Ignore comparisons which are only used masked with exec.
154 // This allows some hoisting/sinking of VALU comparisons.
155 if (MI.isCompare()) {
156 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
157 if (!Dst)
158 return true;
159
160 Register DstReg = Dst->getReg();
161 if (!DstReg.isVirtual())
162 return true;
163
164 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
165 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
166 switch (Use.getOpcode()) {
167 case AMDGPU::S_AND_SAVEEXEC_B32:
168 case AMDGPU::S_AND_SAVEEXEC_B64:
169 break;
170 case AMDGPU::S_AND_B32:
171 case AMDGPU::S_AND_B64:
172 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
173 return true;
174 break;
175 default:
176 return true;
177 }
178 }
179 return false;
180 }
181
182 switch (MI.getOpcode()) {
183 default:
184 break;
185 case AMDGPU::V_READFIRSTLANE_B32:
186 return true;
187 }
188
189 return false;
190}
191
192bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
193 // Any implicit use of exec by VALU is not a real register read.
194 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
195 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
196}
197
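// Conservatively decide whether \p MI may be sunk into \p SuccToSinkTo:
// reject sinking out of a cycle with a divergent exit when the instruction
// uses an SGPR defined inside that cycle (temporal divergence).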
198bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
199 MachineBasicBlock *SuccToSinkTo,
200 MachineCycleInfo *CI) const {
201 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
202 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
203 return true;
204
205 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
206 // Check if sinking of MI would create temporal divergent use.
207 for (auto Op : MI.uses()) {
208 if (Op.isReg() && Op.getReg().isVirtual() &&
209 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
210 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
211
212 // SgprDef defined inside cycle
213 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
214 if (FromCycle == nullptr)
215 continue;
216
217 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
218 // Check if there is a FromCycle that contains SgprDef's basic block but
219 // does not contain SuccToSinkTo and also has divergent exit condition.
220 while (FromCycle && !FromCycle->contains(ToCycle)) {
221 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
222 FromCycle->getExitingBlocks(ExitingBlocks);
223
224 // FromCycle has divergent exit condition.
225 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
226 if (hasDivergentBranch(ExitingBlock))
227 return false;
228 }
229
230 FromCycle = FromCycle->getParentCycle();
231 }
232 }
233 }
234
235 return true;
236}
237
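// Compare two load SDNodes (DS, SMRD, or MUBUF/MTBUF) and report their
// immediate offsets when they address the same base.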
238bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
239 int64_t &Offset0,
240 int64_t &Offset1) const {
241 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
242 return false;
243
244 unsigned Opc0 = Load0->getMachineOpcode();
245 unsigned Opc1 = Load1->getMachineOpcode();
246
247 // Make sure both are actually loads.
248 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
249 return false;
250
251 // A mayLoad instruction without a def is not a load. Likely a prefetch.
252 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
253 return false;
254
255 if (isDS(Opc0) && isDS(Opc1)) {
256
257 // FIXME: Handle this case:
258 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
259 return false;
260
261 // Check base reg.
262 if (Load0->getOperand(0) != Load1->getOperand(0))
263 return false;
264
265 // Skip read2 / write2 variants for simplicity.
266 // TODO: We should report true if the used offsets are adjacent (excluded
267 // st64 versions).
268 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
269 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
270 if (Offset0Idx == -1 || Offset1Idx == -1)
271 return false;
272
273 // XXX - be careful of dataless loads
274 // getNamedOperandIdx returns the index for MachineInstrs. Since they
275 // include the output in the operand list, but SDNodes don't, we need to
276 // subtract the index by one.
277 Offset0Idx -= get(Opc0).NumDefs;
278 Offset1Idx -= get(Opc1).NumDefs;
279 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
280 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
281 return true;
282 }
283
284 if (isSMRD(Opc0) && isSMRD(Opc1)) {
285 // Skip time and cache invalidation instructions.
286 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
287 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
288 return false;
289
290 unsigned NumOps = getNumOperandsNoGlue(Load0);
291 if (NumOps != getNumOperandsNoGlue(Load1))
292 return false;
293
294 // Check base reg.
295 if (Load0->getOperand(0) != Load1->getOperand(0))
296 return false;
297
298 // Match register offsets, if both register and immediate offsets present.
299 assert(NumOps == 4 || NumOps == 5);
300 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
301 return false;
302
303 const ConstantSDNode *Load0Offset =
304 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
305 const ConstantSDNode *Load1Offset =
306 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
307
308 if (!Load0Offset || !Load1Offset)
309 return false;
310
311 Offset0 = Load0Offset->getZExtValue();
312 Offset1 = Load1Offset->getZExtValue();
313 return true;
314 }
315
316 // MUBUF and MTBUF can access the same addresses.
317 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
318
319 // MUBUF and MTBUF have vaddr at different indices.
320 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
321 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
322 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
323 return false;
324
325 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
326 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
327
328 if (OffIdx0 == -1 || OffIdx1 == -1)
329 return false;
330
331 // getNamedOperandIdx returns the index for MachineInstrs. Since they
332 // include the output in the operand list, but SDNodes don't, we need to
333 // subtract the index by one.
334 OffIdx0 -= get(Opc0).NumDefs;
335 OffIdx1 -= get(Opc1).NumDefs;
336
337 SDValue Off0 = Load0->getOperand(OffIdx0);
338 SDValue Off1 = Load1->getOperand(OffIdx1);
339
340 // The offset might be a FrameIndexSDNode.
341 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
342 return false;
343
344 Offset0 = Off0->getAsZExtVal();
345 Offset1 = Off1->getAsZExtVal();
346 return true;
347 }
348
349 return false;
350}
351
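// The DS read2st64/write2st64 opcodes address their two elements with a
// stride of 64, so their offsets must be scaled accordingly.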
352static bool isStride64(unsigned Opc) {
353 switch (Opc) {
354 case AMDGPU::DS_READ2ST64_B32:
355 case AMDGPU::DS_READ2ST64_B64:
356 case AMDGPU::DS_WRITE2ST64_B32:
357 case AMDGPU::DS_WRITE2ST64_B64:
358 return true;
359 default:
360 return false;
361 }
362}
363
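// Decompose a memory instruction into its base operands, byte offset, and
// access width, handling the DS, MUBUF/MTBUF, image, SMRD, and FLAT forms.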
364bool SIInstrInfo::getMemOperandsWithOffsetWidth(
365 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
366 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
367 const TargetRegisterInfo *TRI) const {
368 if (!LdSt.mayLoadOrStore())
369 return false;
370
371 unsigned Opc = LdSt.getOpcode();
372 OffsetIsScalable = false;
373 const MachineOperand *BaseOp, *OffsetOp;
374 int DataOpIdx;
375
376 if (isDS(LdSt)) {
377 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
378 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
379 if (OffsetOp) {
380 // Normal, single offset LDS instruction.
381 if (!BaseOp) {
382 // DS_CONSUME/DS_APPEND use M0 for the base address.
383 // TODO: find the implicit use operand for M0 and use that as BaseOp?
384 return false;
385 }
386 BaseOps.push_back(BaseOp);
387 Offset = OffsetOp->getImm();
388 // Get appropriate operand, and compute width accordingly.
389 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
390 if (DataOpIdx == -1)
391 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
392 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
393 Width = LocationSize::precise(64);
394 else
395 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
396 } else {
397 // The 2 offset instructions use offset0 and offset1 instead. We can treat
398 // these as a load with a single offset if the 2 offsets are consecutive.
399 // We will use this for some partially aligned loads.
400 const MachineOperand *Offset0Op =
401 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
402 const MachineOperand *Offset1Op =
403 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
404
405 unsigned Offset0 = Offset0Op->getImm() & 0xff;
406 unsigned Offset1 = Offset1Op->getImm() & 0xff;
407 if (Offset0 + 1 != Offset1)
408 return false;
409
410 // Each of these offsets is in element sized units, so we need to convert
411 // to bytes of the individual reads.
412
413 unsigned EltSize;
414 if (LdSt.mayLoad())
415 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
416 else {
417 assert(LdSt.mayStore());
418 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
419 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
420 }
421
422 if (isStride64(Opc))
423 EltSize *= 64;
424
425 BaseOps.push_back(BaseOp);
426 Offset = EltSize * Offset0;
427 // Get appropriate operand(s), and compute width accordingly.
428 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
429 if (DataOpIdx == -1) {
430 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
431 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
432 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
433 Width = LocationSize::precise(
434 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
435 } else {
436 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
437 }
438 }
439 return true;
440 }
441
442 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
443 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
444 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
445 return false;
446 BaseOps.push_back(RSrc);
447 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
448 if (BaseOp && !BaseOp->isFI())
449 BaseOps.push_back(BaseOp);
450 const MachineOperand *OffsetImm =
451 getNamedOperand(LdSt, AMDGPU::OpName::offset);
452 Offset = OffsetImm->getImm();
453 const MachineOperand *SOffset =
454 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
455 if (SOffset) {
456 if (SOffset->isReg())
457 BaseOps.push_back(SOffset);
458 else
459 Offset += SOffset->getImm();
460 }
461 // Get appropriate operand, and compute width accordingly.
462 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
463 if (DataOpIdx == -1)
464 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
465 if (DataOpIdx == -1) // LDS DMA
466 return false;
467 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
468 return true;
469 }
470
471 if (isImage(LdSt)) {
472 auto RsrcOpName =
473 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
474 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
475 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
476 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
477 if (VAddr0Idx >= 0) {
478 // GFX10 possible NSA encoding.
479 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
480 BaseOps.push_back(&LdSt.getOperand(I));
481 } else {
482 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
483 }
484 Offset = 0;
485 // Get appropriate operand, and compute width accordingly.
486 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
487 if (DataOpIdx == -1)
488 return false; // no return sampler
489 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
490 return true;
491 }
492
493 if (isSMRD(LdSt)) {
494 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
495 if (!BaseOp) // e.g. S_MEMTIME
496 return false;
497 BaseOps.push_back(BaseOp);
498 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
499 Offset = OffsetOp ? OffsetOp->getImm() : 0;
500 // Get appropriate operand, and compute width accordingly.
501 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
502 if (DataOpIdx == -1)
503 return false;
504 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
505 return true;
506 }
507
508 if (isFLAT(LdSt)) {
509 // Instructions have either vaddr or saddr or both or none.
510 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
511 if (BaseOp)
512 BaseOps.push_back(BaseOp);
513 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
514 if (BaseOp)
515 BaseOps.push_back(BaseOp);
516 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
517 // Get appropriate operand, and compute width accordingly.
518 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
519 if (DataOpIdx == -1)
520 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
521 if (DataOpIdx == -1) // LDS DMA
522 return false;
523 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
524 return true;
525 }
526
527 return false;
528}
529
530static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
531 ArrayRef<const MachineOperand *> BaseOps1,
532 const MachineInstr &MI2,
533 ArrayRef<const MachineOperand *> BaseOps2) {
534 // Only examine the first "base" operand of each instruction, on the
535 // assumption that it represents the real base address of the memory access.
536 // Other operands are typically offsets or indices from this base address.
537 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
538 return true;
539
540 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
541 return false;
542
543 auto *MO1 = *MI1.memoperands_begin();
544 auto *MO2 = *MI2.memoperands_begin();
545 if (MO1->getAddrSpace() != MO2->getAddrSpace())
546 return false;
547
548 const auto *Base1 = MO1->getValue();
549 const auto *Base2 = MO2->getValue();
550 if (!Base1 || !Base2)
551 return false;
552 Base1 = getUnderlyingObject(Base1);
553 Base2 = getUnderlyingObject(Base2);
554
555 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
556 return false;
557
558 return Base1 == Base2;
559}
560
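// Decide whether two memory operations may be clustered by the scheduler.
// Worked example for the heuristic below: clustering four 8-byte loads gives
// NumBytes = 32 and ClusterSize = 4, so LoadSize = 8 and NumDWords = 2 * 4 = 8,
// which is accepted when MaxMemoryClusterDWords is 8.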
561bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
562 int64_t Offset1, bool OffsetIsScalable1,
563 ArrayRef<const MachineOperand *> BaseOps2,
564 int64_t Offset2, bool OffsetIsScalable2,
565 unsigned ClusterSize,
566 unsigned NumBytes) const {
567 // If the mem ops (to be clustered) do not have the same base ptr, then they
568 // should not be clustered
569 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
570 if (!BaseOps1.empty() && !BaseOps2.empty()) {
571 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
572 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
573 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
574 return false;
575
576 const SIMachineFunctionInfo *MFI =
577 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
578 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
579 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
580 // If only one base op is empty, they do not have the same base ptr
581 return false;
582 }
583
584 // In order to avoid register pressure, on an average, the number of DWORDS
585 // loaded together by all clustered mem ops should not exceed
586 // MaxMemoryClusterDWords. This is an empirical value based on certain
587 // observations and performance related experiments.
588 // The good thing about this heuristic is - it avoids clustering of too many
589 // sub-word loads, and also avoids clustering of wide loads. Below is the
590 // brief summary of how the heuristic behaves for various `LoadSize` when
591 // MaxMemoryClusterDWords is 8.
592 //
593 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
594 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
595 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
596 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
597 // (5) LoadSize >= 17: do not cluster
598 const unsigned LoadSize = NumBytes / ClusterSize;
599 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
600 return NumDWords <= MaxMemoryClusterDWords;
601}
602
603// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
604// the first 16 loads will be interleaved with the stores, and the next 16 will
605// be clustered as expected. It should really split into 2 16 store batches.
606//
607// Loads are clustered until this returns false, rather than trying to schedule
608// groups of stores. This also means we have to deal with saying different
609// address space loads should be clustered, and ones which might cause bank
610// conflicts.
611//
612// This might be deprecated so it might not be worth that much effort to fix.
613bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
614 int64_t Offset0, int64_t Offset1,
615 unsigned NumLoads) const {
616 assert(Offset1 > Offset0 &&
617 "Second offset should be larger than first offset!");
618 // If we have less than 16 loads in a row, and the offsets are within 64
619 // bytes, then schedule together.
620
621 // A cacheline is 64 bytes (for global memory).
622 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
623}
624
625static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
626 MachineBasicBlock::iterator MI,
627 const DebugLoc &DL, MCRegister DestReg,
628 MCRegister SrcReg, bool KillSrc,
629 const char *Msg = "illegal VGPR to SGPR copy") {
630 MachineFunction *MF = MBB.getParent();
631
632 LLVMContext &C = MF->getFunction().getContext();
633 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
634
635 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
636 .addReg(SrcReg, getKillRegState(KillSrc));
637}
638
639/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
640/// possible to have a direct copy in these cases on GFX908, so an intermediate
641/// VGPR copy is required.
642static void indirectCopyToAGPR(const SIInstrInfo &TII,
643 MachineBasicBlock &MBB,
644 MachineBasicBlock::iterator MI,
645 const DebugLoc &DL, MCRegister DestReg,
646 MCRegister SrcReg, bool KillSrc,
647 RegScavenger &RS, bool RegsOverlap,
648 Register ImpDefSuperReg = Register(),
649 Register ImpUseSuperReg = Register()) {
650 assert((TII.getSubtarget().hasMAIInsts() &&
651 !TII.getSubtarget().hasGFX90AInsts()) &&
652 "Expected GFX908 subtarget.");
653
654 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
655 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
656 "Source register of the copy should be either an SGPR or an AGPR.");
657
658 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
659 "Destination register of the copy should be an AGPR.");
660
661 const SIRegisterInfo &RI = TII.getRegisterInfo();
662
663 // First try to find defining accvgpr_write to avoid temporary registers.
664 // In the case of copies of overlapping AGPRs, we conservatively do not
665 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
666 // an accvgpr_write used for this same copy due to implicit-defs
667 if (!RegsOverlap) {
668 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
669 --Def;
670
671 if (!Def->modifiesRegister(SrcReg, &RI))
672 continue;
673
674 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
675 Def->getOperand(0).getReg() != SrcReg)
676 break;
677
678 MachineOperand &DefOp = Def->getOperand(1);
679 assert(DefOp.isReg() || DefOp.isImm());
680
681 if (DefOp.isReg()) {
682 bool SafeToPropagate = true;
683 // Check that register source operand is not clobbered before MI.
684 // Immediate operands are always safe to propagate.
685 for (auto I = Def; I != MI && SafeToPropagate; ++I)
686 if (I->modifiesRegister(DefOp.getReg(), &RI))
687 SafeToPropagate = false;
688
689 if (!SafeToPropagate)
690 break;
691
692 for (auto I = Def; I != MI; ++I)
693 I->clearRegisterKills(DefOp.getReg(), &RI);
694 }
695
696 MachineInstrBuilder Builder =
697 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
698 .add(DefOp);
699 if (ImpDefSuperReg)
700 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
701
702 if (ImpUseSuperReg) {
703 Builder.addReg(ImpUseSuperReg,
704 getKillRegState(KillSrc) | RegState::Implicit);
705 }
706
707 return;
708 }
709 }
710
711 RS.enterBasicBlockEnd(MBB);
712 RS.backward(std::next(MI));
713
714 // Ideally we want to have three registers for a long reg_sequence copy
715 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
716 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
717 *MBB.getParent());
718
719 // Registers in the sequence are allocated contiguously so we can just
720 // use register number to pick one of three round-robin temps.
721 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
722 Register Tmp =
723 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
724 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
725 "VGPR used for an intermediate copy should have been reserved.");
726
727 // Only loop through if there are any free registers left. We don't want to
728 // spill.
729 while (RegNo--) {
730 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
731 /* RestoreAfter */ false, 0,
732 /* AllowSpill */ false);
733 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
734 break;
735 Tmp = Tmp2;
736 RS.setRegUsed(Tmp);
737 }
738
739 // Insert copy to temporary VGPR.
740 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
741 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
742 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
743 } else {
744 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
745 }
746
747 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
748 .addReg(SrcReg, getKillRegState(KillSrc));
749 if (ImpUseSuperReg) {
750 UseBuilder.addReg(ImpUseSuperReg,
751 getKillRegState(KillSrc) | RegState::Implicit);
752 }
753
754 MachineInstrBuilder DefBuilder
755 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
756 .addReg(Tmp, RegState::Kill);
757
758 if (ImpDefSuperReg)
759 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
760}
761
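// Lower a wide SGPR-to-SGPR copy into S_MOV_B32/S_MOV_B64 moves on aligned
// subregister pairs; the Forward flag controls the emission order so that
// overlapping source and destination registers are handled correctly.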
762static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
763 MachineBasicBlock::iterator MI, const DebugLoc &DL,
764 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
765 const TargetRegisterClass *RC, bool Forward) {
766 const SIRegisterInfo &RI = TII.getRegisterInfo();
767 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
768 MachineBasicBlock::iterator I = MI;
769 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
770
771 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
772 int16_t SubIdx = BaseIndices[Idx];
773 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
774 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
775 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
776 unsigned Opcode = AMDGPU::S_MOV_B32;
777
778 // Is SGPR aligned? If so try to combine with next.
779 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
780 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
781 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
782 // Can use SGPR64 copy
783 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
784 SubIdx = RI.getSubRegFromChannel(Channel, 2);
785 DestSubReg = RI.getSubReg(DestReg, SubIdx);
786 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
787 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
788 Opcode = AMDGPU::S_MOV_B64;
789 Idx++;
790 }
791
792 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
793 .addReg(SrcSubReg)
794 .addReg(SrcReg, RegState::Implicit);
795
796 if (!FirstMI)
797 FirstMI = LastMI;
798
799 if (!Forward)
800 I--;
801 }
802
803 assert(FirstMI && LastMI);
804 if (!Forward)
805 std::swap(FirstMI, LastMI);
806
807 FirstMI->addOperand(
808 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
809
810 if (KillSrc)
811 LastMI->addRegisterKilled(SrcReg, &RI);
812}
813
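// Emit a physical register copy, choosing the mov/cselect/accvgpr sequence
// required by the source and destination register classes and splitting wide
// copies into per-subregister moves.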
814void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
815 MachineBasicBlock::iterator MI,
816 const DebugLoc &DL, Register DestReg,
817 Register SrcReg, bool KillSrc, bool RenamableDest,
818 bool RenamableSrc) const {
819 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
820 unsigned Size = RI.getRegSizeInBits(*RC);
821 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
822 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
823
824 // The rest of copyPhysReg assumes Src and Dst size are the same size.
825 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
826 // we remove Fix16BitCopies and this code block?
827 if (Fix16BitCopies) {
828 if (((Size == 16) != (SrcSize == 16))) {
829 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
830 assert(ST.useRealTrue16Insts());
831 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
832 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
833 RegToFix = SubReg;
834
835 if (DestReg == SrcReg) {
836 // Identity copy. Insert empty bundle since ExpandPostRA expects an
837 // instruction here.
838 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
839 return;
840 }
841 RC = RI.getPhysRegBaseClass(DestReg);
842 Size = RI.getRegSizeInBits(*RC);
843 SrcRC = RI.getPhysRegBaseClass(SrcReg);
844 SrcSize = RI.getRegSizeInBits(*SrcRC);
845 }
846 }
847
848 if (RC == &AMDGPU::VGPR_32RegClass) {
849 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
850 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
851 AMDGPU::AGPR_32RegClass.contains(SrcReg));
852 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
853 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
854 BuildMI(MBB, MI, DL, get(Opc), DestReg)
855 .addReg(SrcReg, getKillRegState(KillSrc));
856 return;
857 }
858
859 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
860 RC == &AMDGPU::SReg_32RegClass) {
861 if (SrcReg == AMDGPU::SCC) {
862 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
863 .addImm(1)
864 .addImm(0);
865 return;
866 }
867
868 if (DestReg == AMDGPU::VCC_LO) {
869 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
870 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
871 .addReg(SrcReg, getKillRegState(KillSrc));
872 } else {
873 // FIXME: Hack until VReg_1 removed.
874 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
875 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
876 .addImm(0)
877 .addReg(SrcReg, getKillRegState(KillSrc));
878 }
879
880 return;
881 }
882
883 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
884 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
885 return;
886 }
887
888 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
889 .addReg(SrcReg, getKillRegState(KillSrc));
890 return;
891 }
892
893 if (RC == &AMDGPU::SReg_64RegClass) {
894 if (SrcReg == AMDGPU::SCC) {
895 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
896 .addImm(1)
897 .addImm(0);
898 return;
899 }
900
901 if (DestReg == AMDGPU::VCC) {
902 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
903 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
904 .addReg(SrcReg, getKillRegState(KillSrc));
905 } else {
906 // FIXME: Hack until VReg_1 removed.
907 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
908 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
909 .addImm(0)
910 .addReg(SrcReg, getKillRegState(KillSrc));
911 }
912
913 return;
914 }
915
916 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
917 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
918 return;
919 }
920
921 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
922 .addReg(SrcReg, getKillRegState(KillSrc));
923 return;
924 }
925
926 if (DestReg == AMDGPU::SCC) {
927 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
928 // but SelectionDAG emits such copies for i1 sources.
929 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
930 // This copy can only be produced by patterns
931 // with explicit SCC, which are known to be enabled
932 // only for subtargets with S_CMP_LG_U64 present.
933 assert(ST.hasScalarCompareEq64());
934 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
935 .addReg(SrcReg, getKillRegState(KillSrc))
936 .addImm(0);
937 } else {
938 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
939 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
940 .addReg(SrcReg, getKillRegState(KillSrc))
941 .addImm(0);
942 }
943
944 return;
945 }
946
947 if (RC == &AMDGPU::AGPR_32RegClass) {
948 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
949 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
950 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
951 .addReg(SrcReg, getKillRegState(KillSrc));
952 return;
953 }
954
955 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
956 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
957 .addReg(SrcReg, getKillRegState(KillSrc));
958 return;
959 }
960
961 // FIXME: Pass should maintain scavenger to avoid scan through the block on
962 // every AGPR spill.
963 RegScavenger RS;
964 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
965 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
966 return;
967 }
968
969 if (Size == 16) {
970 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
971 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
972 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
973
974 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
975 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
976 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
977 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
978 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
979 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
980 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
981 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
982
983 if (IsSGPRDst) {
984 if (!IsSGPRSrc) {
985 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
986 return;
987 }
988
989 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
990 .addReg(NewSrcReg, getKillRegState(KillSrc));
991 return;
992 }
993
994 if (IsAGPRDst || IsAGPRSrc) {
995 if (!DstLow || !SrcLow) {
996 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
997 "Cannot use hi16 subreg with an AGPR!");
998 }
999
1000 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
1001 return;
1002 }
1003
1004 if (ST.useRealTrue16Insts()) {
1005 if (IsSGPRSrc) {
1006 assert(SrcLow);
1007 SrcReg = NewSrcReg;
1008 }
1009 // Use the smaller instruction encoding if possible.
1010 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
1011 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1012 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1013 .addReg(SrcReg);
1014 } else {
1015 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1016 .addImm(0) // src0_modifiers
1017 .addReg(SrcReg)
1018 .addImm(0); // op_sel
1019 }
1020 return;
1021 }
1022
1023 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1024 if (!DstLow || !SrcLow) {
1025 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1026 "Cannot use hi16 subreg on VI!");
1027 }
1028
1029 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1030 .addReg(NewSrcReg, getKillRegState(KillSrc));
1031 return;
1032 }
1033
1034 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1035 .addImm(0) // src0_modifiers
1036 .addReg(NewSrcReg)
1037 .addImm(0) // clamp
1038 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1039 : AMDGPU::SDWA::SdwaSel::WORD_1)
1040 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1041 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1042 : AMDGPU::SDWA::SdwaSel::WORD_1)
1043 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1044 // First implicit operand is $exec.
1045 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1046 return;
1047 }
1048
1049 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1050 if (ST.hasMovB64()) {
1051 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1052 .addReg(SrcReg, getKillRegState(KillSrc));
1053 return;
1054 }
1055 if (ST.hasPkMovB32()) {
1056 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1057 .addImm(SISrcMods::OP_SEL_1)
1058 .addReg(SrcReg)
1059 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1060 .addReg(SrcReg)
1061 .addImm(0) // op_sel_lo
1062 .addImm(0) // op_sel_hi
1063 .addImm(0) // neg_lo
1064 .addImm(0) // neg_hi
1065 .addImm(0) // clamp
1066 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1067 return;
1068 }
1069 }
1070
1071 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1072 if (RI.isSGPRClass(RC)) {
1073 if (!RI.isSGPRClass(SrcRC)) {
1074 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1075 return;
1076 }
1077 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1078 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1079 Forward);
1080 return;
1081 }
1082
1083 unsigned EltSize = 4;
1084 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1085 if (RI.isAGPRClass(RC)) {
1086 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1087 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1088 else if (RI.hasVGPRs(SrcRC) ||
1089 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1090 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1091 else
1092 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1093 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1094 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1095 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1096 (RI.isProperlyAlignedRC(*RC) &&
1097 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1098 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1099 if (ST.hasMovB64()) {
1100 Opcode = AMDGPU::V_MOV_B64_e32;
1101 EltSize = 8;
1102 } else if (ST.hasPkMovB32()) {
1103 Opcode = AMDGPU::V_PK_MOV_B32;
1104 EltSize = 8;
1105 }
1106 }
1107
1108 // For the cases where we need an intermediate instruction/temporary register
1109 // (destination is an AGPR), we need a scavenger.
1110 //
1111 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1112 // whole block for every handled copy.
1113 std::unique_ptr<RegScavenger> RS;
1114 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1115 RS = std::make_unique<RegScavenger>();
1116
1117 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1118
1119 // If there is an overlap, we can't kill the super-register on the last
1120 // instruction, since it will also kill the components made live by this def.
1121 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1122 const bool CanKillSuperReg = KillSrc && !Overlap;
1123
1124 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1125 unsigned SubIdx;
1126 if (Forward)
1127 SubIdx = SubIndices[Idx];
1128 else
1129 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1130 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1131 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1132 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1133
1134 bool IsFirstSubreg = Idx == 0;
1135 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1136
1137 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1138 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1139 Register ImpUseSuper = SrcReg;
1140 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1141 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1142 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1143 MachineInstrBuilder MIB =
1144 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1145 .addImm(SISrcMods::OP_SEL_1)
1146 .addReg(SrcSubReg)
1147 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1148 .addReg(SrcSubReg)
1149 .addImm(0) // op_sel_lo
1150 .addImm(0) // op_sel_hi
1151 .addImm(0) // neg_lo
1152 .addImm(0) // neg_hi
1153 .addImm(0) // clamp
1154 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1155 if (IsFirstSubreg)
1156 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1157 } else {
1158 MachineInstrBuilder Builder =
1159 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1160 if (IsFirstSubreg)
1161 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1162
1163 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1164 }
1165 }
1166}
1167
1168int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1169 int NewOpc;
1170
1171 // Try to map original to commuted opcode
1172 NewOpc = AMDGPU::getCommuteRev(Opcode);
1173 if (NewOpc != -1)
1174 // Check if the commuted (REV) opcode exists on the target.
1175 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1176
1177 // Try to map commuted to original opcode
1178 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1179 if (NewOpc != -1)
1180 // Check if the original (non-REV) opcode exists on the target.
1181 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1182
1183 return Opcode;
1184}
1185
1186const TargetRegisterClass *
1187SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1188 return &AMDGPU::VGPR_32RegClass;
1189}
1190
1191void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1192 MachineBasicBlock::iterator I,
1193 const DebugLoc &DL, Register DstReg,
1194 ArrayRef<MachineOperand> Cond,
1195 Register TrueReg,
1196 Register FalseReg) const {
1197 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1198 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1200 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1201 "Not a VGPR32 reg");
1202
1203 if (Cond.size() == 1) {
1204 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1205 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1206 .add(Cond[0]);
1207 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1208 .addImm(0)
1209 .addReg(FalseReg)
1210 .addImm(0)
1211 .addReg(TrueReg)
1212 .addReg(SReg);
1213 } else if (Cond.size() == 2) {
1214 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1215 switch (Cond[0].getImm()) {
1216 case SIInstrInfo::SCC_TRUE: {
1217 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1218 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1219 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1220 .addImm(0)
1221 .addReg(FalseReg)
1222 .addImm(0)
1223 .addReg(TrueReg)
1224 .addReg(SReg);
1225 break;
1226 }
1227 case SIInstrInfo::SCC_FALSE: {
1228 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1229 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1230 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1231 .addImm(0)
1232 .addReg(FalseReg)
1233 .addImm(0)
1234 .addReg(TrueReg)
1235 .addReg(SReg);
1236 break;
1237 }
1238 case SIInstrInfo::VCCNZ: {
1239 MachineOperand RegOp = Cond[1];
1240 RegOp.setImplicit(false);
1241 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1242 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1243 .add(RegOp);
1244 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1245 .addImm(0)
1246 .addReg(FalseReg)
1247 .addImm(0)
1248 .addReg(TrueReg)
1249 .addReg(SReg);
1250 break;
1251 }
1252 case SIInstrInfo::VCCZ: {
1253 MachineOperand RegOp = Cond[1];
1254 RegOp.setImplicit(false);
1255 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1256 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1257 .add(RegOp);
1258 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1259 .addImm(0)
1260 .addReg(TrueReg)
1261 .addImm(0)
1262 .addReg(FalseReg)
1263 .addReg(SReg);
1264 break;
1265 }
1266 case SIInstrInfo::EXECNZ: {
1267 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1268 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1269 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1270 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1271 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1272 .addImm(0)
1273 .addReg(FalseReg)
1274 .addImm(0)
1275 .addReg(TrueReg)
1276 .addReg(SReg);
1277 break;
1278 }
1279 case SIInstrInfo::EXECZ: {
1280 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1281 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1282 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1283 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1284 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1285 .addImm(0)
1286 .addReg(FalseReg)
1287 .addImm(0)
1288 .addReg(TrueReg)
1289 .addReg(SReg);
1290 llvm_unreachable("Unhandled branch predicate EXECZ");
1291 break;
1292 }
1293 default:
1294 llvm_unreachable("invalid branch predicate");
1295 }
1296 } else {
1297 llvm_unreachable("Can only handle Cond size 1 or 2");
1298 }
1299}
1300
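// Materialize (SrcReg == Value) into a fresh lane-mask register with
// V_CMP_EQ_I32_e64 and return it.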
1301Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1302 MachineBasicBlock::iterator I,
1303 const DebugLoc &DL,
1304 Register SrcReg, int Value) const {
1305 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1306 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1307 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1308 .addImm(Value)
1309 .addReg(SrcReg);
1310
1311 return Reg;
1312}
1313
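// Materialize (SrcReg != Value) into a fresh lane-mask register with
// V_CMP_NE_I32_e64 and return it.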
1314Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1315 MachineBasicBlock::iterator I,
1316 const DebugLoc &DL,
1317 Register SrcReg, int Value) const {
1318 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1319 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1320 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1321 .addImm(Value)
1322 .addReg(SrcReg);
1323
1324 return Reg;
1325}
1326
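// Recognize mov, bit-reverse, and not-style immediate materializations and
// return the constant value that \p MI writes into \p Reg, if any.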
1327bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1328 const Register Reg,
1329 int64_t &ImmVal) const {
1330 switch (MI.getOpcode()) {
1331 case AMDGPU::V_MOV_B32_e32:
1332 case AMDGPU::S_MOV_B32:
1333 case AMDGPU::S_MOVK_I32:
1334 case AMDGPU::S_MOV_B64:
1335 case AMDGPU::V_MOV_B64_e32:
1336 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1337 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1338 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1339 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1340 case AMDGPU::V_MOV_B64_PSEUDO: {
1341 const MachineOperand &Src0 = MI.getOperand(1);
1342 if (Src0.isImm()) {
1343 ImmVal = Src0.getImm();
1344 return MI.getOperand(0).getReg() == Reg;
1345 }
1346
1347 return false;
1348 }
1349 case AMDGPU::S_BREV_B32:
1350 case AMDGPU::V_BFREV_B32_e32:
1351 case AMDGPU::V_BFREV_B32_e64: {
1352 const MachineOperand &Src0 = MI.getOperand(1);
1353 if (Src0.isImm()) {
1354 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1355 return MI.getOperand(0).getReg() == Reg;
1356 }
1357
1358 return false;
1359 }
1360 case AMDGPU::S_NOT_B32:
1361 case AMDGPU::V_NOT_B32_e32:
1362 case AMDGPU::V_NOT_B32_e64: {
1363 const MachineOperand &Src0 = MI.getOperand(1);
1364 if (Src0.isImm()) {
1365 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1366 return MI.getOperand(0).getReg() == Reg;
1367 }
1368
1369 return false;
1370 }
1371 default:
1372 return false;
1373 }
1374}
1375
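// Select the plain move opcode that matches the size and register bank of
// \p DstRC.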
1376unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1378 if (RI.isAGPRClass(DstRC))
1379 return AMDGPU::COPY;
1380 if (RI.getRegSizeInBits(*DstRC) == 16) {
1381 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1382 // before RA.
1383 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1384 }
1385 if (RI.getRegSizeInBits(*DstRC) == 32)
1386 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1387 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1388 return AMDGPU::S_MOV_B64;
1389 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1390 return AMDGPU::V_MOV_B64_PSEUDO;
1391 return AMDGPU::COPY;
1392}
1393
1394const MCInstrDesc &
1395SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1396 bool IsIndirectSrc) const {
1397 if (IsIndirectSrc) {
1398 if (VecSize <= 32) // 4 bytes
1399 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1400 if (VecSize <= 64) // 8 bytes
1401 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1402 if (VecSize <= 96) // 12 bytes
1403 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1404 if (VecSize <= 128) // 16 bytes
1405 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1406 if (VecSize <= 160) // 20 bytes
1407 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1408 if (VecSize <= 256) // 32 bytes
1409 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1410 if (VecSize <= 288) // 36 bytes
1411 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1412 if (VecSize <= 320) // 40 bytes
1413 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1414 if (VecSize <= 352) // 44 bytes
1415 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1416 if (VecSize <= 384) // 48 bytes
1417 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1418 if (VecSize <= 512) // 64 bytes
1419 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1420 if (VecSize <= 1024) // 128 bytes
1421 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1422
1423 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1424 }
1425
1426 if (VecSize <= 32) // 4 bytes
1427 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1428 if (VecSize <= 64) // 8 bytes
1429 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1430 if (VecSize <= 96) // 12 bytes
1431 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1432 if (VecSize <= 128) // 16 bytes
1433 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1434 if (VecSize <= 160) // 20 bytes
1435 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1436 if (VecSize <= 256) // 32 bytes
1437 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1438 if (VecSize <= 288) // 36 bytes
1439 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1440 if (VecSize <= 320) // 40 bytes
1441 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1442 if (VecSize <= 352) // 44 bytes
1443 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1444 if (VecSize <= 384) // 48 bytes
1445 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1446 if (VecSize <= 512) // 64 bytes
1447 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1448 if (VecSize <= 1024) // 128 bytes
1449 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1450
1451 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1452}
1453
1454static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1455 if (VecSize <= 32) // 4 bytes
1456 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1457 if (VecSize <= 64) // 8 bytes
1458 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1459 if (VecSize <= 96) // 12 bytes
1460 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1461 if (VecSize <= 128) // 16 bytes
1462 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1463 if (VecSize <= 160) // 20 bytes
1464 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1465 if (VecSize <= 256) // 32 bytes
1466 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1467 if (VecSize <= 288) // 36 bytes
1468 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1469 if (VecSize <= 320) // 40 bytes
1470 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1471 if (VecSize <= 352) // 44 bytes
1472 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1473 if (VecSize <= 384) // 48 bytes
1474 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1475 if (VecSize <= 512) // 64 bytes
1476 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1477 if (VecSize <= 1024) // 128 bytes
1478 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1479
1480 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1481}
1482
1483static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1484 if (VecSize <= 32) // 4 bytes
1485 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1486 if (VecSize <= 64) // 8 bytes
1487 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1488 if (VecSize <= 96) // 12 bytes
1489 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1490 if (VecSize <= 128) // 16 bytes
1491 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1492 if (VecSize <= 160) // 20 bytes
1493 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1494 if (VecSize <= 256) // 32 bytes
1495 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1496 if (VecSize <= 288) // 36 bytes
1497 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1498 if (VecSize <= 320) // 40 bytes
1499 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1500 if (VecSize <= 352) // 44 bytes
1501 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1502 if (VecSize <= 384) // 48 bytes
1503 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1504 if (VecSize <= 512) // 64 bytes
1505 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1506 if (VecSize <= 1024) // 128 bytes
1507 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1508
1509 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1510}
1511
1512static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1513 if (VecSize <= 64) // 8 bytes
1514 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1515 if (VecSize <= 128) // 16 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1517 if (VecSize <= 256) // 32 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1519 if (VecSize <= 512) // 64 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1521 if (VecSize <= 1024) // 128 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1523
1524 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1525}
1526
1527const MCInstrDesc &
1528SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1529 bool IsSGPR) const {
1530 if (IsSGPR) {
1531 switch (EltSize) {
1532 case 32:
1533 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1534 case 64:
1535 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1536 default:
1537 llvm_unreachable("invalid reg indexing elt size");
1538 }
1539 }
1540
1541 assert(EltSize == 32 && "invalid reg indexing elt size");
1542 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1543}
1544
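// The helpers below map a spill size in bytes to the matching SI_SPILL_*
// save/restore pseudo for SGPR, VGPR, AV, and WWM registers.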
1545static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1546 switch (Size) {
1547 case 4:
1548 return AMDGPU::SI_SPILL_S32_SAVE;
1549 case 8:
1550 return AMDGPU::SI_SPILL_S64_SAVE;
1551 case 12:
1552 return AMDGPU::SI_SPILL_S96_SAVE;
1553 case 16:
1554 return AMDGPU::SI_SPILL_S128_SAVE;
1555 case 20:
1556 return AMDGPU::SI_SPILL_S160_SAVE;
1557 case 24:
1558 return AMDGPU::SI_SPILL_S192_SAVE;
1559 case 28:
1560 return AMDGPU::SI_SPILL_S224_SAVE;
1561 case 32:
1562 return AMDGPU::SI_SPILL_S256_SAVE;
1563 case 36:
1564 return AMDGPU::SI_SPILL_S288_SAVE;
1565 case 40:
1566 return AMDGPU::SI_SPILL_S320_SAVE;
1567 case 44:
1568 return AMDGPU::SI_SPILL_S352_SAVE;
1569 case 48:
1570 return AMDGPU::SI_SPILL_S384_SAVE;
1571 case 64:
1572 return AMDGPU::SI_SPILL_S512_SAVE;
1573 case 128:
1574 return AMDGPU::SI_SPILL_S1024_SAVE;
1575 default:
1576 llvm_unreachable("unknown register size");
1577 }
1578}
1579
1580static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1581 switch (Size) {
1582 case 2:
1583 return AMDGPU::SI_SPILL_V16_SAVE;
1584 case 4:
1585 return AMDGPU::SI_SPILL_V32_SAVE;
1586 case 8:
1587 return AMDGPU::SI_SPILL_V64_SAVE;
1588 case 12:
1589 return AMDGPU::SI_SPILL_V96_SAVE;
1590 case 16:
1591 return AMDGPU::SI_SPILL_V128_SAVE;
1592 case 20:
1593 return AMDGPU::SI_SPILL_V160_SAVE;
1594 case 24:
1595 return AMDGPU::SI_SPILL_V192_SAVE;
1596 case 28:
1597 return AMDGPU::SI_SPILL_V224_SAVE;
1598 case 32:
1599 return AMDGPU::SI_SPILL_V256_SAVE;
1600 case 36:
1601 return AMDGPU::SI_SPILL_V288_SAVE;
1602 case 40:
1603 return AMDGPU::SI_SPILL_V320_SAVE;
1604 case 44:
1605 return AMDGPU::SI_SPILL_V352_SAVE;
1606 case 48:
1607 return AMDGPU::SI_SPILL_V384_SAVE;
1608 case 64:
1609 return AMDGPU::SI_SPILL_V512_SAVE;
1610 case 128:
1611 return AMDGPU::SI_SPILL_V1024_SAVE;
1612 default:
1613 llvm_unreachable("unknown register size");
1614 }
1615}
1616
1617static unsigned getAVSpillSaveOpcode(unsigned Size) {
1618 switch (Size) {
1619 case 4:
1620 return AMDGPU::SI_SPILL_AV32_SAVE;
1621 case 8:
1622 return AMDGPU::SI_SPILL_AV64_SAVE;
1623 case 12:
1624 return AMDGPU::SI_SPILL_AV96_SAVE;
1625 case 16:
1626 return AMDGPU::SI_SPILL_AV128_SAVE;
1627 case 20:
1628 return AMDGPU::SI_SPILL_AV160_SAVE;
1629 case 24:
1630 return AMDGPU::SI_SPILL_AV192_SAVE;
1631 case 28:
1632 return AMDGPU::SI_SPILL_AV224_SAVE;
1633 case 32:
1634 return AMDGPU::SI_SPILL_AV256_SAVE;
1635 case 36:
1636 return AMDGPU::SI_SPILL_AV288_SAVE;
1637 case 40:
1638 return AMDGPU::SI_SPILL_AV320_SAVE;
1639 case 44:
1640 return AMDGPU::SI_SPILL_AV352_SAVE;
1641 case 48:
1642 return AMDGPU::SI_SPILL_AV384_SAVE;
1643 case 64:
1644 return AMDGPU::SI_SPILL_AV512_SAVE;
1645 case 128:
1646 return AMDGPU::SI_SPILL_AV1024_SAVE;
1647 default:
1648 llvm_unreachable("unknown register size");
1649 }
1650}
1651
1652static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1653 bool IsVectorSuperClass) {
1654 // Currently, there is only 32-bit WWM register spills needed.
1655 if (Size != 4)
1656 llvm_unreachable("unknown wwm register spill size");
1657
1658 if (IsVectorSuperClass)
1659 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1660
1661 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1662}
1663
1664unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1665 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1666 const SIMachineFunctionInfo &MFI) const {
1667 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1668
1669 // Choose the right opcode if spilling a WWM register.
1670 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1671 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1672
1673 // TODO: Check if AGPRs are available
1674 if (ST.hasMAIInsts())
1675 return getAVSpillSaveOpcode(Size);
1676
1677 return getVGPRSpillSaveOpcode(Size);
1678}
1679
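// Spill \p SrcReg to the stack slot \p FrameIndex, using SGPR spill pseudos
// for scalar registers and vector spill pseudos otherwise.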
1680void SIInstrInfo::storeRegToStackSlot(
1681 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1682 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1683 const TargetRegisterInfo *TRI, Register VReg,
1684 MachineInstr::MIFlag Flags) const {
1685 MachineFunction *MF = MBB.getParent();
1686 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1687 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1688 const DebugLoc &DL = MBB.findDebugLoc(MI);
1689
1690 MachinePointerInfo PtrInfo
1691 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1692 MachineMemOperand *MMO = MF->getMachineMemOperand(
1693 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1694 FrameInfo.getObjectAlign(FrameIndex));
1695 unsigned SpillSize = TRI->getSpillSize(*RC);
1696
1697 MachineRegisterInfo &MRI = MF->getRegInfo();
1698 if (RI.isSGPRClass(RC)) {
1699 MFI->setHasSpilledSGPRs();
1700 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1701 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1702 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1703
1704 // We are only allowed to create one new instruction when spilling
1705 // registers, so we need to use pseudo instruction for spilling SGPRs.
1706 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1707
1708 // The SGPR spill/restore instructions only work on number sgprs, so we need
1709 // to make sure we are using the correct register class.
1710 if (SrcReg.isVirtual() && SpillSize == 4) {
1711 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1712 }
1713
1714 BuildMI(MBB, MI, DL, OpDesc)
1715 .addReg(SrcReg, getKillRegState(isKill)) // data
1716 .addFrameIndex(FrameIndex) // addr
1717 .addMemOperand(MMO)
1719
1720 if (RI.spillSGPRToVGPR())
1721 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1722 return;
1723 }
1724
1725 unsigned Opcode =
1726 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1727 MFI->setHasSpilledVGPRs();
1728
1729 BuildMI(MBB, MI, DL, get(Opcode))
1730 .addReg(SrcReg, getKillRegState(isKill)) // data
1731 .addFrameIndex(FrameIndex) // addr
1732 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1733 .addImm(0) // offset
1734 .addMemOperand(MMO);
1735}
1736
1737static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1738 switch (Size) {
1739 case 4:
1740 return AMDGPU::SI_SPILL_S32_RESTORE;
1741 case 8:
1742 return AMDGPU::SI_SPILL_S64_RESTORE;
1743 case 12:
1744 return AMDGPU::SI_SPILL_S96_RESTORE;
1745 case 16:
1746 return AMDGPU::SI_SPILL_S128_RESTORE;
1747 case 20:
1748 return AMDGPU::SI_SPILL_S160_RESTORE;
1749 case 24:
1750 return AMDGPU::SI_SPILL_S192_RESTORE;
1751 case 28:
1752 return AMDGPU::SI_SPILL_S224_RESTORE;
1753 case 32:
1754 return AMDGPU::SI_SPILL_S256_RESTORE;
1755 case 36:
1756 return AMDGPU::SI_SPILL_S288_RESTORE;
1757 case 40:
1758 return AMDGPU::SI_SPILL_S320_RESTORE;
1759 case 44:
1760 return AMDGPU::SI_SPILL_S352_RESTORE;
1761 case 48:
1762 return AMDGPU::SI_SPILL_S384_RESTORE;
1763 case 64:
1764 return AMDGPU::SI_SPILL_S512_RESTORE;
1765 case 128:
1766 return AMDGPU::SI_SPILL_S1024_RESTORE;
1767 default:
1768 llvm_unreachable("unknown register size");
1769 }
1770}
1771
1772static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1773 switch (Size) {
1774 case 2:
1775 return AMDGPU::SI_SPILL_V16_RESTORE;
1776 case 4:
1777 return AMDGPU::SI_SPILL_V32_RESTORE;
1778 case 8:
1779 return AMDGPU::SI_SPILL_V64_RESTORE;
1780 case 12:
1781 return AMDGPU::SI_SPILL_V96_RESTORE;
1782 case 16:
1783 return AMDGPU::SI_SPILL_V128_RESTORE;
1784 case 20:
1785 return AMDGPU::SI_SPILL_V160_RESTORE;
1786 case 24:
1787 return AMDGPU::SI_SPILL_V192_RESTORE;
1788 case 28:
1789 return AMDGPU::SI_SPILL_V224_RESTORE;
1790 case 32:
1791 return AMDGPU::SI_SPILL_V256_RESTORE;
1792 case 36:
1793 return AMDGPU::SI_SPILL_V288_RESTORE;
1794 case 40:
1795 return AMDGPU::SI_SPILL_V320_RESTORE;
1796 case 44:
1797 return AMDGPU::SI_SPILL_V352_RESTORE;
1798 case 48:
1799 return AMDGPU::SI_SPILL_V384_RESTORE;
1800 case 64:
1801 return AMDGPU::SI_SPILL_V512_RESTORE;
1802 case 128:
1803 return AMDGPU::SI_SPILL_V1024_RESTORE;
1804 default:
1805 llvm_unreachable("unknown register size");
1806 }
1807}
1808
1809static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1810 switch (Size) {
1811 case 4:
1812 return AMDGPU::SI_SPILL_AV32_RESTORE;
1813 case 8:
1814 return AMDGPU::SI_SPILL_AV64_RESTORE;
1815 case 12:
1816 return AMDGPU::SI_SPILL_AV96_RESTORE;
1817 case 16:
1818 return AMDGPU::SI_SPILL_AV128_RESTORE;
1819 case 20:
1820 return AMDGPU::SI_SPILL_AV160_RESTORE;
1821 case 24:
1822 return AMDGPU::SI_SPILL_AV192_RESTORE;
1823 case 28:
1824 return AMDGPU::SI_SPILL_AV224_RESTORE;
1825 case 32:
1826 return AMDGPU::SI_SPILL_AV256_RESTORE;
1827 case 36:
1828 return AMDGPU::SI_SPILL_AV288_RESTORE;
1829 case 40:
1830 return AMDGPU::SI_SPILL_AV320_RESTORE;
1831 case 44:
1832 return AMDGPU::SI_SPILL_AV352_RESTORE;
1833 case 48:
1834 return AMDGPU::SI_SPILL_AV384_RESTORE;
1835 case 64:
1836 return AMDGPU::SI_SPILL_AV512_RESTORE;
1837 case 128:
1838 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1839 default:
1840 llvm_unreachable("unknown register size");
1841 }
1842}
1843
1844static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1845 bool IsVectorSuperClass) {
1846 // Currently, only 32-bit WWM register spills are needed.
1847 if (Size != 4)
1848 llvm_unreachable("unknown wwm register spill size");
1849
1850 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1851 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1852
1853 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1854}
1855
1856unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
1857 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1858 const SIMachineFunctionInfo &MFI) const {
1859 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1860
1861 // Choose the right opcode if restoring a WWM register.
1862 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1863 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1864
1865 // TODO: Check if AGPRs are available
1866 if (ST.hasMAIInsts())
1867 return getAVSpillRestoreOpcode(Size);
1868
1869 assert(!RI.isAGPRClass(RC));
1870 return getVGPRSpillRestoreOpcode(Size);
1871}
1872
1875 Register DestReg, int FrameIndex,
1876 const TargetRegisterClass *RC,
1877 const TargetRegisterInfo *TRI,
1878 Register VReg,
1879 MachineInstr::MIFlag Flags) const {
1880 MachineFunction *MF = MBB.getParent();
1882 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1883 const DebugLoc &DL = MBB.findDebugLoc(MI);
1884 unsigned SpillSize = TRI->getSpillSize(*RC);
1885
1886 MachinePointerInfo PtrInfo
1887 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1888
1889 MachineMemOperand *MMO = MF->getMachineMemOperand(
1890 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1891 FrameInfo.getObjectAlign(FrameIndex));
1892
1893 if (RI.isSGPRClass(RC)) {
1894 MFI->setHasSpilledSGPRs();
1895 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1896 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1897 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1898
1899 // FIXME: Maybe this should not include a memoperand because it will be
1900 // lowered to non-memory instructions.
1901 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1902 if (DestReg.isVirtual() && SpillSize == 4) {
1904 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1905 }
1906
1907 if (RI.spillSGPRToVGPR())
1908 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1909 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1910 .addFrameIndex(FrameIndex) // addr
1911 .addMemOperand(MMO)
1913
1914 return;
1915 }
1916
1917 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1918 SpillSize, *MFI);
1919 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1920 .addFrameIndex(FrameIndex) // vaddr
1921 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1922 .addImm(0) // offset
1923 .addMemOperand(MMO);
1924}
1925
1930
1933 unsigned Quantity) const {
1934 DebugLoc DL = MBB.findDebugLoc(MI);
1935 unsigned MaxSNopCount = 1u << ST.getSNopBits();
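// Illustrative note (not in the original source): the loop below emits nops in
// chunks of at most MaxSNopCount wait states. For example, assuming
// MaxSNopCount is 8, a request for Quantity = 10 produces S_NOP 7 (8 wait
// states) followed by S_NOP 1 (2 wait states), since S_NOP's immediate encodes
// "wait states minus one".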
1936 while (Quantity > 0) {
1937 unsigned Arg = std::min(Quantity, MaxSNopCount);
1938 Quantity -= Arg;
1939 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1940 }
1941}
1942
1944 auto *MF = MBB.getParent();
1945 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1946
1947 assert(Info->isEntryFunction());
1948
1949 if (MBB.succ_empty()) {
1950 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1951 if (HasNoTerminator) {
1952 if (Info->returnsVoid()) {
1953 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1954 } else {
1955 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1956 }
1957 }
1958 }
1959}
1960
1964 const DebugLoc &DL) const {
1965 MachineFunction *MF = MBB.getParent();
1966 constexpr unsigned DoorbellIDMask = 0x3ff;
1967 constexpr unsigned ECQueueWaveAbort = 0x400;
1968
1969 MachineBasicBlock *TrapBB = &MBB;
1970 MachineBasicBlock *ContBB = &MBB;
1971 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1972
1973 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1974 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1975 TrapBB = MF->CreateMachineBasicBlock();
1976 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1977 MF->push_back(TrapBB);
1978 MBB.addSuccessor(TrapBB);
1979 }
1980
1981 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
1982 // this will be a nop.
1983 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
1984 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
1985 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1986 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
1987 DoorbellReg)
1989 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
1990 .addUse(AMDGPU::M0);
1991 Register DoorbellRegMasked =
1992 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1993 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
1994 .addUse(DoorbellReg)
1995 .addImm(DoorbellIDMask);
1996 Register SetWaveAbortBit =
1997 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1998 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
1999 .addUse(DoorbellRegMasked)
2000 .addImm(ECQueueWaveAbort);
2001 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2002 .addUse(SetWaveAbortBit);
2003 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2005 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2006 .addUse(AMDGPU::TTMP2);
2007 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2008 TrapBB->addSuccessor(HaltLoopBB);
2009
2010 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2011 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2012 .addMBB(HaltLoopBB);
2013 MF->push_back(HaltLoopBB);
2014 HaltLoopBB->addSuccessor(HaltLoopBB);
2015
2016 return ContBB;
2017}
2018
2020 switch (MI.getOpcode()) {
2021 default:
2022 if (MI.isMetaInstruction())
2023 return 0;
2024 return 1; // FIXME: Do wait states equal cycles?
2025
2026 case AMDGPU::S_NOP:
2027 return MI.getOperand(0).getImm() + 1;
2028 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2029 // hazard, even if one exists, won't really be visible. Should we handle it?
2030 }
2031}
2032
2034 MachineBasicBlock &MBB = *MI.getParent();
2035 DebugLoc DL = MBB.findDebugLoc(MI);
2037 switch (MI.getOpcode()) {
2038 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2039 case AMDGPU::S_MOV_B64_term:
2040 // This is only a terminator to get the correct spill code placement during
2041 // register allocation.
2042 MI.setDesc(get(AMDGPU::S_MOV_B64));
2043 break;
2044
2045 case AMDGPU::S_MOV_B32_term:
2046 // This is only a terminator to get the correct spill code placement during
2047 // register allocation.
2048 MI.setDesc(get(AMDGPU::S_MOV_B32));
2049 break;
2050
2051 case AMDGPU::S_XOR_B64_term:
2052 // This is only a terminator to get the correct spill code placement during
2053 // register allocation.
2054 MI.setDesc(get(AMDGPU::S_XOR_B64));
2055 break;
2056
2057 case AMDGPU::S_XOR_B32_term:
2058 // This is only a terminator to get the correct spill code placement during
2059 // register allocation.
2060 MI.setDesc(get(AMDGPU::S_XOR_B32));
2061 break;
2062 case AMDGPU::S_OR_B64_term:
2063 // This is only a terminator to get the correct spill code placement during
2064 // register allocation.
2065 MI.setDesc(get(AMDGPU::S_OR_B64));
2066 break;
2067 case AMDGPU::S_OR_B32_term:
2068 // This is only a terminator to get the correct spill code placement during
2069 // register allocation.
2070 MI.setDesc(get(AMDGPU::S_OR_B32));
2071 break;
2072
2073 case AMDGPU::S_ANDN2_B64_term:
2074 // This is only a terminator to get the correct spill code placement during
2075 // register allocation.
2076 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2077 break;
2078
2079 case AMDGPU::S_ANDN2_B32_term:
2080 // This is only a terminator to get the correct spill code placement during
2081 // register allocation.
2082 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2083 break;
2084
2085 case AMDGPU::S_AND_B64_term:
2086 // This is only a terminator to get the correct spill code placement during
2087 // register allocation.
2088 MI.setDesc(get(AMDGPU::S_AND_B64));
2089 break;
2090
2091 case AMDGPU::S_AND_B32_term:
2092 // This is only a terminator to get the correct spill code placement during
2093 // register allocation.
2094 MI.setDesc(get(AMDGPU::S_AND_B32));
2095 break;
2096
2097 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2098 // This is only a terminator to get the correct spill code placement during
2099 // register allocation.
2100 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2101 break;
2102
2103 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2104 // This is only a terminator to get the correct spill code placement during
2105 // register allocation.
2106 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2107 break;
2108
2109 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2110 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2111 break;
2112
2113 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2114 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2115 break;
2116 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2117 Register Dst = MI.getOperand(0).getReg();
2118 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2119 MI.setDesc(
2120 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2121 break;
2122 }
2123 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2124 Register Dst = MI.getOperand(0).getReg();
2125 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2126 int64_t Imm = MI.getOperand(1).getImm();
2127
2128 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2129 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2130 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2133 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2134 .addImm(SignExtend64<32>(Imm >> 32))
2136 MI.eraseFromParent();
2137 break;
2138 }
2139
2140 [[fallthrough]];
2141 }
2142 case AMDGPU::V_MOV_B64_PSEUDO: {
2143 Register Dst = MI.getOperand(0).getReg();
2144 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2145 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2146
2147 const MachineOperand &SrcOp = MI.getOperand(1);
2148 // FIXME: Will this work for 64-bit floating point immediates?
2149 assert(!SrcOp.isFPImm());
2150 if (ST.hasMovB64()) {
2151 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2152 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2153 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2154 break;
2155 }
2156 if (SrcOp.isImm()) {
2157 APInt Imm(64, SrcOp.getImm());
2158 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2159 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
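// Illustrative example (not in the original source): for the 64-bit immediate
// 0x0000000100000001, Lo == Hi == 1, which is an inline constant, so on
// subtargets with v_pk_mov_b32 a single packed move writes both 32-bit halves;
// otherwise the two halves are written by separate v_mov_b32 instructions
// below.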
2160 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2161 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2163 .addImm(Lo.getSExtValue())
2165 .addImm(Lo.getSExtValue())
2166 .addImm(0) // op_sel_lo
2167 .addImm(0) // op_sel_hi
2168 .addImm(0) // neg_lo
2169 .addImm(0) // neg_hi
2170 .addImm(0); // clamp
2171 } else {
2172 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2173 .addImm(Lo.getSExtValue())
2175 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2176 .addImm(Hi.getSExtValue())
2178 }
2179 } else {
2180 assert(SrcOp.isReg());
2181 if (ST.hasPkMovB32() &&
2182 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2183 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2184 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2185 .addReg(SrcOp.getReg())
2187 .addReg(SrcOp.getReg())
2188 .addImm(0) // op_sel_lo
2189 .addImm(0) // op_sel_hi
2190 .addImm(0) // neg_lo
2191 .addImm(0) // neg_hi
2192 .addImm(0); // clamp
2193 } else {
2194 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2195 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2197 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2198 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2200 }
2201 }
2202 MI.eraseFromParent();
2203 break;
2204 }
2205 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2207 break;
2208 }
2209 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2210 const MachineOperand &SrcOp = MI.getOperand(1);
2211 assert(!SrcOp.isFPImm());
2212
2213 if (ST.has64BitLiterals()) {
2214 MI.setDesc(get(AMDGPU::S_MOV_B64));
2215 break;
2216 }
2217
2218 APInt Imm(64, SrcOp.getImm());
2219 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2220 MI.setDesc(get(AMDGPU::S_MOV_B64));
2221 break;
2222 }
2223
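// Illustrative example (not in the original source): on subtargets without
// 64-bit literal support, the immediate 0x0000000100000000 is neither a signed
// 32-bit value nor an inline constant, so it reaches this point and is split
// into an s_mov_b32 of 0 into the low half and an s_mov_b32 of 1 into the high
// half.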
2224 Register Dst = MI.getOperand(0).getReg();
2225 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2226 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2227
2228 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2229 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2230 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2231 .addImm(Lo.getSExtValue())
2233 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2234 .addImm(Hi.getSExtValue())
2236 MI.eraseFromParent();
2237 break;
2238 }
2239 case AMDGPU::V_SET_INACTIVE_B32: {
2240 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2241 Register DstReg = MI.getOperand(0).getReg();
2242 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2243 .add(MI.getOperand(3))
2244 .add(MI.getOperand(4))
2245 .add(MI.getOperand(1))
2246 .add(MI.getOperand(2))
2247 .add(MI.getOperand(5));
2248 MI.eraseFromParent();
2249 break;
2250 }
2251 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2252 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2253 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2254 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2255 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2256 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2257 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2258 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2259 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2260 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2261 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2262 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2263 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2264 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2265 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2266 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2267 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2268 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2269 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2270 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2271 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2272 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2273 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2274 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2275 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2276 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2277 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2278 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2279 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2280 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2281
2282 unsigned Opc;
2283 if (RI.hasVGPRs(EltRC)) {
2284 Opc = AMDGPU::V_MOVRELD_B32_e32;
2285 } else {
2286 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2287 : AMDGPU::S_MOVRELD_B32;
2288 }
2289
2290 const MCInstrDesc &OpDesc = get(Opc);
2291 Register VecReg = MI.getOperand(0).getReg();
2292 bool IsUndef = MI.getOperand(1).isUndef();
2293 unsigned SubReg = MI.getOperand(3).getImm();
2294 assert(VecReg == MI.getOperand(1).getReg());
2295
2297 BuildMI(MBB, MI, DL, OpDesc)
2298 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2299 .add(MI.getOperand(2))
2301 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2302
2303 const int ImpDefIdx =
2304 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2305 const int ImpUseIdx = ImpDefIdx + 1;
2306 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2307 MI.eraseFromParent();
2308 break;
2309 }
2310 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2311 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2312 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2313 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2314 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2315 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2316 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2317 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2318 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2319 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2320 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2321 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2322 assert(ST.useVGPRIndexMode());
2323 Register VecReg = MI.getOperand(0).getReg();
2324 bool IsUndef = MI.getOperand(1).isUndef();
2325 MachineOperand &Idx = MI.getOperand(3);
2326 Register SubReg = MI.getOperand(4).getImm();
2327
2328 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2329 .add(Idx)
2331 SetOn->getOperand(3).setIsUndef();
2332
2333 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2335 BuildMI(MBB, MI, DL, OpDesc)
2336 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2337 .add(MI.getOperand(2))
2339 .addReg(VecReg,
2340 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2341
2342 const int ImpDefIdx =
2343 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2344 const int ImpUseIdx = ImpDefIdx + 1;
2345 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2346
2347 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2348
2349 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2350
2351 MI.eraseFromParent();
2352 break;
2353 }
2354 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2355 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2356 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2357 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2358 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2359 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2360 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2361 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2362 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2363 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2364 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2365 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2366 assert(ST.useVGPRIndexMode());
2367 Register Dst = MI.getOperand(0).getReg();
2368 Register VecReg = MI.getOperand(1).getReg();
2369 bool IsUndef = MI.getOperand(1).isUndef();
2370 Register Idx = MI.getOperand(2).getReg();
2371 Register SubReg = MI.getOperand(3).getImm();
2372
2373 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2374 .addReg(Idx)
2376 SetOn->getOperand(3).setIsUndef();
2377
2378 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2379 .addDef(Dst)
2380 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2381 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2382
2383 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2384
2385 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2386
2387 MI.eraseFromParent();
2388 break;
2389 }
2390 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2391 MachineFunction &MF = *MBB.getParent();
2392 Register Reg = MI.getOperand(0).getReg();
2393 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2394 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2395 MachineOperand OpLo = MI.getOperand(1);
2396 MachineOperand OpHi = MI.getOperand(2);
2397
2398 // Create a bundle so these instructions won't be re-ordered by the
2399 // post-RA scheduler.
2400 MIBundleBuilder Bundler(MBB, MI);
2401 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2402
2403 // What we want here is an offset from the value returned by s_getpc (which
2404 // is the address of the s_add_u32 instruction) to the global variable, but
2405 // since the encoding of $symbol starts 4 bytes after the start of the
2406 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2407 // small. This requires us to add 4 to the global variable offset in order
2408 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2409 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2410 // instruction.
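// Worked example (illustrative, not in the original source): if s_getpc_b64
// returns address P (the start of the following s_add_u32), the $symbol field
// of that s_add_u32 is encoded at P+4 and the $symbol field of the s_addc_u32
// at P+12. Since the PC-relative relocation is resolved relative to those
// field addresses, biasing the operand offsets by +4 and +12 makes P plus the
// resolved literal equal the symbol's address.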
2411
2412 int64_t Adjust = 0;
2413 if (ST.hasGetPCZeroExtension()) {
2414 // Fix up hardware that does not sign-extend the 48-bit PC value by
2415 // inserting: s_sext_i32_i16 reghi, reghi
2416 Bundler.append(
2417 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2418 Adjust += 4;
2419 }
2420
2421 if (OpLo.isGlobal())
2422 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2423 Bundler.append(
2424 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2425
2426 if (OpHi.isGlobal())
2427 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2428 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2429 .addReg(RegHi)
2430 .add(OpHi));
2431
2432 finalizeBundle(MBB, Bundler.begin());
2433
2434 MI.eraseFromParent();
2435 break;
2436 }
2437 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2438 MachineFunction &MF = *MBB.getParent();
2439 Register Reg = MI.getOperand(0).getReg();
2440 MachineOperand Op = MI.getOperand(1);
2441
2442 // Create a bundle so these instructions won't be re-ordered by the
2443 // post-RA scheduler.
2444 MIBundleBuilder Bundler(MBB, MI);
2445 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2446 if (Op.isGlobal())
2447 Op.setOffset(Op.getOffset() + 4);
2448 Bundler.append(
2449 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2450
2451 finalizeBundle(MBB, Bundler.begin());
2452
2453 MI.eraseFromParent();
2454 break;
2455 }
2456 case AMDGPU::ENTER_STRICT_WWM: {
2457 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2458 // Whole Wave Mode is entered.
2459 MI.setDesc(get(LMC.OrSaveExecOpc));
2460 break;
2461 }
2462 case AMDGPU::ENTER_STRICT_WQM: {
2463 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2464 // STRICT_WQM is entered.
2465 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2466 .addReg(LMC.ExecReg);
2467 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2468
2469 MI.eraseFromParent();
2470 break;
2471 }
2472 case AMDGPU::EXIT_STRICT_WWM:
2473 case AMDGPU::EXIT_STRICT_WQM: {
2474 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2475 // WWM/STRICT_WQM is exited.
2476 MI.setDesc(get(LMC.MovOpc));
2477 break;
2478 }
2479 case AMDGPU::SI_RETURN: {
2480 const MachineFunction *MF = MBB.getParent();
2481 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2482 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2483 // Hiding the return address use with SI_RETURN may lead to extra kills in
2484 // the function and missing live-ins. We are fine in practice because callee
2485 // saved register handling ensures the register value is restored before
2486 // RET, but we need the undef flag here to appease the MachineVerifier
2487 // liveness checks.
2489 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2490 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2491
2492 MIB.copyImplicitOps(MI);
2493 MI.eraseFromParent();
2494 break;
2495 }
2496
2497 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2498 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2499 MI.setDesc(get(AMDGPU::S_MUL_U64));
2500 break;
2501
2502 case AMDGPU::S_GETPC_B64_pseudo:
2503 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2504 if (ST.hasGetPCZeroExtension()) {
2505 Register Dst = MI.getOperand(0).getReg();
2506 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2507 // Fix up hardware that does not sign-extend the 48-bit PC value by
2508 // inserting: s_sext_i32_i16 dsthi, dsthi
2509 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2510 DstHi)
2511 .addReg(DstHi);
2512 }
2513 break;
2514
2515 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2516 assert(ST.hasBF16PackedInsts());
2517 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2518 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2519 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2520 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2521 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2522 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2523 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2524 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2525 break;
2526 }
2527
2528 return true;
2529}
2530
2533 unsigned SubIdx, const MachineInstr &Orig,
2534 const TargetRegisterInfo &RI) const {
2535
2536 // Try shrinking the instruction to remat only the part needed for current
2537 // context.
2538 // TODO: Handle more cases.
2539 unsigned Opcode = Orig.getOpcode();
2540 switch (Opcode) {
2541 case AMDGPU::S_LOAD_DWORDX16_IMM:
2542 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2543 if (SubIdx != 0)
2544 break;
2545
2546 if (I == MBB.end())
2547 break;
2548
2549 if (I->isBundled())
2550 break;
2551
2552 // Look for a single use of the register that reads it through a subregister.
2553 Register RegToFind = Orig.getOperand(0).getReg();
2554 MachineOperand *UseMO = nullptr;
2555 for (auto &CandMO : I->operands()) {
2556 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2557 continue;
2558 if (UseMO) {
2559 UseMO = nullptr;
2560 break;
2561 }
2562 UseMO = &CandMO;
2563 }
2564 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2565 break;
2566
2567 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2568 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
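// Illustrative example (not in the original source): if the single use reads
// sub4_sub5_sub6_sub7 of an S_LOAD_DWORDX8_IMM result, Offset is 128 bits and
// SubregSize is 128 bits, so the load is rematerialized below as an
// S_LOAD_DWORDX4_IMM with the immediate offset increased by 16 bytes.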
2569
2570 MachineFunction *MF = MBB.getParent();
2571 MachineRegisterInfo &MRI = MF->getRegInfo();
2572 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2573
2574 unsigned NewOpcode = -1;
2575 if (SubregSize == 256)
2576 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2577 else if (SubregSize == 128)
2578 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2579 else
2580 break;
2581
2582 const MCInstrDesc &TID = get(NewOpcode);
2583 const TargetRegisterClass *NewRC =
2584 RI.getAllocatableClass(getRegClass(TID, 0, &RI));
2585 MRI.setRegClass(DestReg, NewRC);
2586
2587 UseMO->setReg(DestReg);
2588 UseMO->setSubReg(AMDGPU::NoSubRegister);
2589
2590 // Use a smaller load with the desired size, possibly with updated offset.
2591 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2592 MI->setDesc(TID);
2593 MI->getOperand(0).setReg(DestReg);
2594 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2595 if (Offset) {
2596 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2597 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2598 OffsetMO->setImm(FinalOffset);
2599 }
2601 for (const MachineMemOperand *MemOp : Orig.memoperands())
2602 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2603 SubregSize / 8));
2604 MI->setMemRefs(*MF, NewMMOs);
2605
2606 MBB.insert(I, MI);
2607 return;
2608 }
2609
2610 default:
2611 break;
2612 }
2613
2614 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2615}
2616
2617std::pair<MachineInstr*, MachineInstr*>
2619 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2620
2621 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2623 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2624 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2625 return std::pair(&MI, nullptr);
2626 }
2627
2628 MachineBasicBlock &MBB = *MI.getParent();
2629 DebugLoc DL = MBB.findDebugLoc(MI);
2630 MachineFunction *MF = MBB.getParent();
2632 Register Dst = MI.getOperand(0).getReg();
2633 unsigned Part = 0;
2634 MachineInstr *Split[2];
2635
2636 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2637 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2638 if (Dst.isPhysical()) {
2639 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2640 } else {
2641 assert(MRI.isSSA());
2642 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2643 MovDPP.addDef(Tmp);
2644 }
2645
2646 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2647 const MachineOperand &SrcOp = MI.getOperand(I);
2648 assert(!SrcOp.isFPImm());
2649 if (SrcOp.isImm()) {
2650 APInt Imm(64, SrcOp.getImm());
2651 Imm.ashrInPlace(Part * 32);
2652 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2653 } else {
2654 assert(SrcOp.isReg());
2655 Register Src = SrcOp.getReg();
2656 if (Src.isPhysical())
2657 MovDPP.addReg(RI.getSubReg(Src, Sub));
2658 else
2659 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2660 }
2661 }
2662
2663 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2664 MovDPP.addImm(MO.getImm());
2665
2666 Split[Part] = MovDPP;
2667 ++Part;
2668 }
2669
2670 if (Dst.isVirtual())
2671 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2672 .addReg(Split[0]->getOperand(0).getReg())
2673 .addImm(AMDGPU::sub0)
2674 .addReg(Split[1]->getOperand(0).getReg())
2675 .addImm(AMDGPU::sub1);
2676
2677 MI.eraseFromParent();
2678 return std::pair(Split[0], Split[1]);
2679}
2680
2681std::optional<DestSourcePair>
2683 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2684 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2685
2686 return std::nullopt;
2687}
2688
2690 AMDGPU::OpName Src0OpName,
2691 MachineOperand &Src1,
2692 AMDGPU::OpName Src1OpName) const {
2693 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2694 if (!Src0Mods)
2695 return false;
2696
2697 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2698 assert(Src1Mods &&
2699 "All commutable instructions have both src0 and src1 modifiers");
2700
2701 int Src0ModsVal = Src0Mods->getImm();
2702 int Src1ModsVal = Src1Mods->getImm();
2703
2704 Src1Mods->setImm(Src0ModsVal);
2705 Src0Mods->setImm(Src1ModsVal);
2706 return true;
2707}
2708
2710 MachineOperand &RegOp,
2711 MachineOperand &NonRegOp) {
2712 Register Reg = RegOp.getReg();
2713 unsigned SubReg = RegOp.getSubReg();
2714 bool IsKill = RegOp.isKill();
2715 bool IsDead = RegOp.isDead();
2716 bool IsUndef = RegOp.isUndef();
2717 bool IsDebug = RegOp.isDebug();
2718
2719 if (NonRegOp.isImm())
2720 RegOp.ChangeToImmediate(NonRegOp.getImm());
2721 else if (NonRegOp.isFI())
2722 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2723 else if (NonRegOp.isGlobal()) {
2724 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2725 NonRegOp.getTargetFlags());
2726 } else
2727 return nullptr;
2728
2729 // Make sure we don't reinterpret a subreg index in the target flags.
2730 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2731
2732 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2733 NonRegOp.setSubReg(SubReg);
2734
2735 return &MI;
2736}
2737
2739 MachineOperand &NonRegOp1,
2740 MachineOperand &NonRegOp2) {
2741 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2742 int64_t NonRegVal = NonRegOp1.getImm();
2743
2744 NonRegOp1.setImm(NonRegOp2.getImm());
2745 NonRegOp2.setImm(NonRegVal);
2746 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2747 NonRegOp2.setTargetFlags(TargetFlags);
2748 return &MI;
2749}
2750
2751bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2752 unsigned OpIdx1) const {
2753 const MCInstrDesc &InstDesc = MI.getDesc();
2754 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2755 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2756
2757 unsigned Opc = MI.getOpcode();
2758 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2759
2760 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2761 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2762
2763 // Swapping doesn't breach the constant bus or literal limits.
2764 // It may move a literal to a position other than src0, which is not allowed
2765 // pre-gfx10. However, most test cases need literals in Src0 for VOP.
2766 // FIXME: After gfx9, a literal can be placed somewhere other than Src0.
2767 if (isVALU(MI)) {
2768 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2769 !isInlineConstant(MO0, OpInfo1))
2770 return false;
2771 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2772 !isInlineConstant(MO1, OpInfo0))
2773 return false;
2774 }
2775
2776 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2777 if (OpInfo1.RegClass == -1)
2778 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2779 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2780 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2781 }
2782 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2783 if (OpInfo0.RegClass == -1)
2784 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2785 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2786 isLegalRegOperand(MI, OpIdx0, MO1);
2787 }
2788
2789 // No need to check 64-bit literals, since swapping does not bring new
2790 // 64-bit literals into the current instruction to fold to 32 bits.
2791
2792 return isImmOperandLegal(MI, OpIdx1, MO0);
2793}
2794
2796 unsigned Src0Idx,
2797 unsigned Src1Idx) const {
2798 assert(!NewMI && "this should never be used");
2799
2800 unsigned Opc = MI.getOpcode();
2801 int CommutedOpcode = commuteOpcode(Opc);
2802 if (CommutedOpcode == -1)
2803 return nullptr;
2804
2805 if (Src0Idx > Src1Idx)
2806 std::swap(Src0Idx, Src1Idx);
2807
2808 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2809 static_cast<int>(Src0Idx) &&
2810 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2811 static_cast<int>(Src1Idx) &&
2812 "inconsistency with findCommutedOpIndices");
2813
2814 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2815 return nullptr;
2816
2817 MachineInstr *CommutedMI = nullptr;
2818 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2819 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2820 if (Src0.isReg() && Src1.isReg()) {
2821 // Be sure to copy the source modifiers to the right place.
2822 CommutedMI =
2823 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2824 } else if (Src0.isReg() && !Src1.isReg()) {
2825 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2826 } else if (!Src0.isReg() && Src1.isReg()) {
2827 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2828 } else if (Src0.isImm() && Src1.isImm()) {
2829 CommutedMI = swapImmOperands(MI, Src0, Src1);
2830 } else {
2831 // FIXME: Found two non registers to commute. This does happen.
2832 return nullptr;
2833 }
2834
2835 if (CommutedMI) {
2836 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2837 Src1, AMDGPU::OpName::src1_modifiers);
2838
2839 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2840 AMDGPU::OpName::src1_sel);
2841
2842 CommutedMI->setDesc(get(CommutedOpcode));
2843 }
2844
2845 return CommutedMI;
2846}
2847
2848// This needs to be implemented because the source modifiers may be inserted
2849// between the true commutable operands, and the base
2850// TargetInstrInfo::commuteInstruction uses it.
2852 unsigned &SrcOpIdx0,
2853 unsigned &SrcOpIdx1) const {
2854 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2855}
2856
2858 unsigned &SrcOpIdx0,
2859 unsigned &SrcOpIdx1) const {
2860 if (!Desc.isCommutable())
2861 return false;
2862
2863 unsigned Opc = Desc.getOpcode();
2864 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2865 if (Src0Idx == -1)
2866 return false;
2867
2868 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2869 if (Src1Idx == -1)
2870 return false;
2871
2872 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2873}
2874
2876 int64_t BrOffset) const {
2877 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2878 // because its dest block is unanalyzable.
2879 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2880
2881 // Convert to dwords.
2882 BrOffset /= 4;
2883
2884 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2885 // from the next instruction.
2886 BrOffset -= 1;
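// Worked example (illustrative, not in the original source): a branch whose
// target is the immediately following instruction has BrOffset == 4 bytes,
// which becomes 1 dword and then 0 after the adjustment, matching
// PC += signext(0 * 4) + 4.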
2887
2888 return isIntN(BranchOffsetBits, BrOffset);
2889}
2890
2893 return MI.getOperand(0).getMBB();
2894}
2895
2897 for (const MachineInstr &MI : MBB->terminators()) {
2898 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2899 MI.getOpcode() == AMDGPU::SI_LOOP)
2900 return true;
2901 }
2902 return false;
2903}
2904
2906 MachineBasicBlock &DestBB,
2907 MachineBasicBlock &RestoreBB,
2908 const DebugLoc &DL, int64_t BrOffset,
2909 RegScavenger *RS) const {
2910 assert(MBB.empty() &&
2911 "new block should be inserted for expanding unconditional branch");
2912 assert(MBB.pred_size() == 1);
2913 assert(RestoreBB.empty() &&
2914 "restore block should be inserted for restoring clobbered registers");
2915
2916 MachineFunction *MF = MBB.getParent();
2919 auto I = MBB.end();
2920 auto &MCCtx = MF->getContext();
2921
2922 if (ST.hasAddPC64Inst()) {
2923 MCSymbol *Offset =
2924 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
2925 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
2927 MCSymbol *PostAddPCLabel =
2928 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
2929 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
2930 auto *OffsetExpr = MCBinaryExpr::createSub(
2931 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2932 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
2933 Offset->setVariableValue(OffsetExpr);
2934 return;
2935 }
2936
2937 assert(RS && "RegScavenger required for long branching");
2938
2939 // FIXME: Virtual register workaround for RegScavenger not working with empty
2940 // blocks.
2941 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2942
2943 // Note: as this runs after the hazard recognizer, we need to apply some
2944 // hazard workarounds directly.
2945 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2946 ST.hasVALUReadSGPRHazard();
2947 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2948 if (FlushSGPRWrites)
2949 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2951 };
2952
2953 // We need to compute the offset relative to the instruction immediately after
2954 // s_getpc_b64. Insert the pc arithmetic code before the last terminator.
2955 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2956 ApplyHazardWorkarounds();
2957
2958 MCSymbol *PostGetPCLabel =
2959 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2960 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2961
2962 MCSymbol *OffsetLo =
2963 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2964 MCSymbol *OffsetHi =
2965 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2966 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2967 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2968 .addReg(PCReg, 0, AMDGPU::sub0)
2969 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2970 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2971 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2972 .addReg(PCReg, 0, AMDGPU::sub1)
2973 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2974 ApplyHazardWorkarounds();
2975
2976 // Insert the indirect branch after the other terminator.
2977 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2978 .addReg(PCReg);
2979
2980 // If a spill is needed for the pc register pair, we need to insert a spill
2981 // restore block right before the destination block, and insert a short branch
2982 // into the old destination block's fallthrough predecessor.
2983 // e.g.:
2984 //
2985 // s_cbranch_scc0 skip_long_branch:
2986 //
2987 // long_branch_bb:
2988 // spill s[8:9]
2989 // s_getpc_b64 s[8:9]
2990 // s_add_u32 s8, s8, restore_bb
2991 // s_addc_u32 s9, s9, 0
2992 // s_setpc_b64 s[8:9]
2993 //
2994 // skip_long_branch:
2995 // foo;
2996 //
2997 // .....
2998 //
2999 // dest_bb_fallthrough_predecessor:
3000 // bar;
3001 // s_branch dest_bb
3002 //
3003 // restore_bb:
3004 // restore s[8:9]
3005 // fallthrough dest_bb
3006 //
3007 // dest_bb:
3008 // buzz;
3009
3010 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3011 Register Scav;
3012
3013 // If we've previously reserved a register for long branches,
3014 // avoid running the scavenger and just use those registers.
3015 if (LongBranchReservedReg) {
3016 RS->enterBasicBlock(MBB);
3017 Scav = LongBranchReservedReg;
3018 } else {
3020 Scav = RS->scavengeRegisterBackwards(
3021 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3022 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3023 }
3024 if (Scav) {
3025 RS->setRegUsed(Scav);
3026 MRI.replaceRegWith(PCReg, Scav);
3027 MRI.clearVirtRegs();
3028 } else {
3029 // As spilling an SGPR needs a VGPR, we reuse the slot of the temporary
3030 // VGPR for the SGPR spill.
3031 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3032 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3033 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3034 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3035 MRI.clearVirtRegs();
3036 }
3037
3038 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3039 // Now the distance can be defined.
3041 MCSymbolRefExpr::create(DestLabel, MCCtx),
3042 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3043 // Add offset assignments.
3044 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3045 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3046 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3047 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3048}
3049
3050unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3051 switch (Cond) {
3052 case SIInstrInfo::SCC_TRUE:
3053 return AMDGPU::S_CBRANCH_SCC1;
3054 case SIInstrInfo::SCC_FALSE:
3055 return AMDGPU::S_CBRANCH_SCC0;
3056 case SIInstrInfo::VCCNZ:
3057 return AMDGPU::S_CBRANCH_VCCNZ;
3058 case SIInstrInfo::VCCZ:
3059 return AMDGPU::S_CBRANCH_VCCZ;
3060 case SIInstrInfo::EXECNZ:
3061 return AMDGPU::S_CBRANCH_EXECNZ;
3062 case SIInstrInfo::EXECZ:
3063 return AMDGPU::S_CBRANCH_EXECZ;
3064 default:
3065 llvm_unreachable("invalid branch predicate");
3066 }
3067}
3068
3069SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3070 switch (Opcode) {
3071 case AMDGPU::S_CBRANCH_SCC0:
3072 return SCC_FALSE;
3073 case AMDGPU::S_CBRANCH_SCC1:
3074 return SCC_TRUE;
3075 case AMDGPU::S_CBRANCH_VCCNZ:
3076 return VCCNZ;
3077 case AMDGPU::S_CBRANCH_VCCZ:
3078 return VCCZ;
3079 case AMDGPU::S_CBRANCH_EXECNZ:
3080 return EXECNZ;
3081 case AMDGPU::S_CBRANCH_EXECZ:
3082 return EXECZ;
3083 default:
3084 return INVALID_BR;
3085 }
3086}
3087
3091 MachineBasicBlock *&FBB,
3093 bool AllowModify) const {
3094 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3095 // Unconditional Branch
3096 TBB = I->getOperand(0).getMBB();
3097 return false;
3098 }
3099
3100 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3101 if (Pred == INVALID_BR)
3102 return true;
3103
3104 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3105 Cond.push_back(MachineOperand::CreateImm(Pred));
3106 Cond.push_back(I->getOperand(1)); // Save the branch register.
3107
3108 ++I;
3109
3110 if (I == MBB.end()) {
3111 // Conditional branch followed by fall-through.
3112 TBB = CondBB;
3113 return false;
3114 }
3115
3116 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3117 TBB = CondBB;
3118 FBB = I->getOperand(0).getMBB();
3119 return false;
3120 }
3121
3122 return true;
3123}
3124
3126 MachineBasicBlock *&FBB,
3128 bool AllowModify) const {
3129 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3130 auto E = MBB.end();
3131 if (I == E)
3132 return false;
3133
3134 // Skip over the instructions that are artificially terminators for special
3135 // exec management.
3136 while (I != E && !I->isBranch() && !I->isReturn()) {
3137 switch (I->getOpcode()) {
3138 case AMDGPU::S_MOV_B64_term:
3139 case AMDGPU::S_XOR_B64_term:
3140 case AMDGPU::S_OR_B64_term:
3141 case AMDGPU::S_ANDN2_B64_term:
3142 case AMDGPU::S_AND_B64_term:
3143 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3144 case AMDGPU::S_MOV_B32_term:
3145 case AMDGPU::S_XOR_B32_term:
3146 case AMDGPU::S_OR_B32_term:
3147 case AMDGPU::S_ANDN2_B32_term:
3148 case AMDGPU::S_AND_B32_term:
3149 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3150 break;
3151 case AMDGPU::SI_IF:
3152 case AMDGPU::SI_ELSE:
3153 case AMDGPU::SI_KILL_I1_TERMINATOR:
3154 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3155 // FIXME: It's messy that these need to be considered here at all.
3156 return true;
3157 default:
3158 llvm_unreachable("unexpected non-branch terminator inst");
3159 }
3160
3161 ++I;
3162 }
3163
3164 if (I == E)
3165 return false;
3166
3167 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3168}
3169
3171 int *BytesRemoved) const {
3172 unsigned Count = 0;
3173 unsigned RemovedSize = 0;
3174 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3175 // Skip over artificial terminators when removing instructions.
3176 if (MI.isBranch() || MI.isReturn()) {
3177 RemovedSize += getInstSizeInBytes(MI);
3178 MI.eraseFromParent();
3179 ++Count;
3180 }
3181 }
3182
3183 if (BytesRemoved)
3184 *BytesRemoved = RemovedSize;
3185
3186 return Count;
3187}
3188
3189// Copy the flags onto the implicit condition register operand.
3191 const MachineOperand &OrigCond) {
3192 CondReg.setIsUndef(OrigCond.isUndef());
3193 CondReg.setIsKill(OrigCond.isKill());
3194}
3195
3198 MachineBasicBlock *FBB,
3200 const DebugLoc &DL,
3201 int *BytesAdded) const {
3202 if (!FBB && Cond.empty()) {
3203 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3204 .addMBB(TBB);
3205 if (BytesAdded)
3206 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3207 return 1;
3208 }
3209
3210 assert(TBB && Cond[0].isImm());
3211
3212 unsigned Opcode
3213 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3214
3215 if (!FBB) {
3216 MachineInstr *CondBr =
3217 BuildMI(&MBB, DL, get(Opcode))
3218 .addMBB(TBB);
3219
3220 // Copy the flags onto the implicit condition register operand.
3221 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3222 fixImplicitOperands(*CondBr);
3223
3224 if (BytesAdded)
3225 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3226 return 1;
3227 }
3228
3229 assert(TBB && FBB);
3230
3231 MachineInstr *CondBr =
3232 BuildMI(&MBB, DL, get(Opcode))
3233 .addMBB(TBB);
3234 fixImplicitOperands(*CondBr);
3235 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3236 .addMBB(FBB);
3237
3238 MachineOperand &CondReg = CondBr->getOperand(1);
3239 CondReg.setIsUndef(Cond[1].isUndef());
3240 CondReg.setIsKill(Cond[1].isKill());
3241
3242 if (BytesAdded)
3243 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3244
3245 return 2;
3246}
3247
3250 if (Cond.size() != 2) {
3251 return true;
3252 }
3253
3254 if (Cond[0].isImm()) {
3255 Cond[0].setImm(-Cond[0].getImm());
3256 return false;
3257 }
3258
3259 return true;
3260}
3261
3264 Register DstReg, Register TrueReg,
3265 Register FalseReg, int &CondCycles,
3266 int &TrueCycles, int &FalseCycles) const {
3267 switch (Cond[0].getImm()) {
3268 case VCCNZ:
3269 case VCCZ: {
3270 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3271 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3272 if (MRI.getRegClass(FalseReg) != RC)
3273 return false;
3274
3275 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3276 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3277
3278 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3279 return RI.hasVGPRs(RC) && NumInsts <= 6;
3280 }
3281 case SCC_TRUE:
3282 case SCC_FALSE: {
3283 // FIXME: We could insert for VGPRs if we could replace the original compare
3284 // with a vector one.
3285 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3286 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3287 if (MRI.getRegClass(FalseReg) != RC)
3288 return false;
3289
3290 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3291
3292 // Multiples of 8 can do s_cselect_b64
3293 if (NumInsts % 2 == 0)
3294 NumInsts /= 2;
3295
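// Illustrative example (not in the original source): a 128-bit SGPR select is
// costed as 2 instructions here, since pairs of 32-bit elements can each be
// handled by a single s_cselect_b64.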
3296 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3297 return RI.isSGPRClass(RC);
3298 }
3299 default:
3300 return false;
3301 }
3302}
3303
3307 Register TrueReg, Register FalseReg) const {
3308 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3309 if (Pred == VCCZ || Pred == SCC_FALSE) {
3310 Pred = static_cast<BranchPredicate>(-Pred);
3311 std::swap(TrueReg, FalseReg);
3312 }
3313
3314 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3315 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3316 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3317
3318 if (DstSize == 32) {
3320 if (Pred == SCC_TRUE) {
3321 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3322 .addReg(TrueReg)
3323 .addReg(FalseReg);
3324 } else {
3325 // Instruction's operands are backwards from what is expected.
3326 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3327 .addReg(FalseReg)
3328 .addReg(TrueReg);
3329 }
3330
3331 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3332 return;
3333 }
3334
3335 if (DstSize == 64 && Pred == SCC_TRUE) {
3337 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3338 .addReg(TrueReg)
3339 .addReg(FalseReg);
3340
3341 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3342 return;
3343 }
3344
3345 static const int16_t Sub0_15[] = {
3346 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3347 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3348 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3349 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3350 };
3351
3352 static const int16_t Sub0_15_64[] = {
3353 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3354 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3355 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3356 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3357 };
3358
3359 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3360 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3361 const int16_t *SubIndices = Sub0_15;
3362 int NElts = DstSize / 32;
3363
3364 // 64-bit select is only available for SALU.
3365 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3366 if (Pred == SCC_TRUE) {
3367 if (NElts % 2) {
3368 SelOp = AMDGPU::S_CSELECT_B32;
3369 EltRC = &AMDGPU::SGPR_32RegClass;
3370 } else {
3371 SelOp = AMDGPU::S_CSELECT_B64;
3372 EltRC = &AMDGPU::SGPR_64RegClass;
3373 SubIndices = Sub0_15_64;
3374 NElts /= 2;
3375 }
3376 }
3377
3379 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3380
3381 I = MIB->getIterator();
3382
3384 for (int Idx = 0; Idx != NElts; ++Idx) {
3385 Register DstElt = MRI.createVirtualRegister(EltRC);
3386 Regs.push_back(DstElt);
3387
3388 unsigned SubIdx = SubIndices[Idx];
3389
3391 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3392 Select =
3393 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3394 .addReg(FalseReg, 0, SubIdx)
3395 .addReg(TrueReg, 0, SubIdx);
3396 } else {
3397 Select =
3398 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3399 .addReg(TrueReg, 0, SubIdx)
3400 .addReg(FalseReg, 0, SubIdx);
3401 }
3402
3403 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3405
3406 MIB.addReg(DstElt)
3407 .addImm(SubIdx);
3408 }
3409}
3410
3412 switch (MI.getOpcode()) {
3413 case AMDGPU::V_MOV_B16_t16_e32:
3414 case AMDGPU::V_MOV_B16_t16_e64:
3415 case AMDGPU::V_MOV_B32_e32:
3416 case AMDGPU::V_MOV_B32_e64:
3417 case AMDGPU::V_MOV_B64_PSEUDO:
3418 case AMDGPU::V_MOV_B64_e32:
3419 case AMDGPU::V_MOV_B64_e64:
3420 case AMDGPU::S_MOV_B32:
3421 case AMDGPU::S_MOV_B64:
3422 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3423 case AMDGPU::COPY:
3424 case AMDGPU::WWM_COPY:
3425 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3426 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3427 case AMDGPU::V_ACCVGPR_MOV_B32:
3428 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3429 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3430 return true;
3431 default:
3432 return false;
3433 }
3434}
3435
3437 switch (MI.getOpcode()) {
3438 case AMDGPU::V_MOV_B16_t16_e32:
3439 case AMDGPU::V_MOV_B16_t16_e64:
3440 return 2;
3441 case AMDGPU::V_MOV_B32_e32:
3442 case AMDGPU::V_MOV_B32_e64:
3443 case AMDGPU::V_MOV_B64_PSEUDO:
3444 case AMDGPU::V_MOV_B64_e32:
3445 case AMDGPU::V_MOV_B64_e64:
3446 case AMDGPU::S_MOV_B32:
3447 case AMDGPU::S_MOV_B64:
3448 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3449 case AMDGPU::COPY:
3450 case AMDGPU::WWM_COPY:
3451 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3452 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3453 case AMDGPU::V_ACCVGPR_MOV_B32:
3454 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3455 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3456 return 1;
3457 default:
3458 llvm_unreachable("MI is not a foldable copy");
3459 }
3460}
3461
3462static constexpr AMDGPU::OpName ModifierOpNames[] = {
3463 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3464 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3465 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3466
3468 unsigned Opc = MI.getOpcode();
3469 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3470 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3471 if (Idx >= 0)
3472 MI.removeOperand(Idx);
3473 }
3474}
3475
3476std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3477 unsigned SubRegIndex) {
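// Usage example (illustrative, not in the original source): extracting
// AMDGPU::sub1 from 0xFFFFFFFF00000005 yields SignExtend64<32>(0xFFFFFFFF),
// i.e. -1, while AMDGPU::sub0 yields 5.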
3478 switch (SubRegIndex) {
3479 case AMDGPU::NoSubRegister:
3480 return Imm;
3481 case AMDGPU::sub0:
3482 return SignExtend64<32>(Imm);
3483 case AMDGPU::sub1:
3484 return SignExtend64<32>(Imm >> 32);
3485 case AMDGPU::lo16:
3486 return SignExtend64<16>(Imm);
3487 case AMDGPU::hi16:
3488 return SignExtend64<16>(Imm >> 16);
3489 case AMDGPU::sub1_lo16:
3490 return SignExtend64<16>(Imm >> 32);
3491 case AMDGPU::sub1_hi16:
3492 return SignExtend64<16>(Imm >> 48);
3493 default:
3494 return std::nullopt;
3495 }
3496
3497 llvm_unreachable("covered subregister switch");
3498}
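// Editor's illustrative note (not part of the upstream source): the mapping
// above slices and sign-extends a 64-bit immediate. Assuming
// Imm = 0x123456789ABCDEF0, the results would be:
//   sub0      -> 0xFFFFFFFF9ABCDEF0   (SignExtend64<32>(0x9ABCDEF0))
//   sub1      -> 0x0000000012345678   (SignExtend64<32>(0x12345678))
//   lo16      -> 0xFFFFFFFFFFFFDEF0   (SignExtend64<16>(0xDEF0))
//   hi16      -> 0xFFFFFFFFFFFF9ABC   (SignExtend64<16>(0x9ABC))
//   sub1_lo16 -> 0x0000000000005678   (SignExtend64<16>(0x5678))
//   sub1_hi16 -> 0x0000000000001234   (SignExtend64<16>(0x1234))
// Any other subregister index yields std::nullopt.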
3499
3500static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3501 switch (Opc) {
3502 case AMDGPU::V_MAC_F16_e32:
3503 case AMDGPU::V_MAC_F16_e64:
3504 case AMDGPU::V_MAD_F16_e64:
3505 return AMDGPU::V_MADAK_F16;
3506 case AMDGPU::V_MAC_F32_e32:
3507 case AMDGPU::V_MAC_F32_e64:
3508 case AMDGPU::V_MAD_F32_e64:
3509 return AMDGPU::V_MADAK_F32;
3510 case AMDGPU::V_FMAC_F32_e32:
3511 case AMDGPU::V_FMAC_F32_e64:
3512 case AMDGPU::V_FMA_F32_e64:
3513 return AMDGPU::V_FMAAK_F32;
3514 case AMDGPU::V_FMAC_F16_e32:
3515 case AMDGPU::V_FMAC_F16_e64:
3516 case AMDGPU::V_FMAC_F16_t16_e64:
3517 case AMDGPU::V_FMAC_F16_fake16_e64:
3518 case AMDGPU::V_FMA_F16_e64:
3519 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3520 ? AMDGPU::V_FMAAK_F16_t16
3521 : AMDGPU::V_FMAAK_F16_fake16
3522 : AMDGPU::V_FMAAK_F16;
3523 case AMDGPU::V_FMAC_F64_e32:
3524 case AMDGPU::V_FMAC_F64_e64:
3525 case AMDGPU::V_FMA_F64_e64:
3526 return AMDGPU::V_FMAAK_F64;
3527 default:
3528 llvm_unreachable("invalid instruction");
3529 }
3530}
3531
3532static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3533 switch (Opc) {
3534 case AMDGPU::V_MAC_F16_e32:
3535 case AMDGPU::V_MAC_F16_e64:
3536 case AMDGPU::V_MAD_F16_e64:
3537 return AMDGPU::V_MADMK_F16;
3538 case AMDGPU::V_MAC_F32_e32:
3539 case AMDGPU::V_MAC_F32_e64:
3540 case AMDGPU::V_MAD_F32_e64:
3541 return AMDGPU::V_MADMK_F32;
3542 case AMDGPU::V_FMAC_F32_e32:
3543 case AMDGPU::V_FMAC_F32_e64:
3544 case AMDGPU::V_FMA_F32_e64:
3545 return AMDGPU::V_FMAMK_F32;
3546 case AMDGPU::V_FMAC_F16_e32:
3547 case AMDGPU::V_FMAC_F16_e64:
3548 case AMDGPU::V_FMAC_F16_t16_e64:
3549 case AMDGPU::V_FMAC_F16_fake16_e64:
3550 case AMDGPU::V_FMA_F16_e64:
3551 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3552 ? AMDGPU::V_FMAMK_F16_t16
3553 : AMDGPU::V_FMAMK_F16_fake16
3554 : AMDGPU::V_FMAMK_F16;
3555 case AMDGPU::V_FMAC_F64_e32:
3556 case AMDGPU::V_FMAC_F64_e64:
3557 case AMDGPU::V_FMA_F64_e64:
3558 return AMDGPU::V_FMAMK_F64;
3559 default:
3560 llvm_unreachable("invalid instruction");
3561 }
3562}
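// Editor's illustrative note (not part of the upstream source): the AK and MK
// variants differ only in which operand carries the 32-bit literal K. Roughly:
//   v_fmaak_f32 vdst, src0, src1, K   ; vdst = src0 * src1 + K
//   v_fmamk_f32 vdst, src0, K, src1   ; vdst = src0 * K    + src1
// getNewFMAAKInst/getNewFMAMKInst select the form that lets a separately
// materialized literal be folded directly into the K slot.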
3563
3564bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3565 Register Reg, MachineRegisterInfo *MRI) const {
3566 int64_t Imm;
3567 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3568 return false;
3569
3570 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3571
3572 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3573
3574 unsigned Opc = UseMI.getOpcode();
3575 if (Opc == AMDGPU::COPY) {
3576 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3577
3578 Register DstReg = UseMI.getOperand(0).getReg();
3579 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3580
3581 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3582
3583 if (HasMultipleUses) {
3584 // TODO: This should fold in more cases with multiple uses, but we need to
3585 // more carefully consider what those uses are.
3586 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3587
3588 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3589 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3590 return false;
3591
3592 // Most of the time folding a 32-bit inline constant is free (though this
3593 // might not be true if we can't later fold it into a real user).
3594 //
3595 // FIXME: This isInlineConstant check is imprecise if
3596 // getConstValDefinedInReg handled the tricky non-mov cases.
3597 if (ImmDefSize == 32 &&
3598 !isInlineConstant(Imm, AMDGPU::OPERAND_REG_IMM_INT32))
3599 return false;
3600 }
3601
3602 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3603 RI.getSubRegIdxSize(UseSubReg) == 16;
3604
3605 if (Is16Bit) {
3606 if (RI.hasVGPRs(DstRC))
3607 return false; // Do not clobber vgpr_hi16
3608
3609 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3610 return false;
3611 }
3612
3613 MachineFunction *MF = UseMI.getMF();
3614
3615 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3616 MCRegister MovDstPhysReg =
3617 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3618
3619 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3620
3621 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3622 for (unsigned MovOp :
3623 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3624 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3625 const MCInstrDesc &MovDesc = get(MovOp);
3626
3627 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI);
3628 if (Is16Bit) {
3629 // We just need to find a correctly sized register class, so the
3630 // subregister index compatibility doesn't matter since we're statically
3631 // extracting the immediate value.
3632 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3633 if (!MovDstRC)
3634 continue;
3635
3636 if (MovDstPhysReg) {
3637 // FIXME: We probably should not do this. If there is a live value in
3638 // the high half of the register, it will be corrupted.
3639 MovDstPhysReg =
3640 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3641 if (!MovDstPhysReg)
3642 continue;
3643 }
3644 }
3645
3646 // Result class isn't the right size, try the next instruction.
3647 if (MovDstPhysReg) {
3648 if (!MovDstRC->contains(MovDstPhysReg))
3649 return false;
3650 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3651 // TODO: This will be overly conservative in the case of 16-bit virtual
3652 // SGPRs. We could hack up the virtual register uses to use a compatible
3653 // 32-bit class.
3654 continue;
3655 }
3656
3657 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3658
3659 // Ensure the interpreted immediate value is a valid operand in the new
3660 // mov.
3661 //
3662 // FIXME: isImmOperandLegal should have form that doesn't require existing
3663 // MachineInstr or MachineOperand
3664 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3665 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3666 break;
3667
3668 NewOpc = MovOp;
3669 break;
3670 }
3671
3672 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3673 return false;
3674
3675 if (Is16Bit) {
3676 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3677 if (MovDstPhysReg)
3678 UseMI.getOperand(0).setReg(MovDstPhysReg);
3679 assert(UseMI.getOperand(1).getReg().isVirtual());
3680 }
3681
3682 const MCInstrDesc &NewMCID = get(NewOpc);
3683 UseMI.setDesc(NewMCID);
3684 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3685 UseMI.addImplicitDefUseOperands(*MF);
3686 return true;
3687 }
3688
3689 if (HasMultipleUses)
3690 return false;
3691
3692 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3693 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3694 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3695 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3696 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3697 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3698 Opc == AMDGPU::V_FMAC_F64_e64) {
3699 // Don't fold if we are using source or output modifiers. The new VOP2
3700 // instructions don't have them.
3701 if (hasAnyModifiersSet(UseMI))
3702 return false;
3703
3704 // If this is a free constant, there's no reason to do this.
3705 // TODO: We could fold this here instead of letting SIFoldOperands do it
3706 // later.
3707 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3708
3709 // Any src operand can be used for the legality check.
3710 if (isInlineConstant(UseMI, Src0Idx, Imm))
3711 return false;
3712
3713 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3714
3715 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3716 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3717
3718 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3719 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3720 (Src1->isReg() && Src1->getReg() == Reg)) {
3721 MachineOperand *RegSrc =
3722 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3723 if (!RegSrc->isReg())
3724 return false;
3725 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3726 ST.getConstantBusLimit(Opc) < 2)
3727 return false;
3728
3729 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3730 return false;
3731
3732 // If src2 is also a literal constant then we have to choose which one to
3733 // fold. In general it is better to choose madak so that the other literal
3734 // can be materialized in an sgpr instead of a vgpr:
3735 // s_mov_b32 s0, literal
3736 // v_madak_f32 v0, s0, v0, literal
3737 // Instead of:
3738 // v_mov_b32 v1, literal
3739 // v_madmk_f32 v0, v0, literal, v1
3740 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3741 if (Def && Def->isMoveImmediate() &&
3742 !isInlineConstant(Def->getOperand(1)))
3743 return false;
3744
3745 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3746 if (pseudoToMCOpcode(NewOpc) == -1)
3747 return false;
3748
3749 // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3750 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3751 // restricting their register classes. For now just bail out.
3752 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3753 NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3754 return false;
3755
3756 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3757 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3758
3759 // FIXME: This would be a lot easier if we could return a new instruction
3760 // instead of having to modify in place.
3761
3762 Register SrcReg = RegSrc->getReg();
3763 unsigned SrcSubReg = RegSrc->getSubReg();
3764 Src0->setReg(SrcReg);
3765 Src0->setSubReg(SrcSubReg);
3766 Src0->setIsKill(RegSrc->isKill());
3767
3768 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3769 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3770 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3771 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3772 UseMI.untieRegOperand(
3773 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3774
3775 Src1->ChangeToImmediate(*SubRegImm);
3776
3777 removeModOperands(UseMI);
3778 UseMI.setDesc(get(NewOpc));
3779
3780 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3781 if (DeleteDef)
3782 DefMI.eraseFromParent();
3783
3784 return true;
3785 }
3786
3787 // Added part is the constant: Use v_madak_{f16, f32}.
3788 if (Src2->isReg() && Src2->getReg() == Reg) {
3789 if (ST.getConstantBusLimit(Opc) < 2) {
3790 // Not allowed to use constant bus for another operand.
3791 // We can however allow an inline immediate as src0.
3792 bool Src0Inlined = false;
3793 if (Src0->isReg()) {
3794 // Try to inline the constant if possible.
3795 // If the def is a move-immediate and this is its only use, folding it
3796 // here saves a VGPR.
3797 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3798 if (Def && Def->isMoveImmediate() &&
3799 isInlineConstant(Def->getOperand(1)) &&
3800 MRI->hasOneNonDBGUse(Src0->getReg())) {
3801 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3802 Src0Inlined = true;
3803 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3804 RI.isSGPRReg(*MRI, Src0->getReg())) {
3805 return false;
3806 }
3807 // VGPR is okay as Src0 - fallthrough
3808 }
3809
3810 if (Src1->isReg() && !Src0Inlined) {
3811 // We have one slot for inlinable constant so far - try to fill it
3812 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3813 if (Def && Def->isMoveImmediate() &&
3814 isInlineConstant(Def->getOperand(1)) &&
3815 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3816 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3817 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3818 return false;
3819 // VGPR is okay as Src1 - fallthrough
3820 }
3821 }
3822
3823 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3824 if (pseudoToMCOpcode(NewOpc) == -1)
3825 return false;
3826
3827 // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3828 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3829 // restricting their register classes. For now just bail out.
3830 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3831 NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3832 return false;
3833
3834 // FIXME: This would be a lot easier if we could return a new instruction
3835 // instead of having to modify in place.
3836
3837 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3838 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3839 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3840 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3841 UseMI.untieRegOperand(
3842 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3843
3844 const std::optional<int64_t> SubRegImm =
3845 extractSubregFromImm(Imm, Src2->getSubReg());
3846
3847 // ChangingToImmediate adds Src2 back to the instruction.
3848 Src2->ChangeToImmediate(*SubRegImm);
3849
3850 // These come before src2.
3851 removeModOperands(UseMI);
3852 UseMI.setDesc(get(NewOpc));
3853 // It might happen that UseMI was commuted and we now have an SGPR as
3854 // src1. If so, an inline constant together with an SGPR would violate
3855 // the constant bus restriction, so legalize the operands.
3856 legalizeOperands(UseMI);
3857
3858 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3859 if (DeleteDef)
3860 DefMI.eraseFromParent();
3861
3862 return true;
3863 }
3864 }
3865
3866 return false;
3867}
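// Editor's illustrative example (not from the upstream source), in rough MIR
// with register classes and flags elided; the vreg names are made up:
//   %k:vgpr_32 = V_MOV_B32_e32 1077936128, implicit $exec   ; 3.0f, not inline
//   %d:vgpr_32 = V_FMA_F32_e64 0, %a, 0, %b, 0, %k, 0, 0
// foldImmediate can rewrite the single use into the AK form and then erase the
// now-dead mov:
//   %d:vgpr_32 = V_FMAAK_F32 %a, %b, 1077936128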
3868
3869static bool
3870memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3871 ArrayRef<const MachineOperand *> BaseOps2) {
3872 if (BaseOps1.size() != BaseOps2.size())
3873 return false;
3874 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3875 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3876 return false;
3877 }
3878 return true;
3879}
3880
3881static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3882 LocationSize WidthB, int OffsetB) {
3883 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3884 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3885 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3886 return LowWidth.hasValue() &&
3887 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3888}
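// Editor's illustrative note (not part of the upstream source): two accesses
// are disjoint when the lower one ends at or before the higher one starts,
// e.g. Width/Offset pairs {4, 0} and {8, 4} do not overlap (0 + 4 <= 4),
// while {8, 0} and {4, 4} do (0 + 8 > 4).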
3889
3890bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3891 const MachineInstr &MIb) const {
3892 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3893 int64_t Offset0, Offset1;
3894 LocationSize Dummy0 = LocationSize::precise(0);
3895 LocationSize Dummy1 = LocationSize::precise(0);
3896 bool Offset0IsScalable, Offset1IsScalable;
3897 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3898 Dummy0, &RI) ||
3899 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3900 Dummy1, &RI))
3901 return false;
3902
3903 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3904 return false;
3905
3906 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3907 // FIXME: Handle ds_read2 / ds_write2.
3908 return false;
3909 }
3910 LocationSize Width0 = MIa.memoperands().front()->getSize();
3911 LocationSize Width1 = MIb.memoperands().front()->getSize();
3912 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3913}
3914
3915bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3916 const MachineInstr &MIb) const {
3917 assert(MIa.mayLoadOrStore() &&
3918 "MIa must load from or modify a memory location");
3919 assert(MIb.mayLoadOrStore() &&
3920 "MIb must load from or modify a memory location");
3921
3922 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3923 return false;
3924
3925 // XXX - Can we relax this between address spaces?
3926 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3927 return false;
3928
3929 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3930 return false;
3931
3932 // TODO: Should we check the address space from the MachineMemOperand? That
3933 // would allow us to distinguish objects we know don't alias based on the
3934 // underlying address space, even if it was lowered to a different one,
3935 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3936 // buffer.
3937 if (isDS(MIa)) {
3938 if (isDS(MIb))
3939 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3940
3941 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3942 }
3943
3944 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3945 if (isMUBUF(MIb) || isMTBUF(MIb))
3946 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3947
3948 if (isFLAT(MIb))
3949 return isFLATScratch(MIb);
3950
3951 return !isSMRD(MIb);
3952 }
3953
3954 if (isSMRD(MIa)) {
3955 if (isSMRD(MIb))
3956 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3957
3958 if (isFLAT(MIb))
3959 return isFLATScratch(MIb);
3960
3961 return !isMUBUF(MIb) && !isMTBUF(MIb);
3962 }
3963
3964 if (isFLAT(MIa)) {
3965 if (isFLAT(MIb)) {
3966 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3967 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3968 return true;
3969
3970 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3971 }
3972
3973 return false;
3974 }
3975
3976 return false;
3977}
3978
3979static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3980 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3981 if (Reg.isPhysical())
3982 return false;
3983 auto *Def = MRI.getUniqueVRegDef(Reg);
3984 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3985 Imm = Def->getOperand(1).getImm();
3986 if (DefMI)
3987 *DefMI = Def;
3988 return true;
3989 }
3990 return false;
3991}
3992
3993static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3994 MachineInstr **DefMI = nullptr) {
3995 if (!MO->isReg())
3996 return false;
3997 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3998 const MachineRegisterInfo &MRI = MF->getRegInfo();
3999 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
4000}
4001
4002static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
4003 MachineInstr &NewMI) {
4004 if (LV) {
4005 unsigned NumOps = MI.getNumOperands();
4006 for (unsigned I = 1; I < NumOps; ++I) {
4007 MachineOperand &Op = MI.getOperand(I);
4008 if (Op.isReg() && Op.isKill())
4009 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4010 }
4011 }
4012}
4013
4014static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4015 switch (Opc) {
4016 case AMDGPU::V_MAC_F16_e32:
4017 case AMDGPU::V_MAC_F16_e64:
4018 return AMDGPU::V_MAD_F16_e64;
4019 case AMDGPU::V_MAC_F32_e32:
4020 case AMDGPU::V_MAC_F32_e64:
4021 return AMDGPU::V_MAD_F32_e64;
4022 case AMDGPU::V_MAC_LEGACY_F32_e32:
4023 case AMDGPU::V_MAC_LEGACY_F32_e64:
4024 return AMDGPU::V_MAD_LEGACY_F32_e64;
4025 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4026 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4027 return AMDGPU::V_FMA_LEGACY_F32_e64;
4028 case AMDGPU::V_FMAC_F16_e32:
4029 case AMDGPU::V_FMAC_F16_e64:
4030 case AMDGPU::V_FMAC_F16_t16_e64:
4031 case AMDGPU::V_FMAC_F16_fake16_e64:
4032 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4033 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4034 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4035 : AMDGPU::V_FMA_F16_gfx9_e64;
4036 case AMDGPU::V_FMAC_F32_e32:
4037 case AMDGPU::V_FMAC_F32_e64:
4038 return AMDGPU::V_FMA_F32_e64;
4039 case AMDGPU::V_FMAC_F64_e32:
4040 case AMDGPU::V_FMAC_F64_e64:
4041 return AMDGPU::V_FMA_F64_e64;
4042 default:
4043 llvm_unreachable("invalid instruction");
4044 }
4045}
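// Editor's illustrative note (not part of the upstream source): this maps a
// two-address MAC/FMAC, whose dst is tied to src2, onto the untied
// three-address MAD/FMA used by convertToThreeAddress below, e.g.
// V_FMAC_F32_e64 -> V_FMA_F32_e64 and V_MAC_F16_e32 -> V_MAD_F16_e64.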
4046
4047MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4048 LiveVariables *LV,
4049 LiveIntervals *LIS) const {
4050 MachineBasicBlock &MBB = *MI.getParent();
4051 unsigned Opc = MI.getOpcode();
4052
4053 // Handle MFMA.
4054 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4055 if (NewMFMAOpc != -1) {
4056 MachineInstrBuilder MIB =
4057 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4058 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4059 MIB.add(MI.getOperand(I));
4060 updateLiveVariables(LV, MI, *MIB);
4061 if (LIS) {
4062 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4063 // SlotIndex of defs needs to be updated when converting to early-clobber
4064 MachineOperand &Def = MIB->getOperand(0);
4065 if (Def.isEarlyClobber() && Def.isReg() &&
4066 LIS->hasInterval(Def.getReg())) {
4067 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
4068 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
4069 auto &LI = LIS->getInterval(Def.getReg());
4070 auto UpdateDefIndex = [&](LiveRange &LR) {
4071 auto *S = LR.find(OldIndex);
4072 if (S != LR.end() && S->start == OldIndex) {
4073 assert(S->valno && S->valno->def == OldIndex);
4074 S->start = NewIndex;
4075 S->valno->def = NewIndex;
4076 }
4077 };
4078 UpdateDefIndex(LI);
4079 for (auto &SR : LI.subranges())
4080 UpdateDefIndex(SR);
4081 }
4082 }
4083 return MIB;
4084 }
4085
4086 if (SIInstrInfo::isWMMA(MI)) {
4087 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4088 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4089 .setMIFlags(MI.getFlags());
4090 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4091 MIB->addOperand(MI.getOperand(I));
4092
4093 updateLiveVariables(LV, MI, *MIB);
4094 if (LIS)
4095 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4096
4097 return MIB;
4098 }
4099
4100 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4101 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4102 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4103 "present pre-RA");
4104
4105 // Handle MAC/FMAC.
4106 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4107 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4108 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4109 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4110 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4111 bool Src0Literal = false;
4112
4113 switch (Opc) {
4114 default:
4115 return nullptr;
4116 case AMDGPU::V_MAC_F16_e64:
4117 case AMDGPU::V_FMAC_F16_e64:
4118 case AMDGPU::V_FMAC_F16_t16_e64:
4119 case AMDGPU::V_FMAC_F16_fake16_e64:
4120 case AMDGPU::V_MAC_F32_e64:
4121 case AMDGPU::V_MAC_LEGACY_F32_e64:
4122 case AMDGPU::V_FMAC_F32_e64:
4123 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4124 case AMDGPU::V_FMAC_F64_e64:
4125 break;
4126 case AMDGPU::V_MAC_F16_e32:
4127 case AMDGPU::V_FMAC_F16_e32:
4128 case AMDGPU::V_MAC_F32_e32:
4129 case AMDGPU::V_MAC_LEGACY_F32_e32:
4130 case AMDGPU::V_FMAC_F32_e32:
4131 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4132 case AMDGPU::V_FMAC_F64_e32: {
4133 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4134 AMDGPU::OpName::src0);
4135 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4136 if (!Src0->isReg() && !Src0->isImm())
4137 return nullptr;
4138
4139 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4140 Src0Literal = true;
4141
4142 break;
4143 }
4144 }
4145
4146 MachineInstrBuilder MIB;
4145
4147 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4148 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4149 const MachineOperand *Src0Mods =
4150 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4151 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4152 const MachineOperand *Src1Mods =
4153 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4154 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4155 const MachineOperand *Src2Mods =
4156 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4157 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4158 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4159 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4160
4161 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4162 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4163 // If we have an SGPR input, we will violate the constant bus restriction.
4164 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4165 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4166 MachineInstr *DefMI;
4167 const auto killDef = [&]() -> void {
4168 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4169 // The only user is the instruction which will be killed.
4170 Register DefReg = DefMI->getOperand(0).getReg();
4171
4172 if (MRI.hasOneNonDBGUse(DefReg)) {
4173 // We cannot just remove the DefMI here, calling pass will crash.
4174 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
4175 DefMI->getOperand(0).setIsDead(true);
4176 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
4177 DefMI->removeOperand(I);
4178 if (LV)
4179 LV->getVarInfo(DefReg).AliveBlocks.clear();
4180 }
4181
4182 if (LIS) {
4183 LiveInterval &DefLI = LIS->getInterval(DefReg);
4184
4185 // We cannot delete the original instruction here, so hack out the use
4186 // in the original instruction with a dummy register so we can use
4187 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4188 // not have the complexity of deleting a use to consider here.
4189 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4190 for (MachineOperand &MIOp : MI.uses()) {
4191 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4192 MIOp.setIsUndef(true);
4193 MIOp.setReg(DummyReg);
4194 }
4195 }
4196
4197 LIS->shrinkToUses(&DefLI);
4198 }
4199 };
4200
4201 int64_t Imm;
4202 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4203 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4204 if (pseudoToMCOpcode(NewOpc) != -1) {
4205 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4206 .add(*Dst)
4207 .add(*Src0)
4208 .add(*Src1)
4209 .addImm(Imm)
4210 .setMIFlags(MI.getFlags());
4211 updateLiveVariables(LV, MI, *MIB);
4212 if (LIS)
4213 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4214 killDef();
4215 return MIB;
4216 }
4217 }
4218 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4219 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4220 if (pseudoToMCOpcode(NewOpc) != -1) {
4221 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4222 .add(*Dst)
4223 .add(*Src0)
4224 .addImm(Imm)
4225 .add(*Src2)
4226 .setMIFlags(MI.getFlags());
4227 updateLiveVariables(LV, MI, *MIB);
4228
4229 if (LIS)
4230 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4231 killDef();
4232 return MIB;
4233 }
4234 }
4235 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4236 if (Src0Literal) {
4237 Imm = Src0->getImm();
4238 DefMI = nullptr;
4239 }
4240 if (pseudoToMCOpcode(NewOpc) != -1 &&
4241 isOperandLegal(
4242 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4243 Src1)) {
4244 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4245 .add(*Dst)
4246 .add(*Src1)
4247 .addImm(Imm)
4248 .add(*Src2)
4249 .setMIFlags(MI.getFlags());
4250 updateLiveVariables(LV, MI, *MIB);
4251
4252 if (LIS)
4253 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4254 if (DefMI)
4255 killDef();
4256 return MIB;
4257 }
4258 }
4259 }
4260
4261 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4262 // if VOP3 does not allow a literal operand.
4263 if (Src0Literal && !ST.hasVOP3Literal())
4264 return nullptr;
4265
4266 unsigned NewOpc = getNewFMAInst(ST, Opc);
4267
4268 if (pseudoToMCOpcode(NewOpc) == -1)
4269 return nullptr;
4270
4271 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4272 .add(*Dst)
4273 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4274 .add(*Src0)
4275 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4276 .add(*Src1)
4277 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4278 .add(*Src2)
4279 .addImm(Clamp ? Clamp->getImm() : 0)
4280 .addImm(Omod ? Omod->getImm() : 0)
4281 .setMIFlags(MI.getFlags());
4282 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4283 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4284 updateLiveVariables(LV, MI, *MIB);
4285 if (LIS)
4286 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4287 return MIB;
4288}
4289
4290// It's not generally safe to move VALU instructions across these since it will
4291// start using the register as a base index rather than directly.
4292// XXX - Why isn't hasSideEffects sufficient for these?
4293static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4294 switch (MI.getOpcode()) {
4295 case AMDGPU::S_SET_GPR_IDX_ON:
4296 case AMDGPU::S_SET_GPR_IDX_MODE:
4297 case AMDGPU::S_SET_GPR_IDX_OFF:
4298 return true;
4299 default:
4300 return false;
4301 }
4302}
4303
4304bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4305 const MachineBasicBlock *MBB,
4306 const MachineFunction &MF) const {
4307 // Skipping the check for SP writes in the base implementation. The reason it
4308 // was added was apparently due to compile time concerns.
4309 //
4310 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4311 // but is probably avoidable.
4312
4313 // Copied from base implementation.
4314 // Terminators and labels can't be scheduled around.
4315 if (MI.isTerminator() || MI.isPosition())
4316 return true;
4317
4318 // INLINEASM_BR can jump to another block
4319 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4320 return true;
4321
4322 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4323 return true;
4324
4325 // Target-independent instructions do not have an implicit-use of EXEC, even
4326 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4327 // boundaries prevents incorrect movements of such instructions.
4328 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4329 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4330 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4331 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4332 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4333 changesVGPRIndexingMode(MI);
4334}
4335
4336bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4337 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4338 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4339 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4340}
4341
4342bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
4343 if (!isFLAT(MI) || isFLATGlobal(MI))
4344 return false;
4345
4346 // If scratch is not initialized, we can never access it.
4347 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4348 return false;
4349
4350 // SCRATCH instructions always access scratch.
4351 if (isFLATScratch(MI))
4352 return true;
4353
4354 // If there are no memory operands then conservatively assume the flat
4355 // operation may access scratch.
4356 if (MI.memoperands_empty())
4357 return true;
4358
4359 // See if any memory operand specifies an address space that involves scratch.
4360 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4361 unsigned AS = Memop->getAddrSpace();
4362 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4363 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4364 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4365 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4366 }
4367 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4368 });
4369}
4370
4371bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
4372 assert(isFLAT(MI));
4373
4374 // All flat instructions use the VMEM counter except prefetch.
4375 if (!usesVM_CNT(MI))
4376 return false;
4377
4378 // If there are no memory operands then conservatively assume the flat
4379 // operation may access VMEM.
4380 if (MI.memoperands_empty())
4381 return true;
4382
4383 // See if any memory operand specifies an address space that involves VMEM.
4384 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
4385 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4386 // (GDS) address space is not supported by flat operations. Therefore, simply
4387 // return true unless only the LDS address space is found.
4388 for (const MachineMemOperand *Memop : MI.memoperands()) {
4389 unsigned AS = Memop->getAddrSpace();
4391 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4392 return true;
4393 }
4394
4395 return false;
4396}
4397
4398bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
4399 assert(isFLAT(MI));
4400
4401 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
4402 if (!usesLGKM_CNT(MI))
4403 return false;
4404
4405 // If in tgsplit mode then there can be no use of LDS.
4406 if (ST.isTgSplitEnabled())
4407 return false;
4408
4409 // If there are no memory operands then conservatively assume the flat
4410 // operation may access LDS.
4411 if (MI.memoperands_empty())
4412 return true;
4413
4414 // See if any memory operand specifies an address space that involves LDS.
4415 for (const MachineMemOperand *Memop : MI.memoperands()) {
4416 unsigned AS = Memop->getAddrSpace();
4417 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
4418 return true;
4419 }
4420
4421 return false;
4422}
4423
4424bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4425 // Skip the full operand and register alias search modifiesRegister
4426 // does. There's only a handful of instructions that touch this, it's only an
4427 // implicit def, and doesn't alias any other registers.
4428 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4429}
4430
4431bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4432 unsigned Opcode = MI.getOpcode();
4433
4434 if (MI.mayStore() && isSMRD(MI))
4435 return true; // scalar store or atomic
4436
4437 // This will terminate the function when other lanes may need to continue.
4438 if (MI.isReturn())
4439 return true;
4440
4441 // These instructions cause shader I/O that may cause hardware lockups
4442 // when executed with an empty EXEC mask.
4443 //
4444 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4445 // EXEC = 0, but checking for that case here seems not worth it
4446 // given the typical code patterns.
4447 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4448 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4449 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4450 return true;
4451
4452 if (MI.isCall() || MI.isInlineAsm())
4453 return true; // conservative assumption
4454
4455 // Assume that barrier interactions are only intended with active lanes.
4456 if (isBarrier(Opcode))
4457 return true;
4458
4459 // A mode change is a scalar operation that influences vector instructions.
4460 if (modifiesModeRegister(MI))
4461 return true;
4462
4463 // These are like SALU instructions in terms of effects, so it's questionable
4464 // whether we should return true for those.
4465 //
4466 // However, executing them with EXEC = 0 causes them to operate on undefined
4467 // data, which we avoid by returning true here.
4468 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4469 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4470 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4471 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4472 return true;
4473
4474 return false;
4475}
4476
4477bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4478 const MachineInstr &MI) const {
4479 if (MI.isMetaInstruction())
4480 return false;
4481
4482 // This won't read exec if this is an SGPR->SGPR copy.
4483 if (MI.isCopyLike()) {
4484 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4485 return true;
4486
4487 // Make sure this isn't copying exec as a normal operand
4488 return MI.readsRegister(AMDGPU::EXEC, &RI);
4489 }
4490
4491 // Make a conservative assumption about the callee.
4492 if (MI.isCall())
4493 return true;
4494
4495 // Be conservative with any unhandled generic opcodes.
4496 if (!isTargetSpecificOpcode(MI.getOpcode()))
4497 return true;
4498
4499 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4500}
4501
4502bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4503 switch (Imm.getBitWidth()) {
4504 case 1: // This likely will be a condition code mask.
4505 return true;
4506
4507 case 32:
4508 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4509 ST.hasInv2PiInlineImm());
4510 case 64:
4511 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4512 ST.hasInv2PiInlineImm());
4513 case 16:
4514 return ST.has16BitInsts() &&
4515 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4516 ST.hasInv2PiInlineImm());
4517 default:
4518 llvm_unreachable("invalid bitwidth");
4519 }
4520}
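// Editor's illustrative note (not part of the upstream source): for 32-bit
// operands the inline constants are roughly the integers -16..64 plus a small
// set of FP encodings (+/-0.5, +/-1.0, +/-2.0, +/-4.0, and 1/(2*pi) when the
// subtarget supports the inv-2pi inline immediate). Anything else must be
// emitted as a literal, which costs encoding space and a constant bus slot.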
4521
4522bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4523 APInt IntImm = Imm.bitcastToAPInt();
4524 int64_t IntImmVal = IntImm.getSExtValue();
4525 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4526 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4527 default:
4528 llvm_unreachable("invalid fltSemantics");
4529 case APFloat::S_IEEEsingle:
4530 case APFloat::S_IEEEdouble:
4531 return isInlineConstant(IntImm);
4532 case APFloat::S_BFloat:
4533 return ST.has16BitInsts() &&
4534 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4535 case APFloat::S_IEEEhalf:
4536 return ST.has16BitInsts() &&
4537 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4538 }
4539}
4540
4541bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4542 // MachineOperand provides no way to tell the true operand size, since it only
4543 // records a 64-bit value. We need to know the size to determine if a 32-bit
4544 // floating point immediate bit pattern is legal for an integer immediate. It
4545 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4546 switch (OperandType) {
4556 int32_t Trunc = static_cast<int32_t>(Imm);
4557 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4558 }
4564 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4567 // We would expect inline immediates to not be concerned with an integer/fp
4568 // distinction. However, in the case of 16-bit integer operations, the
4569 // "floating point" values appear to not work. It seems read the low 16-bits
4570 // of 32-bit immediates, which happens to always work for the integer
4571 // values.
4572 //
4573 // See llvm bugzilla 46302.
4574 //
4575 // TODO: Theoretically we could use op-sel to use the high bits of the
4576 // 32-bit FP values.
4588 return false;
4591 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4592 // A few special case instructions have 16-bit operands on subtargets
4593 // where 16-bit instructions are not legal.
4594 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4595 // constants in these cases
4596 int16_t Trunc = static_cast<int16_t>(Imm);
4597 return ST.has16BitInsts() &&
4598 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4599 }
4600
4601 return false;
4602 }
4605 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4606 int16_t Trunc = static_cast<int16_t>(Imm);
4607 return ST.has16BitInsts() &&
4608 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4609 }
4610 return false;
4611 }
4615 return false;
4617 return isLegalAV64PseudoImm(Imm);
4620 // Always embedded in the instruction for free.
4621 return true;
4631 // Just ignore anything else.
4632 return true;
4633 default:
4634 llvm_unreachable("invalid operand type");
4635 }
4636}
4637
4638static bool compareMachineOp(const MachineOperand &Op0,
4639 const MachineOperand &Op1) {
4640 if (Op0.getType() != Op1.getType())
4641 return false;
4642
4643 switch (Op0.getType()) {
4645 return Op0.getReg() == Op1.getReg();
4647 return Op0.getImm() == Op1.getImm();
4648 default:
4649 llvm_unreachable("Didn't expect to be comparing these operand types");
4650 }
4651}
4652
4653bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4654 const MCOperandInfo &OpInfo) const {
4655 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4656 return true;
4657
4658 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4659 return false;
4660
4661 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4662 return true;
4663
4664 return ST.hasVOP3Literal();
4665}
4666
4667bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4668 int64_t ImmVal) const {
4669 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4670 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4671 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4672 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4673 AMDGPU::OpName::src2))
4674 return false;
4675 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4676 }
4677
4678 return isLiteralOperandLegal(InstDesc, OpInfo);
4679}
4680
4681bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4682 const MachineOperand &MO) const {
4683 if (MO.isImm())
4684 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4685
4686 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4687 "unexpected imm-like operand kind");
4688 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4689 return isLiteralOperandLegal(InstDesc, OpInfo);
4690}
4691
4692bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
4693 // 2 32-bit inline constants packed into one.
4694 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4695 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4696}
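// Editor's illustrative example (not from the upstream source):
// 0x3F80000040000000 splits into Hi = 0x3F800000 (1.0f) and Lo = 0x40000000
// (2.0f), both inlinable, so it is a legal AV_MOV_B64_IMM_PSEUDO immediate;
// 0x0000000012345678 is not, because the low half is not an inline constant.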
4697
4698bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4699 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4700 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4701 return false;
4702
4703 int Op32 = AMDGPU::getVOPe32(Opcode);
4704 if (Op32 == -1)
4705 return false;
4706
4707 return pseudoToMCOpcode(Op32) != -1;
4708}
4709
4710bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4711 // The src0_modifier operand is present on all instructions
4712 // that have modifiers.
4713
4714 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4715}
4716
4717bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4718 AMDGPU::OpName OpName) const {
4719 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4720 return Mods && Mods->getImm();
4721}
4722
4723bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4724 return any_of(ModifierOpNames,
4725 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4726}
4727
4728bool SIInstrInfo::canShrink(const MachineInstr &MI,
4729 const MachineRegisterInfo &MRI) const {
4730 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4731 // Can't shrink instruction with three operands.
4732 if (Src2) {
4733 switch (MI.getOpcode()) {
4734 default: return false;
4735
4736 case AMDGPU::V_ADDC_U32_e64:
4737 case AMDGPU::V_SUBB_U32_e64:
4738 case AMDGPU::V_SUBBREV_U32_e64: {
4739 const MachineOperand *Src1
4740 = getNamedOperand(MI, AMDGPU::OpName::src1);
4741 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4742 return false;
4743 // Additional verification is needed for sdst/src2.
4744 return true;
4745 }
4746 case AMDGPU::V_MAC_F16_e64:
4747 case AMDGPU::V_MAC_F32_e64:
4748 case AMDGPU::V_MAC_LEGACY_F32_e64:
4749 case AMDGPU::V_FMAC_F16_e64:
4750 case AMDGPU::V_FMAC_F16_t16_e64:
4751 case AMDGPU::V_FMAC_F16_fake16_e64:
4752 case AMDGPU::V_FMAC_F32_e64:
4753 case AMDGPU::V_FMAC_F64_e64:
4754 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4755 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4756 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4757 return false;
4758 break;
4759
4760 case AMDGPU::V_CNDMASK_B32_e64:
4761 break;
4762 }
4763 }
4764
4765 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4766 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4767 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4768 return false;
4769
4770 // We don't need to check src0, all input types are legal, so just make sure
4771 // src0 isn't using any modifiers.
4772 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4773 return false;
4774
4775 // Can it be shrunk to a valid 32 bit opcode?
4776 if (!hasVALU32BitEncoding(MI.getOpcode()))
4777 return false;
4778
4779 // Check output modifiers
4780 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4781 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4782 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4783 // TODO: Can we avoid checking bound_ctrl/fi here?
4784 // They are only used by permlane*_swap special case.
4785 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4786 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4787}
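// Editor's illustrative note (not part of the upstream source): e.g. a
// VOP3-encoded
//   %d = V_ADD_F32_e64 0, %a, 0, %b, 0, 0
// with no source or output modifiers and a VGPR src1 can be shrunk to the
// VOP2 form
//   %d = V_ADD_F32_e32 %a, %b
// whereas a nonzero clamp/omod or an SGPR in src1 keeps the e64 encoding.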
4788
4789// Set VCC operand with all flags from \p Orig, except for setting it as
4790// implicit.
4791static void copyFlagsToImplicitVCC(MachineInstr &MI,
4792 const MachineOperand &Orig) {
4793
4794 for (MachineOperand &Use : MI.implicit_operands()) {
4795 if (Use.isUse() &&
4796 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4797 Use.setIsUndef(Orig.isUndef());
4798 Use.setIsKill(Orig.isKill());
4799 return;
4800 }
4801 }
4802}
4803
4804MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4805 unsigned Op32) const {
4806 MachineBasicBlock *MBB = MI.getParent();
4807
4808 const MCInstrDesc &Op32Desc = get(Op32);
4809 MachineInstrBuilder Inst32 =
4810 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4811 .setMIFlags(MI.getFlags());
4812
4813 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4814 // For VOPC instructions, this is replaced by an implicit def of vcc.
4815
4816 // We assume the defs of the shrunk opcode are in the same order, and the
4817 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4818 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4819 Inst32.add(MI.getOperand(I));
4820
4821 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4822
4823 int Idx = MI.getNumExplicitDefs();
4824 for (const MachineOperand &Use : MI.explicit_uses()) {
4825 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4827 continue;
4828
4829 if (&Use == Src2) {
4830 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4831 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4832 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4833 // of vcc was already added during the initial BuildMI, but we
4834 // 1) may need to change vcc to vcc_lo to preserve the original register
4835 // 2) have to preserve the original flags.
4836 copyFlagsToImplicitVCC(*Inst32, *Src2);
4837 continue;
4838 }
4839 }
4840
4841 Inst32.add(Use);
4842 }
4843
4844 // FIXME: Losing implicit operands
4845 fixImplicitOperands(*Inst32);
4846 return Inst32;
4847}
4848
4849bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
4850 // Null is free
4851 Register Reg = RegOp.getReg();
4852 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
4853 return false;
4854
4855 // SGPRs use the constant bus
4856
4857 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
4858 // physical register operands should also count, except for exec.
4859 if (RegOp.isImplicit())
4860 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
4861
4862 // SGPRs use the constant bus
4863 return AMDGPU::SReg_32RegClass.contains(Reg) ||
4864 AMDGPU::SReg_64RegClass.contains(Reg);
4865}
4866
4868 const MachineRegisterInfo &MRI) const {
4869 Register Reg = RegOp.getReg();
4870 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4871 : physRegUsesConstantBus(RegOp);
4872}
4873
4875 const MachineOperand &MO,
4876 const MCOperandInfo &OpInfo) const {
4877 // Literal constants use the constant bus.
4878 if (!MO.isReg())
4879 return !isInlineConstant(MO, OpInfo);
4880
4881 Register Reg = MO.getReg();
4882 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4883 : physRegUsesConstantBus(MO);
4884}
4885
4886static Register findImplicitSGPRRead(const MachineInstr &MI) {
4887 for (const MachineOperand &MO : MI.implicit_operands()) {
4888 // We only care about reads.
4889 if (MO.isDef())
4890 continue;
4891
4892 switch (MO.getReg()) {
4893 case AMDGPU::VCC:
4894 case AMDGPU::VCC_LO:
4895 case AMDGPU::VCC_HI:
4896 case AMDGPU::M0:
4897 case AMDGPU::FLAT_SCR:
4898 return MO.getReg();
4899
4900 default:
4901 break;
4902 }
4903 }
4904
4905 return Register();
4906}
4907
4908static bool shouldReadExec(const MachineInstr &MI) {
4909 if (SIInstrInfo::isVALU(MI)) {
4910 switch (MI.getOpcode()) {
4911 case AMDGPU::V_READLANE_B32:
4912 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4913 case AMDGPU::V_WRITELANE_B32:
4914 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4915 return false;
4916 }
4917
4918 return true;
4919 }
4920
4921 if (MI.isPreISelOpcode() ||
4922 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4923 SIInstrInfo::isSALU(MI) ||
4924 SIInstrInfo::isSMRD(MI))
4925 return false;
4926
4927 return true;
4928}
4929
4930static bool isRegOrFI(const MachineOperand &MO) {
4931 return MO.isReg() || MO.isFI();
4932}
4933
4934static bool isSubRegOf(const SIRegisterInfo &TRI,
4935 const MachineOperand &SuperVec,
4936 const MachineOperand &SubReg) {
4937 if (SubReg.getReg().isPhysical())
4938 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4939
4940 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4941 SubReg.getReg() == SuperVec.getReg();
4942}
4943
4944// Verify the illegal copy from vector register to SGPR for generic opcode COPY
4945bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
4946 const MachineRegisterInfo &MRI,
4947 StringRef &ErrInfo) const {
4948 Register DstReg = MI.getOperand(0).getReg();
4949 Register SrcReg = MI.getOperand(1).getReg();
4950 // This is a check for copy from vector register to SGPR
4951 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
4952 ErrInfo = "illegal copy from vector register to SGPR";
4953 return false;
4954 }
4955 return true;
4956}
4957
4958bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4959 StringRef &ErrInfo) const {
4960 uint16_t Opcode = MI.getOpcode();
4961 const MachineFunction *MF = MI.getParent()->getParent();
4962 const MachineRegisterInfo &MRI = MF->getRegInfo();
4963
4964 // FIXME: At this point the COPY verify is done only for non-ssa forms.
4965 // Find a better property to recognize the point where instruction selection
4966 // is just done.
4967 // We can only enforce this check after SIFixSGPRCopies pass so that the
4968 // illegal copies are legalized and thereafter we don't expect a pass
4969 // inserting similar copies.
4970 if (!MRI.isSSA() && MI.isCopy())
4971 return verifyCopy(MI, MRI, ErrInfo);
4972
4973 if (SIInstrInfo::isGenericOpcode(Opcode))
4974 return true;
4975
4976 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4977 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4978 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4979 int Src3Idx = -1;
4980 if (Src0Idx == -1) {
4981 // VOPD V_DUAL_* instructions use different operand names.
4982 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4983 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4984 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4985 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4986 }
4987
4988 // Make sure the number of operands is correct.
4989 const MCInstrDesc &Desc = get(Opcode);
4990 if (!Desc.isVariadic() &&
4991 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4992 ErrInfo = "Instruction has wrong number of operands.";
4993 return false;
4994 }
4995
4996 if (MI.isInlineAsm()) {
4997 // Verify register classes for inlineasm constraints.
4998 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4999 I != E; ++I) {
5000 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
5001 if (!RC)
5002 continue;
5003
5004 const MachineOperand &Op = MI.getOperand(I);
5005 if (!Op.isReg())
5006 continue;
5007
5008 Register Reg = Op.getReg();
5009 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5010 ErrInfo = "inlineasm operand has incorrect register class.";
5011 return false;
5012 }
5013 }
5014
5015 return true;
5016 }
5017
5018 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5019 ErrInfo = "missing memory operand from image instruction.";
5020 return false;
5021 }
5022
5023 // Make sure the register classes are correct.
5024 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5025 const MachineOperand &MO = MI.getOperand(i);
5026 if (MO.isFPImm()) {
5027 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5028 "all fp values to integers.";
5029 return false;
5030 }
5031
5032 int RegClass = Desc.operands()[i].RegClass;
5033
5034 const MCOperandInfo &OpInfo = Desc.operands()[i];
5035 switch (OpInfo.OperandType) {
5036 case MCOI::OPERAND_REGISTER:
5037 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5038 ErrInfo = "Illegal immediate value for operand.";
5039 return false;
5040 }
5041 break;
5054 break;
5056 break;
5057 break;
5071 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5072 ErrInfo = "Illegal immediate value for operand.";
5073 return false;
5074 }
5075 break;
5076 }
5078 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5079 ErrInfo = "Expected inline constant for operand.";
5080 return false;
5081 }
5082 break;
5086 break;
5091 // Check if this operand is an immediate.
5092 // FrameIndex operands will be replaced by immediates, so they are
5093 // allowed.
5094 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5095 ErrInfo = "Expected immediate, but got non-immediate";
5096 return false;
5097 }
5098 break;
5102 break;
5103 default:
5104 if (OpInfo.isGenericType())
5105 continue;
5106 break;
5107 }
5108
5109 if (!MO.isReg())
5110 continue;
5111 Register Reg = MO.getReg();
5112 if (!Reg)
5113 continue;
5114
5115 // FIXME: Ideally we would have separate instruction definitions with the
5116 // aligned register constraint.
5117 // FIXME: We do not verify inline asm operands, but custom inline asm
5118 // verification is broken anyway
5119 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
5120 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5121 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5122 if (const TargetRegisterClass *SubRC =
5123 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5124 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5125 if (RC)
5126 RC = SubRC;
5127 }
5128 }
5129
5130 // Check that this is the aligned version of the class.
5131 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5132 ErrInfo = "Subtarget requires even aligned vector registers";
5133 return false;
5134 }
5135 }
5136
5137 if (RegClass != -1) {
5138 if (Reg.isVirtual())
5139 continue;
5140
5141 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5142 if (!RC->contains(Reg)) {
5143 ErrInfo = "Operand has incorrect register class.";
5144 return false;
5145 }
5146 }
5147 }
5148
5149 // Verify SDWA
5150 if (isSDWA(MI)) {
5151 if (!ST.hasSDWA()) {
5152 ErrInfo = "SDWA is not supported on this target";
5153 return false;
5154 }
5155
5156 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5157 AMDGPU::OpName::dst_sel}) {
5158 const MachineOperand *MO = getNamedOperand(MI, Op);
5159 if (!MO)
5160 continue;
5161 int64_t Imm = MO->getImm();
5162 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5163 ErrInfo = "Invalid SDWA selection";
5164 return false;
5165 }
5166 }
5167
5168 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5169
5170 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5171 if (OpIdx == -1)
5172 continue;
5173 const MachineOperand &MO = MI.getOperand(OpIdx);
5174
5175 if (!ST.hasSDWAScalar()) {
5176 // Only VGPRS on VI
5177 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5178 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5179 return false;
5180 }
5181 } else {
5182 // No immediates on GFX9
5183 if (!MO.isReg()) {
5184 ErrInfo =
5185 "Only reg allowed as operands in SDWA instructions on GFX9+";
5186 return false;
5187 }
5188 }
5189 }
5190
5191 if (!ST.hasSDWAOmod()) {
5192 // No omod allowed on VI
5193 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5194 if (OMod != nullptr &&
5195 (!OMod->isImm() || OMod->getImm() != 0)) {
5196 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5197 return false;
5198 }
5199 }
5200
5201 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5202 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5203 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5204 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5205 const MachineOperand *Src0ModsMO =
5206 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5207 unsigned Mods = Src0ModsMO->getImm();
5208 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5209 Mods & SISrcMods::SEXT) {
5210 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5211 return false;
5212 }
5213 }
5214
5215 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5216 if (isVOPC(BasicOpcode)) {
5217 if (!ST.hasSDWASdst() && DstIdx != -1) {
5218 // Only vcc allowed as dst on VI for VOPC
5219 const MachineOperand &Dst = MI.getOperand(DstIdx);
5220 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5221 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5222 return false;
5223 }
5224 } else if (!ST.hasSDWAOutModsVOPC()) {
5225 // No clamp allowed on GFX9 for VOPC
5226 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5227 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5228 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5229 return false;
5230 }
5231
5232 // No omod allowed on GFX9 for VOPC
5233 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5234 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5235 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5236 return false;
5237 }
5238 }
5239 }
5240
5241 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5242 if (DstUnused && DstUnused->isImm() &&
5243 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5244 const MachineOperand &Dst = MI.getOperand(DstIdx);
5245 if (!Dst.isReg() || !Dst.isTied()) {
5246 ErrInfo = "Dst register should have tied register";
5247 return false;
5248 }
5249
5250 const MachineOperand &TiedMO =
5251 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5252 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5253 ErrInfo =
5254 "Dst register should be tied to implicit use of preserved register";
5255 return false;
5256 }
5257 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5258 ErrInfo = "Dst register should use same physical register as preserved";
5259 return false;
5260 }
5261 }
5262 }
5263
5264 // Verify MIMG / VIMAGE / VSAMPLE
5265 if (isImage(Opcode) && !MI.mayStore()) {
5266 // Ensure that the return type used is large enough for all the options
5267 // being used TFE/LWE require an extra result register.
5268 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5269 if (DMask) {
5270 uint64_t DMaskImm = DMask->getImm();
5271 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5272 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5273 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5274 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5275
5276 // Adjust for packed 16 bit values
5277 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5278 RegCount = divideCeil(RegCount, 2);
5279
5280 // Adjust if using LWE or TFE
5281 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5282 RegCount += 1;
5283
5284 const uint32_t DstIdx =
5285 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5286 const MachineOperand &Dst = MI.getOperand(DstIdx);
5287 if (Dst.isReg()) {
5288 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5289 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5290 if (RegCount > DstSize) {
5291 ErrInfo = "Image instruction returns too many registers for dst "
5292 "register class";
5293 return false;
5294 }
5295 }
5296 }
5297 }
5298
5299 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5300 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5301 unsigned ConstantBusCount = 0;
5302 bool UsesLiteral = false;
5303 const MachineOperand *LiteralVal = nullptr;
5304
5305 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5306 if (ImmIdx != -1) {
5307 ++ConstantBusCount;
5308 UsesLiteral = true;
5309 LiteralVal = &MI.getOperand(ImmIdx);
5310 }
5311
5312 SmallVector<Register, 2> SGPRsUsed;
5313 Register SGPRUsed;
5314
5315 // Only look at the true operands. Only a real operand can use the constant
5316 // bus, and we don't want to check pseudo-operands like the source modifier
5317 // flags.
5318 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5319 if (OpIdx == -1)
5320 continue;
5321 const MachineOperand &MO = MI.getOperand(OpIdx);
5322 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5323 if (MO.isReg()) {
5324 SGPRUsed = MO.getReg();
5325 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5326 ++ConstantBusCount;
5327 SGPRsUsed.push_back(SGPRUsed);
5328 }
5329 } else if (!MO.isFI()) { // Treat FI like a register.
5330 if (!UsesLiteral) {
5331 ++ConstantBusCount;
5332 UsesLiteral = true;
5333 LiteralVal = &MO;
5334 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5335 assert(isVOP2(MI) || isVOP3(MI));
5336 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5337 return false;
5338 }
5339 }
5340 }
5341 }
5342
5343 SGPRUsed = findImplicitSGPRRead(MI);
5344 if (SGPRUsed) {
5345 // Implicit uses may safely overlap true operands
5346 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5347 return !RI.regsOverlap(SGPRUsed, SGPR);
5348 })) {
5349 ++ConstantBusCount;
5350 SGPRsUsed.push_back(SGPRUsed);
5351 }
5352 }
5353
5354 // v_writelane_b32 is an exception to the constant bus restriction:
5355 // vsrc0 can be an SGPR, constant or m0, and the lane select an SGPR, m0 or inline constant
5356 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5357 Opcode != AMDGPU::V_WRITELANE_B32) {
5358 ErrInfo = "VOP* instruction violates constant bus restriction";
5359 return false;
5360 }
5361
5362 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5363 ErrInfo = "VOP3 instruction uses literal";
5364 return false;
5365 }
5366 }
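  // Worked example for the constant bus check above (illustrative): on
  // subtargets where getConstantBusLimit() is 1 (pre-GFX10), an encoding like
  // "v_add_f32_e32 v0, s0, s1" reads two distinct SGPRs over the constant bus
  // and is rejected here, while "v_add_f32_e32 v0, s0, v1" is accepted; GFX10+
  // raises the limit to 2 and, with hasVOP3Literal(), also allows a VOP3 literal.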
5367
5368 // Special case for writelane - this can break the multiple constant bus rule,
5369 // but still can't use more than one SGPR register
5370 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5371 unsigned SGPRCount = 0;
5372 Register SGPRUsed;
5373
5374 for (int OpIdx : {Src0Idx, Src1Idx}) {
5375 if (OpIdx == -1)
5376 break;
5377
5378 const MachineOperand &MO = MI.getOperand(OpIdx);
5379
5380 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5381 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5382 if (MO.getReg() != SGPRUsed)
5383 ++SGPRCount;
5384 SGPRUsed = MO.getReg();
5385 }
5386 }
5387 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5388 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5389 return false;
5390 }
5391 }
5392 }
5393
5394 // Verify misc. restrictions on specific instructions.
5395 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5396 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5397 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5398 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5399 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5400 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5401 if (!compareMachineOp(Src0, Src1) &&
5402 !compareMachineOp(Src0, Src2)) {
5403 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5404 return false;
5405 }
5406 }
5407 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5408 SISrcMods::ABS) ||
5409 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5410 SISrcMods::ABS) ||
5411 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5412 SISrcMods::ABS)) {
5413 ErrInfo = "ABS not allowed in VOP3B instructions";
5414 return false;
5415 }
5416 }
5417
5418 if (isSOP2(MI) || isSOPC(MI)) {
5419 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5420 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5421
5422 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5423 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5424 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5425 !Src0.isIdenticalTo(Src1)) {
5426 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5427 return false;
5428 }
5429 }
5430
5431 if (isSOPK(MI)) {
5432 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5433 if (Desc.isBranch()) {
5434 if (!Op->isMBB()) {
5435 ErrInfo = "invalid branch target for SOPK instruction";
5436 return false;
5437 }
5438 } else {
5439 uint64_t Imm = Op->getImm();
5440 if (sopkIsZext(Opcode)) {
5441 if (!isUInt<16>(Imm)) {
5442 ErrInfo = "invalid immediate for SOPK instruction";
5443 return false;
5444 }
5445 } else {
5446 if (!isInt<16>(Imm)) {
5447 ErrInfo = "invalid immediate for SOPK instruction";
5448 return false;
5449 }
5450 }
5451 }
5452 }
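  // Illustrative example for the SOPK immediate check above: S_MOVK_I32
  // sign-extends its 16-bit immediate, so -32768 passes isInt<16>, while
  // opcodes for which sopkIsZext() holds (e.g. the unsigned S_CMPK forms)
  // must instead satisfy isUInt<16>.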
5453
5454 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5455 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5456 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5457 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5458 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5459 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5460
5461 const unsigned StaticNumOps =
5462 Desc.getNumOperands() + Desc.implicit_uses().size();
5463 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5464
5465 // Allow additional implicit operands. This allows a fixup done by the post
5466 // RA scheduler where the main implicit operand is killed and implicit-defs
5467 // are added for sub-registers that remain live after this instruction.
5468 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5469 ErrInfo = "missing implicit register operands";
5470 return false;
5471 }
5472
5473 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5474 if (IsDst) {
5475 if (!Dst->isUse()) {
5476 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5477 return false;
5478 }
5479
5480 unsigned UseOpIdx;
5481 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5482 UseOpIdx != StaticNumOps + 1) {
5483 ErrInfo = "movrel implicit operands should be tied";
5484 return false;
5485 }
5486 }
5487
5488 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5489 const MachineOperand &ImpUse
5490 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5491 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5492 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5493 ErrInfo = "src0 should be subreg of implicit vector use";
5494 return false;
5495 }
5496 }
5497
5498 // Make sure we aren't losing exec uses in the td files. This mostly requires
5499 // being careful when using let Uses to try to add other use registers.
5500 if (shouldReadExec(MI)) {
5501 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5502 ErrInfo = "VALU instruction does not implicitly read exec mask";
5503 return false;
5504 }
5505 }
5506
5507 if (isSMRD(MI)) {
5508 if (MI.mayStore() &&
5509 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5510 // The register offset form of scalar stores may only use m0 as the
5511 // soffset register.
5512 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5513 if (Soff && Soff->getReg() != AMDGPU::M0) {
5514 ErrInfo = "scalar stores must use m0 as offset register";
5515 return false;
5516 }
5517 }
5518 }
5519
5520 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5521 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5522 if (Offset->getImm() != 0) {
5523 ErrInfo = "subtarget does not support offsets in flat instructions";
5524 return false;
5525 }
5526 }
5527
5528 if (isDS(MI) && !ST.hasGDS()) {
5529 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5530 if (GDSOp && GDSOp->getImm() != 0) {
5531 ErrInfo = "GDS is not supported on this subtarget";
5532 return false;
5533 }
5534 }
5535
5536 if (isImage(MI)) {
5537 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5538 if (DimOp) {
5539 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5540 AMDGPU::OpName::vaddr0);
5541 AMDGPU::OpName RSrcOpName =
5542 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5543 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5544 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5545 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5546 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5547 const AMDGPU::MIMGDimInfo *Dim =
5548 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5549
5550 if (!Dim) {
5551 ErrInfo = "dim is out of range";
5552 return false;
5553 }
5554
5555 bool IsA16 = false;
5556 if (ST.hasR128A16()) {
5557 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5558 IsA16 = R128A16->getImm() != 0;
5559 } else if (ST.hasA16()) {
5560 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5561 IsA16 = A16->getImm() != 0;
5562 }
5563
5564 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5565
5566 unsigned AddrWords =
5567 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5568
5569 unsigned VAddrWords;
5570 if (IsNSA) {
5571 VAddrWords = RsrcIdx - VAddr0Idx;
5572 if (ST.hasPartialNSAEncoding() &&
5573 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5574 unsigned LastVAddrIdx = RsrcIdx - 1;
5575 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5576 }
5577 } else {
5578 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5579 if (AddrWords > 12)
5580 AddrWords = 16;
5581 }
5582
5583 if (VAddrWords != AddrWords) {
5584 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5585 << " but got " << VAddrWords << "\n");
5586 ErrInfo = "bad vaddr size";
5587 return false;
5588 }
5589 }
5590 }
5591
5592 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5593 if (DppCt) {
5594 using namespace AMDGPU::DPP;
5595
5596 unsigned DC = DppCt->getImm();
5597 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5598 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5599 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5600 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5601 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5602 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5603 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5604 ErrInfo = "Invalid dpp_ctrl value";
5605 return false;
5606 }
5607 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5608 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5609 ErrInfo = "Invalid dpp_ctrl value: "
5610 "wavefront shifts are not supported on GFX10+";
5611 return false;
5612 }
5613 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5614 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5615 ErrInfo = "Invalid dpp_ctrl value: "
5616 "broadcasts are not supported on GFX10+";
5617 return false;
5618 }
5619 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5620 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5621 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5622 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5623 !ST.hasGFX90AInsts()) {
5624 ErrInfo = "Invalid dpp_ctrl value: "
5625 "row_newbroadcast/row_share is not supported before "
5626 "GFX90A/GFX10";
5627 return false;
5628 }
5629 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5630 ErrInfo = "Invalid dpp_ctrl value: "
5631 "row_share and row_xmask are not supported before GFX10";
5632 return false;
5633 }
5634 }
5635
5636 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5637 !AMDGPU::isLegalDPALU_DPPControl(ST, DC) &&
5638 AMDGPU::isDPALU_DPP(Desc, ST)) {
5639 ErrInfo = "Invalid dpp_ctrl value: "
5640 "DP ALU dpp only support row_newbcast";
5641 return false;
5642 }
5643 }
5644
5645 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5646 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5647 AMDGPU::OpName DataName =
5648 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5649 const MachineOperand *Data = getNamedOperand(MI, DataName);
5650 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5651 if (Data && !Data->isReg())
5652 Data = nullptr;
5653
5654 if (ST.hasGFX90AInsts()) {
5655 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5656 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5657 ErrInfo = "Invalid register class: "
5658 "vdata and vdst should be both VGPR or AGPR";
5659 return false;
5660 }
5661 if (Data && Data2 &&
5662 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5663 ErrInfo = "Invalid register class: "
5664 "both data operands should be VGPR or AGPR";
5665 return false;
5666 }
5667 } else {
5668 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5669 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5670 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5671 ErrInfo = "Invalid register class: "
5672 "agpr loads and stores not supported on this GPU";
5673 return false;
5674 }
5675 }
5676 }
5677
5678 if (ST.needsAlignedVGPRs()) {
5679 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5680 const MachineOperand *Op = getNamedOperand(MI, OpName);
5681 if (!Op)
5682 return true;
5683 Register Reg = Op->getReg();
5684 if (Reg.isPhysical())
5685 return !(RI.getHWRegIndex(Reg) & 1);
5686 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5687 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5688 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5689 };
5690
5691 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5692 Opcode == AMDGPU::DS_GWS_BARRIER) {
5693
5694 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5695 ErrInfo = "Subtarget requires even aligned vector registers "
5696 "for DS_GWS instructions";
5697 return false;
5698 }
5699 }
5700
5701 if (isMIMG(MI)) {
5702 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5703 ErrInfo = "Subtarget requires even aligned vector registers "
5704 "for vaddr operand of image instructions";
5705 return false;
5706 }
5707 }
5708 }
5709
5710 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5711 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5712 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5713 ErrInfo = "Invalid register class: "
5714 "v_accvgpr_write with an SGPR is not supported on this GPU";
5715 return false;
5716 }
5717 }
5718
5719 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5720 const MachineOperand &SrcOp = MI.getOperand(1);
5721 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5722 ErrInfo = "pseudo expects only physical SGPRs";
5723 return false;
5724 }
5725 }
5726
5727 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5728 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5729 if (!ST.hasScaleOffset()) {
5730 ErrInfo = "Subtarget does not support offset scaling";
5731 return false;
5732 }
5733 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5734 ErrInfo = "Instruction does not support offset scaling";
5735 return false;
5736 }
5737 }
5738 }
5739
5740 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5741 // information.
5742 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5743 for (unsigned I = 0; I < 3; ++I) {
5744 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
5745 return false;
5746 }
5747 }
5748
5749 return true;
5750}
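// Example of how the verifier above is exercised: the machine verifier
// (e.g. "llc -verify-machineinstrs") calls verifyInstruction() on every
// machine instruction, so a VALU instruction whose .td definition drops the
// implicit exec read is reported as "VALU instruction does not implicitly
// read exec mask".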
5751
5752// It is more readable to list mapped opcodes on the same line.
5753// clang-format off
5754
5755 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5756 switch (MI.getOpcode()) {
5757 default: return AMDGPU::INSTRUCTION_LIST_END;
5758 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5759 case AMDGPU::COPY: return AMDGPU::COPY;
5760 case AMDGPU::PHI: return AMDGPU::PHI;
5761 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5762 case AMDGPU::WQM: return AMDGPU::WQM;
5763 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5764 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5765 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5766 case AMDGPU::S_MOV_B32: {
5767 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5768 return MI.getOperand(1).isReg() ||
5769 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5770 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5771 }
5772 case AMDGPU::S_ADD_I32:
5773 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5774 case AMDGPU::S_ADDC_U32:
5775 return AMDGPU::V_ADDC_U32_e32;
5776 case AMDGPU::S_SUB_I32:
5777 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5778 // FIXME: These are not consistently handled, and selected when the carry is
5779 // used.
5780 case AMDGPU::S_ADD_U32:
5781 return AMDGPU::V_ADD_CO_U32_e32;
5782 case AMDGPU::S_SUB_U32:
5783 return AMDGPU::V_SUB_CO_U32_e32;
5784 case AMDGPU::S_ADD_U64_PSEUDO:
5785 return AMDGPU::V_ADD_U64_PSEUDO;
5786 case AMDGPU::S_SUB_U64_PSEUDO:
5787 return AMDGPU::V_SUB_U64_PSEUDO;
5788 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5789 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5790 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5791 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5792 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5793 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5794 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5795 case AMDGPU::S_XNOR_B32:
5796 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5797 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5798 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5799 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5800 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5801 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5802 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5803 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5804 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5805 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5806 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5807 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5808 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5809 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5810 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5811 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5812 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5813 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5814 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5815 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5816 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5817 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5818 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5819 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5820 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5821 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5822 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5823 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5824 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5825 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5826 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5827 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5828 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5829 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5830 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5831 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5832 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5833 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5834 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5835 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5836 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5837 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5838 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5839 case AMDGPU::S_CVT_F32_F16:
5840 case AMDGPU::S_CVT_HI_F32_F16:
5841 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5842 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5843 case AMDGPU::S_CVT_F16_F32:
5844 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5845 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5846 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5847 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5848 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5849 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5850 case AMDGPU::S_CEIL_F16:
5851 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5852 : AMDGPU::V_CEIL_F16_fake16_e64;
5853 case AMDGPU::S_FLOOR_F16:
5854 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5855 : AMDGPU::V_FLOOR_F16_fake16_e64;
5856 case AMDGPU::S_TRUNC_F16:
5857 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
5858 : AMDGPU::V_TRUNC_F16_fake16_e64;
5859 case AMDGPU::S_RNDNE_F16:
5860 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
5861 : AMDGPU::V_RNDNE_F16_fake16_e64;
5862 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5863 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5864 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5865 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5866 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5867 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5868 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5869 case AMDGPU::S_ADD_F16:
5870 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
5871 : AMDGPU::V_ADD_F16_fake16_e64;
5872 case AMDGPU::S_SUB_F16:
5873 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
5874 : AMDGPU::V_SUB_F16_fake16_e64;
5875 case AMDGPU::S_MIN_F16:
5876 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
5877 : AMDGPU::V_MIN_F16_fake16_e64;
5878 case AMDGPU::S_MAX_F16:
5879 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
5880 : AMDGPU::V_MAX_F16_fake16_e64;
5881 case AMDGPU::S_MINIMUM_F16:
5882 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
5883 : AMDGPU::V_MINIMUM_F16_fake16_e64;
5884 case AMDGPU::S_MAXIMUM_F16:
5885 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
5886 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
5887 case AMDGPU::S_MUL_F16:
5888 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
5889 : AMDGPU::V_MUL_F16_fake16_e64;
5890 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5891 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5892 case AMDGPU::S_FMAC_F16:
5893 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
5894 : AMDGPU::V_FMAC_F16_fake16_e64;
5895 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5896 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5897 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5898 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5899 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5900 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5901 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5902 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5903 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5904 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5905 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5906 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5907 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5908 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5909 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5910 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5911 case AMDGPU::S_CMP_LT_F16:
5912 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
5913 : AMDGPU::V_CMP_LT_F16_fake16_e64;
5914 case AMDGPU::S_CMP_EQ_F16:
5915 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
5916 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
5917 case AMDGPU::S_CMP_LE_F16:
5918 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
5919 : AMDGPU::V_CMP_LE_F16_fake16_e64;
5920 case AMDGPU::S_CMP_GT_F16:
5921 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
5922 : AMDGPU::V_CMP_GT_F16_fake16_e64;
5923 case AMDGPU::S_CMP_LG_F16:
5924 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
5925 : AMDGPU::V_CMP_LG_F16_fake16_e64;
5926 case AMDGPU::S_CMP_GE_F16:
5927 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
5928 : AMDGPU::V_CMP_GE_F16_fake16_e64;
5929 case AMDGPU::S_CMP_O_F16:
5930 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
5931 : AMDGPU::V_CMP_O_F16_fake16_e64;
5932 case AMDGPU::S_CMP_U_F16:
5933 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
5934 : AMDGPU::V_CMP_U_F16_fake16_e64;
5935 case AMDGPU::S_CMP_NGE_F16:
5936 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
5937 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
5938 case AMDGPU::S_CMP_NLG_F16:
5939 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
5940 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
5941 case AMDGPU::S_CMP_NGT_F16:
5942 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
5943 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
5944 case AMDGPU::S_CMP_NLE_F16:
5945 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
5946 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
5947 case AMDGPU::S_CMP_NEQ_F16:
5948 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
5949 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
5950 case AMDGPU::S_CMP_NLT_F16:
5951 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
5952 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
5953 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5954 case AMDGPU::V_S_EXP_F16_e64:
5955 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
5956 : AMDGPU::V_EXP_F16_fake16_e64;
5957 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5958 case AMDGPU::V_S_LOG_F16_e64:
5959 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
5960 : AMDGPU::V_LOG_F16_fake16_e64;
5961 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5962 case AMDGPU::V_S_RCP_F16_e64:
5963 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
5964 : AMDGPU::V_RCP_F16_fake16_e64;
5965 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5966 case AMDGPU::V_S_RSQ_F16_e64:
5967 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
5968 : AMDGPU::V_RSQ_F16_fake16_e64;
5969 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5970 case AMDGPU::V_S_SQRT_F16_e64:
5971 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
5972 : AMDGPU::V_SQRT_F16_fake16_e64;
5973 }
5974 llvm_unreachable(
5975 "Unexpected scalar opcode without corresponding vector one!");
5976}
5977
5978// clang-format on
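// Example use of the mapping above: when an SALU result has to be recomputed
// in VALU (see moveToVALU()), "S_AND_B32 %2, %0, %1" is rewritten with
// V_AND_B32_e64, while opcodes without a VALU twin (e.g. S_XNOR_B32 when
// hasDLInsts() is false) return INSTRUCTION_LIST_END and are expanded by
// other means.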
5979
5980 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5981 MachineBasicBlock &MBB,
5982 MachineBasicBlock::iterator MBBI,
5983 const DebugLoc &DL, Register Reg,
5984 bool IsSCCLive,
5985 SlotIndexes *Indexes) const {
5986 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5987 const SIInstrInfo *TII = ST.getInstrInfo();
5988 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
5989 if (IsSCCLive) {
5990 // Insert two move instructions, one to save the original value of EXEC and
5991 // the other to turn on all bits in EXEC. This is required as we can't use
5992 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
5993 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
5994 .addReg(LMC.ExecReg);
5995 auto FlipExecMI =
5996 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
5997 if (Indexes) {
5998 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5999 Indexes->insertMachineInstrInMaps(*FlipExecMI);
6000 }
6001 } else {
6002 auto SaveExec =
6003 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
6004 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
6005 if (Indexes)
6006 Indexes->insertMachineInstrInMaps(*SaveExec);
6007 }
6008}
6009
6010 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
6011 MachineBasicBlock::iterator MBBI,
6012 const DebugLoc &DL, Register Reg,
6013 SlotIndexes *Indexes) const {
6014 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
6015 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6016 .addReg(Reg, RegState::Kill);
6017 if (Indexes)
6018 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6019}
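// Typical pairing for the two helpers above: frame lowering saves the live
// exec mask into an SGPR with insertScratchExecCopy(), emits whole-wave
// spill/restore code with all lanes enabled, and then switches back with
// restoreExec().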
6020
6021 MachineInstr *
6022 SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const {
6023 assert(MF.getInfo<SIMachineFunctionInfo>()->isWholeWaveFunction() &&
6024 "Not a whole wave func");
6025 MachineBasicBlock &MBB = *MF.begin();
6026 for (MachineInstr &MI : MBB)
6027 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6028 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6029 return &MI;
6030
6031 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6032}
6033
6034 static const TargetRegisterClass *
6035 adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
6036 const MCInstrDesc &TID, unsigned RCID) {
6037 if (!ST.hasGFX90AInsts() && (TID.mayLoad() || TID.mayStore())) {
6038 switch (RCID) {
6039 case AMDGPU::AV_32RegClassID:
6040 RCID = AMDGPU::VGPR_32RegClassID;
6041 break;
6042 case AMDGPU::AV_64RegClassID:
6043 RCID = AMDGPU::VReg_64RegClassID;
6044 break;
6045 case AMDGPU::AV_96RegClassID:
6046 RCID = AMDGPU::VReg_96RegClassID;
6047 break;
6048 case AMDGPU::AV_128RegClassID:
6049 RCID = AMDGPU::VReg_128RegClassID;
6050 break;
6051 case AMDGPU::AV_160RegClassID:
6052 RCID = AMDGPU::VReg_160RegClassID;
6053 break;
6054 case AMDGPU::AV_512RegClassID:
6055 RCID = AMDGPU::VReg_512RegClassID;
6056 break;
6057 default:
6058 break;
6059 }
6060 }
6061
6062 return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
6063}
6064
6065const TargetRegisterClass *
6066SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum,
6067 const TargetRegisterInfo *TRI) const {
6068 if (OpNum >= TID.getNumOperands())
6069 return nullptr;
6070 auto RegClass = TID.operands()[OpNum].RegClass;
6071 // Special pseudos have no alignment requirement.
6072 if (TID.getOpcode() == AMDGPU::AV_MOV_B64_IMM_PSEUDO || isSpill(TID))
6073 return RI.getRegClass(RegClass);
6074
6075 return adjustAllocatableRegClass(ST, RI, TID, RegClass);
6076}
6077
6078 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6079 unsigned OpNo) const {
6080 const MCInstrDesc &Desc = get(MI.getOpcode());
6081 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6082 Desc.operands()[OpNo].RegClass == -1) {
6083 Register Reg = MI.getOperand(OpNo).getReg();
6084
6085 if (Reg.isVirtual()) {
6086 const MachineRegisterInfo &MRI =
6087 MI.getParent()->getParent()->getRegInfo();
6088 return MRI.getRegClass(Reg);
6089 }
6090 return RI.getPhysRegBaseClass(Reg);
6091 }
6092
6093 unsigned RCID = Desc.operands()[OpNo].RegClass;
6094 return adjustAllocatableRegClass(ST, RI, Desc, RCID);
6095}
6096
6097 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
6098 MachineBasicBlock::iterator I = MI;
6099 MachineBasicBlock *MBB = MI.getParent();
6100 MachineOperand &MO = MI.getOperand(OpIdx);
6101 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6102 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
6103 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6104 unsigned Size = RI.getRegSizeInBits(*RC);
6105 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6106 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6107 : AMDGPU::V_MOV_B32_e32;
6108 if (MO.isReg())
6109 Opcode = AMDGPU::COPY;
6110 else if (RI.isSGPRClass(RC))
6111 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6112
6113 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6114 Register Reg = MRI.createVirtualRegister(VRC);
6115 DebugLoc DL = MBB->findDebugLoc(I);
6116 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6117 MO.ChangeToRegister(Reg, false);
6118}
6119
6120 Register SIInstrInfo::buildExtractSubReg(
6121 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
6122 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6123 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6124 if (!SuperReg.getReg().isVirtual())
6125 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6126
6127 MachineBasicBlock *MBB = MI->getParent();
6128 const DebugLoc &DL = MI->getDebugLoc();
6129 Register SubReg = MRI.createVirtualRegister(SubRC);
6130
6131 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6132 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6133 .addReg(SuperReg.getReg(), 0, NewSubIdx);
6134 return SubReg;
6135}
6136
6137 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
6138 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
6139 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6140 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6141 if (Op.isImm()) {
6142 if (SubIdx == AMDGPU::sub0)
6143 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6144 if (SubIdx == AMDGPU::sub1)
6145 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6146
6147 llvm_unreachable("Unhandled register index for immediate");
6148 }
6149
6150 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6151 SubIdx, SubRC);
6152 return MachineOperand::CreateReg(SubReg, false);
6153}
6154
6155// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6156void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6157 assert(Inst.getNumExplicitOperands() == 3);
6158 MachineOperand Op1 = Inst.getOperand(1);
6159 Inst.removeOperand(1);
6160 Inst.addOperand(Op1);
6161}
6162
6163 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6164 const MCOperandInfo &OpInfo,
6165 const MachineOperand &MO) const {
6166 if (!MO.isReg())
6167 return false;
6168
6169 Register Reg = MO.getReg();
6170
6171 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
6172 if (Reg.isPhysical())
6173 return DRC->contains(Reg);
6174
6175 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6176
6177 if (MO.getSubReg()) {
6178 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
6179 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6180 if (!SuperRC)
6181 return false;
6182 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6183 }
6184
6185 return RI.getCommonSubClass(DRC, RC) != nullptr;
6186}
6187
6188 bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6189 const MachineOperand &MO) const {
6190 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6191 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6192 unsigned Opc = MI.getOpcode();
6193
6194 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6195 // information.
6196 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6197 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6198 constexpr const AMDGPU::OpName OpNames[] = {
6199 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6200
6201 for (auto [I, OpName] : enumerate(OpNames)) {
6202 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6203 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6204 !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
6205 return false;
6206 }
6207 }
6208
6209 if (!isLegalRegOperand(MRI, OpInfo, MO))
6210 return false;
6211
6212 // check Accumulate GPR operand
6213 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6214 if (IsAGPR && !ST.hasMAIInsts())
6215 return false;
6216 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6217 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6218 return false;
6219 // Atomics should have both vdst and vdata either vgpr or agpr.
6220 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6221 const int DataIdx = AMDGPU::getNamedOperandIdx(
6222 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6223 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6224 MI.getOperand(DataIdx).isReg() &&
6225 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6226 return false;
6227 if ((int)OpIdx == DataIdx) {
6228 if (VDstIdx != -1 &&
6229 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6230 return false;
6231 // DS instructions with 2 src operands also must have tied RC.
6232 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6233 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6234 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6235 return false;
6236 }
6237
6238 // Check V_ACCVGPR_WRITE_B32_e64
6239 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6240 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6241 RI.isSGPRReg(MRI, MO.getReg()))
6242 return false;
6243 return true;
6244}
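// Example for the AGPR rules above: on gfx908 (MAI without hasGFX90AInsts())
// an AGPR cannot be used directly as the data operand of a load/store, DS or
// image instruction, and atomics must keep vdst and vdata in the same bank
// (both VGPR or both AGPR).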
6245
6246 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6247 const MCOperandInfo &OpInfo,
6248 const MachineOperand &MO) const {
6249 if (MO.isReg())
6250 return isLegalRegOperand(MRI, OpInfo, MO);
6251
6252 // Handle non-register types that are treated like immediates.
6253 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6254 return true;
6255}
6256
6257 bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6258 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6259 const MachineOperand *MO) const {
6260 constexpr const unsigned NumOps = 3;
6261 constexpr const AMDGPU::OpName OpNames[NumOps * 2] = {
6262 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6263 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6264 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6265
6266 assert(SrcN < NumOps);
6267
6268 if (!MO) {
6269 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6270 if (SrcIdx == -1)
6271 return true;
6272 MO = &MI.getOperand(SrcIdx);
6273 }
6274
6275 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6276 return true;
6277
6278 int ModsIdx =
6279 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6280 if (ModsIdx == -1)
6281 return true;
6282
6283 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6284 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6285 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6286
6287 return !OpSel && !OpSelHi;
6288}
6289
6290 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6291 const MachineOperand *MO) const {
6292 const MachineFunction &MF = *MI.getParent()->getParent();
6293 const MachineRegisterInfo &MRI = MF.getRegInfo();
6294 const MCInstrDesc &InstDesc = MI.getDesc();
6295 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6296 const TargetRegisterClass *DefinedRC =
6297 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
6298 if (!MO)
6299 MO = &MI.getOperand(OpIdx);
6300
6301 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6302
6303 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6304 const MachineOperand *UsedLiteral = nullptr;
6305
6306 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6307 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6308
6309 // TODO: Be more permissive with frame indexes.
6310 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6311 if (!LiteralLimit--)
6312 return false;
6313
6314 UsedLiteral = MO;
6315 }
6316
6317 SmallDenseSet<RegSubRegPair> SGPRsUsed;
6318 if (MO->isReg())
6319 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6320
6321 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6322 if (i == OpIdx)
6323 continue;
6324 const MachineOperand &Op = MI.getOperand(i);
6325 if (Op.isReg()) {
6326 if (Op.isUse()) {
6327 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6328 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6329 if (--ConstantBusLimit <= 0)
6330 return false;
6331 }
6332 }
6333 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6334 !isInlineConstant(Op, InstDesc.operands()[i])) {
6335 // The same literal may be used multiple times.
6336 if (!UsedLiteral)
6337 UsedLiteral = &Op;
6338 else if (UsedLiteral->isIdenticalTo(Op))
6339 continue;
6340
6341 if (!LiteralLimit--)
6342 return false;
6343 if (--ConstantBusLimit <= 0)
6344 return false;
6345 }
6346 }
6347 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6348 // There can be at most one literal operand, but it can be repeated.
6349 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6350 if (i == OpIdx)
6351 continue;
6352 const MachineOperand &Op = MI.getOperand(i);
6353 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6354 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6355 !Op.isIdenticalTo(*MO))
6356 return false;
6357
6358 // Do not fold a non-inlineable and non-register operand into an
6359 // instruction that already has a frame index. The frame index handling
6360 // code does not cope well with a frame index co-existing with another
6361 // non-register operand, unless that operand is an inlineable immediate.
6362 if (Op.isFI())
6363 return false;
6364 }
6365 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6366 isF16PseudoScalarTrans(MI.getOpcode())) {
6367 return false;
6368 }
6369
6370 if (MO->isReg()) {
6371 if (!DefinedRC)
6372 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6373 return isLegalRegOperand(MI, OpIdx, *MO);
6374 }
6375
6376 if (MO->isImm()) {
6377 uint64_t Imm = MO->getImm();
6378 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6379 bool Is64BitOp = Is64BitFPOp ||
6380 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6381 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6382 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6383 if (Is64BitOp &&
6384 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6385 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6386 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6387 return false;
6388
6389 // FIXME: We can use sign extended 64-bit literals, but only for signed
6390 // operands. At the moment we do not know if an operand is signed.
6391 // Such operand will be encoded as its low 32 bits and then either
6392 // correctly sign extended or incorrectly zero extended by HW.
6393 // If 64-bit literals are supported and the literal will be encoded
6394 // as full 64 bit we still can use it.
6395 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6396 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6397 return false;
6398 }
6399 }
6400
6401 // Handle non-register types that are treated like immediates.
6402 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6403
6404 if (!DefinedRC) {
6405 // This operand expects an immediate.
6406 return true;
6407 }
6408
6409 return isImmOperandLegal(MI, OpIdx, *MO);
6410}
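// Worked example for isOperandLegal() above: folding a 32-bit literal into a
// source of V_FMA_F32 is rejected on targets without hasVOP3Literal(), and
// across src0/src1/src2 at most getConstantBusLimit() distinct SGPRs (plus at
// most one literal) may be used.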
6411
6412 bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const {
6413 bool IsGFX950Only = ST.hasGFX950Insts();
6414 bool IsGFX940Only = ST.hasGFX940Insts();
6415
6416 if (!IsGFX950Only && !IsGFX940Only)
6417 return false;
6418
6419 if (!isVALU(MI))
6420 return false;
6421
6422 // V_COS, V_EXP, V_RCP, etc.
6423 if (isTRANS(MI))
6424 return true;
6425
6426 // DOT2, DOT2C, DOT4, etc.
6427 if (isDOT(MI))
6428 return true;
6429
6430 // MFMA, SMFMA
6431 if (isMFMA(MI))
6432 return true;
6433
6434 unsigned Opcode = MI.getOpcode();
6435 switch (Opcode) {
6436 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6437 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6438 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6439 case AMDGPU::V_MQSAD_U32_U8_e64:
6440 case AMDGPU::V_PK_ADD_F16:
6441 case AMDGPU::V_PK_ADD_F32:
6442 case AMDGPU::V_PK_ADD_I16:
6443 case AMDGPU::V_PK_ADD_U16:
6444 case AMDGPU::V_PK_ASHRREV_I16:
6445 case AMDGPU::V_PK_FMA_F16:
6446 case AMDGPU::V_PK_FMA_F32:
6447 case AMDGPU::V_PK_FMAC_F16_e32:
6448 case AMDGPU::V_PK_FMAC_F16_e64:
6449 case AMDGPU::V_PK_LSHLREV_B16:
6450 case AMDGPU::V_PK_LSHRREV_B16:
6451 case AMDGPU::V_PK_MAD_I16:
6452 case AMDGPU::V_PK_MAD_U16:
6453 case AMDGPU::V_PK_MAX_F16:
6454 case AMDGPU::V_PK_MAX_I16:
6455 case AMDGPU::V_PK_MAX_U16:
6456 case AMDGPU::V_PK_MIN_F16:
6457 case AMDGPU::V_PK_MIN_I16:
6458 case AMDGPU::V_PK_MIN_U16:
6459 case AMDGPU::V_PK_MOV_B32:
6460 case AMDGPU::V_PK_MUL_F16:
6461 case AMDGPU::V_PK_MUL_F32:
6462 case AMDGPU::V_PK_MUL_LO_U16:
6463 case AMDGPU::V_PK_SUB_I16:
6464 case AMDGPU::V_PK_SUB_U16:
6465 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6466 return true;
6467 default:
6468 return false;
6469 }
6470}
6471
6472 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6473 MachineInstr &MI) const {
6474 unsigned Opc = MI.getOpcode();
6475 const MCInstrDesc &InstrDesc = get(Opc);
6476
6477 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6478 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6479
6480 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6481 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6482
6483 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6484 // we need to only have one constant bus use before GFX10.
6485 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6486 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6487 RI.isSGPRReg(MRI, Src0.getReg()))
6488 legalizeOpWithMove(MI, Src0Idx);
6489
6490 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6491 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6492 // src0/src1 with V_READFIRSTLANE.
6493 if (Opc == AMDGPU::V_WRITELANE_B32) {
6494 const DebugLoc &DL = MI.getDebugLoc();
6495 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6496 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6497 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6498 .add(Src0);
6499 Src0.ChangeToRegister(Reg, false);
6500 }
6501 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6502 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6503 const DebugLoc &DL = MI.getDebugLoc();
6504 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6505 .add(Src1);
6506 Src1.ChangeToRegister(Reg, false);
6507 }
6508 return;
6509 }
6510
6511 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6512 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6513 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6514 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6515 legalizeOpWithMove(MI, Src2Idx);
6516 }
6517
6518 // VOP2 src0 instructions support all operand types, so we don't need to check
6519 // their legality. If src1 is already legal, we don't need to do anything.
6520 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6521 return;
6522
6523 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6524 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6525 // select is uniform.
6526 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6527 RI.isVGPR(MRI, Src1.getReg())) {
6528 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6529 const DebugLoc &DL = MI.getDebugLoc();
6530 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6531 .add(Src1);
6532 Src1.ChangeToRegister(Reg, false);
6533 return;
6534 }
6535
6536 // We do not use commuteInstruction here because it is too aggressive and will
6537 // commute if it is possible. We only want to commute here if it improves
6538 // legality. This can be called a fairly large number of times so don't waste
6539 // compile time pointlessly swapping and checking legality again.
6540 if (HasImplicitSGPR || !MI.isCommutable()) {
6541 legalizeOpWithMove(MI, Src1Idx);
6542 return;
6543 }
6544
6545 // If src0 can be used as src1, commuting will make the operands legal.
6546 // Otherwise we have to give up and insert a move.
6547 //
6548 // TODO: Other immediate-like operand kinds could be commuted if there was a
6549 // MachineOperand::ChangeTo* for them.
6550 if ((!Src1.isImm() && !Src1.isReg()) ||
6551 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6552 legalizeOpWithMove(MI, Src1Idx);
6553 return;
6554 }
6555
6556 int CommutedOpc = commuteOpcode(MI);
6557 if (CommutedOpc == -1) {
6558 legalizeOpWithMove(MI, Src1Idx);
6559 return;
6560 }
6561
6562 MI.setDesc(get(CommutedOpc));
6563
6564 Register Src0Reg = Src0.getReg();
6565 unsigned Src0SubReg = Src0.getSubReg();
6566 bool Src0Kill = Src0.isKill();
6567
6568 if (Src1.isImm())
6569 Src0.ChangeToImmediate(Src1.getImm());
6570 else if (Src1.isReg()) {
6571 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6572 Src0.setSubReg(Src1.getSubReg());
6573 } else
6574 llvm_unreachable("Should only have register or immediate operands");
6575
6576 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6577 Src1.setSubReg(Src0SubReg);
6578 fixImplicitOperands(MI);
6579}
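// Example for legalizeOperandsVOP2() above: if src1 holds an SGPR and src0 a
// VGPR, commuting the VOP2 makes both positions legal; if both sources are
// SGPRs (or commuting is impossible), src1 is rewritten via
// legalizeOpWithMove(), i.e. copied into a fresh VGPR.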
6580
6581// Legalize VOP3 operands. Any operand type is allowed in any position, but
6582// at most one literal constant may be used, and only starting from GFX10.
6583 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6584 MachineInstr &MI) const {
6585 unsigned Opc = MI.getOpcode();
6586
6587 int VOP3Idx[3] = {
6588 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6589 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6590 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6591 };
6592
6593 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6594 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6595 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6596 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6597 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6598 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6599 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6600 // src1 and src2 must be scalar
6601 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6602 const DebugLoc &DL = MI.getDebugLoc();
6603 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6604 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6605 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6606 .add(Src1);
6607 Src1.ChangeToRegister(Reg, false);
6608 }
6609 if (VOP3Idx[2] != -1) {
6610 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6611 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6612 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6613 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6614 .add(Src2);
6615 Src2.ChangeToRegister(Reg, false);
6616 }
6617 }
6618 }
6619
6620 // Find the one SGPR operand we are allowed to use.
6621 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6622 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6623 SmallDenseSet<unsigned> SGPRsUsed;
6624 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6625 if (SGPRReg) {
6626 SGPRsUsed.insert(SGPRReg);
6627 --ConstantBusLimit;
6628 }
6629
6630 for (int Idx : VOP3Idx) {
6631 if (Idx == -1)
6632 break;
6633 MachineOperand &MO = MI.getOperand(Idx);
6634
6635 if (!MO.isReg()) {
6636 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6637 continue;
6638
6639 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6640 --LiteralLimit;
6641 --ConstantBusLimit;
6642 continue;
6643 }
6644
6645 --LiteralLimit;
6646 --ConstantBusLimit;
6647 legalizeOpWithMove(MI, Idx);
6648 continue;
6649 }
6650
6651 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6652 continue; // VGPRs are legal
6653
6654 // We can use one SGPR in each VOP3 instruction prior to GFX10
6655 // and two starting from GFX10.
6656 if (SGPRsUsed.count(MO.getReg()))
6657 continue;
6658 if (ConstantBusLimit > 0) {
6659 SGPRsUsed.insert(MO.getReg());
6660 --ConstantBusLimit;
6661 continue;
6662 }
6663
6664 // If we make it this far, then the operand is not legal and we must
6665 // legalize it.
6666 legalizeOpWithMove(MI, Idx);
6667 }
6668
6669 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6670 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6671 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6672 legalizeOpWithMove(MI, VOP3Idx[2]);
6673
6674 // Fix the register class of packed FP32 instructions on gfx12+. See
6675 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6676 if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
6677 for (unsigned I = 0; I < 3; ++I) {
6678 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
6679 legalizeOpWithMove(MI, VOP3Idx[I]);
6680 }
6681 }
6682}
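// Example for legalizeOperandsVOP3() above: "V_FMA_F32 %d, %s0, %s1, %s2" with
// three distinct SGPR sources keeps as many SGPRs as the constant bus allows
// (one before GFX10, two after) and moves the remaining sources into VGPRs
// with legalizeOpWithMove().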
6683
6684 Register SIInstrInfo::readlaneVGPRToSGPR(
6685 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6686 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6687 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6688 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6689 if (DstRC)
6690 SRC = RI.getCommonSubClass(SRC, DstRC);
6691
6692 Register DstReg = MRI.createVirtualRegister(SRC);
6693 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6694
6695 if (RI.hasAGPRs(VRC)) {
6696 VRC = RI.getEquivalentVGPRClass(VRC);
6697 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6698 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6699 get(TargetOpcode::COPY), NewSrcReg)
6700 .addReg(SrcReg);
6701 SrcReg = NewSrcReg;
6702 }
6703
6704 if (SubRegs == 1) {
6705 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6706 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6707 .addReg(SrcReg);
6708 return DstReg;
6709 }
6710
6711 SmallVector<Register, 8> SRegs;
6712 for (unsigned i = 0; i < SubRegs; ++i) {
6713 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6714 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6715 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6716 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6717 SRegs.push_back(SGPR);
6718 }
6719
6720 MachineInstrBuilder MIB =
6721 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6722 get(AMDGPU::REG_SEQUENCE), DstReg);
6723 for (unsigned i = 0; i < SubRegs; ++i) {
6724 MIB.addReg(SRegs[i]);
6725 MIB.addImm(RI.getSubRegFromChannel(i));
6726 }
6727 return DstReg;
6728}
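// Example for readlaneVGPRToSGPR() above: a 64-bit VGPR source produces two
// V_READFIRSTLANE_B32 instructions (one per 32-bit sub-register) whose results
// are combined with a REG_SEQUENCE into the SGPR-class destination.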
6729
6730 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6731 MachineInstr &MI) const {
6732
6733 // If the pointer is stored in VGPRs, then we need to move it to
6734 // SGPRs using v_readfirstlane. This is safe because we only select
6735 // loads with uniform pointers to SMRD instructions, so we know the
6736 // pointer value is uniform.
6737 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6738 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6739 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6740 SBase->setReg(SGPR);
6741 }
6742 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6743 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6744 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6745 SOff->setReg(SGPR);
6746 }
6747}
6748
6749 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6750 unsigned Opc = Inst.getOpcode();
6751 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6752 if (OldSAddrIdx < 0)
6753 return false;
6754
6755 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6756
6757 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6758 if (NewOpc < 0)
6759 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6760 if (NewOpc < 0)
6761 return false;
6762
6763 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6764 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6765 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6766 return false;
6767
6768 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6769 if (NewVAddrIdx < 0)
6770 return false;
6771
6772 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6773
6774 // Check vaddr, it shall be zero or absent.
6775 MachineInstr *VAddrDef = nullptr;
6776 if (OldVAddrIdx >= 0) {
6777 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6778 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6779 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6780 !VAddrDef->getOperand(1).isImm() ||
6781 VAddrDef->getOperand(1).getImm() != 0)
6782 return false;
6783 }
6784
6785 const MCInstrDesc &NewDesc = get(NewOpc);
6786 Inst.setDesc(NewDesc);
6787
6788 // Callers expect iterator to be valid after this call, so modify the
6789 // instruction in place.
6790 if (OldVAddrIdx == NewVAddrIdx) {
6791 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6792 // Clear use list from the old vaddr holding a zero register.
6793 MRI.removeRegOperandFromUseList(&NewVAddr);
6794 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6795 Inst.removeOperand(OldSAddrIdx);
6796 // Update the use list with the pointer we have just moved from vaddr to
6797 // saddr position. Otherwise new vaddr will be missing from the use list.
6798 MRI.removeRegOperandFromUseList(&NewVAddr);
6799 MRI.addRegOperandToUseList(&NewVAddr);
6800 } else {
6801 assert(OldSAddrIdx == NewVAddrIdx);
6802
6803 if (OldVAddrIdx >= 0) {
6804 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6805 AMDGPU::OpName::vdst_in);
6806
6807 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6808 // it asserts. Untie the operands for now and retie them afterwards.
6809 if (NewVDstIn != -1) {
6810 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6811 Inst.untieRegOperand(OldVDstIn);
6812 }
6813
6814 Inst.removeOperand(OldVAddrIdx);
6815
6816 if (NewVDstIn != -1) {
6817 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6818 Inst.tieOperands(NewVDst, NewVDstIn);
6819 }
6820 }
6821 }
6822
6823 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6824 VAddrDef->eraseFromParent();
6825
6826 return true;
6827}
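// Example for moveFlatAddrToVGPR() above: a GLOBAL_LOAD_DWORD_SADDR whose
// saddr operand ended up in a VGPR (with a zero vaddr) is rewritten to the
// plain vaddr form returned by getGlobalVaddrOp(), reusing the former saddr
// register as vaddr and erasing the now-dead zero-vaddr definition.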
6828
6829 // FIXME: Remove this when SelectionDAG is obsoleted.
6830 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6831 MachineInstr &MI) const {
6832 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
6833 return;
6834
6835 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
6836 // thinks they are uniform, so a readfirstlane should be valid.
6837 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6838 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6839 return;
6840
6841 if (moveFlatAddrToVGPR(MI))
6842 return;
6843
6844 const TargetRegisterClass *DeclaredRC =
6845 getRegClass(MI.getDesc(), SAddr->getOperandNo(), &RI);
6846
6847 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6848 SAddr->setReg(ToSGPR);
6849}
6850
6851void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6852 MachineBasicBlock::iterator I,
6853 const TargetRegisterClass *DstRC,
6854 MachineOperand &Op,
6855 MachineRegisterInfo &MRI,
6856 const DebugLoc &DL) const {
6857 Register OpReg = Op.getReg();
6858 unsigned OpSubReg = Op.getSubReg();
6859
6860 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6861 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6862
6863 // Check if operand is already the correct register class.
6864 if (DstRC == OpRC)
6865 return;
6866
6867 Register DstReg = MRI.createVirtualRegister(DstRC);
6868 auto Copy =
6869 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6870 Op.setReg(DstReg);
6871
6872 MachineInstr *Def = MRI.getVRegDef(OpReg);
6873 if (!Def)
6874 return;
6875
6876 // Try to eliminate the copy if it is copying an immediate value.
6877 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6878 foldImmediate(*Copy, *Def, OpReg, &MRI);
6879
6880 bool ImpDef = Def->isImplicitDef();
6881 while (!ImpDef && Def && Def->isCopy()) {
6882 if (Def->getOperand(1).getReg().isPhysical())
6883 break;
6884 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6885 ImpDef = Def && Def->isImplicitDef();
6886 }
6887 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6888 !ImpDef)
6889 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6890}
6891
6892// Emit the actual waterfall loop, executing the wrapped instruction for each
6893// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6894// iteration, in the worst case we execute 64 (once per lane).
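// Roughly, for a single 32-bit scalar operand the emitted loop looks like
// (pseudo-MIR, register names are illustrative):
//
//   loop:                                        ; LoopBB
//     s_cur    = v_readfirstlane_b32 v_op        ; pick one lane's value
//     cond     = v_cmp_eq_u32 s_cur, v_op        ; lanes holding that value
//     saveexec = s_and_saveexec cond             ; run only the matching lanes
//     <wrapped instruction, now reading s_cur>   ; BodyBB
//     exec     = s_xor_term exec, saveexec       ; clear the lanes just handled
//     SI_WATERFALL_LOOP loop                     ; repeat while lanes remain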
6895static void
6896emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
6897 MachineBasicBlock &LoopBB,
6899 MachineBasicBlock &BodyBB,
6900 const DebugLoc &DL,
6901 ArrayRef<MachineOperand *> ScalarOps) {
6902 MachineFunction &MF = *LoopBB.getParent();
6903 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6904 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6906 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6907
6908 MachineBasicBlock::iterator I = LoopBB.begin();
6909 Register CondReg;
6910
6911 for (MachineOperand *ScalarOp : ScalarOps) {
6912 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6913 unsigned NumSubRegs = RegSize / 32;
6914 Register VScalarOp = ScalarOp->getReg();
6915
6916 if (NumSubRegs == 1) {
6917 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6918
6919 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6920 .addReg(VScalarOp);
6921
6922 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6923
6924 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6925 .addReg(CurReg)
6926 .addReg(VScalarOp);
6927
6928 // Combine the comparison results with AND.
6929 if (!CondReg) // First.
6930 CondReg = NewCondReg;
6931 else { // If not the first, we create an AND.
6932 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6933 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
6934 .addReg(CondReg)
6935 .addReg(NewCondReg);
6936 CondReg = AndReg;
6937 }
6938
6939 // Update ScalarOp operand to use the SGPR ScalarOp.
6940 ScalarOp->setReg(CurReg);
6941 ScalarOp->setIsKill();
6942 } else {
6943 SmallVector<Register, 8> ReadlanePieces;
6944 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6945 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6946 "Unhandled register size");
6947
6948 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6949 Register CurRegLo =
6950 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6951 Register CurRegHi =
6952 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6953
6954 // Read the next variant <- also loop target.
6955 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6956 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6957
6958 // Read the next variant <- also loop target.
6959 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6960 .addReg(VScalarOp, VScalarOpUndef,
6961 TRI->getSubRegFromChannel(Idx + 1));
6962
6963 ReadlanePieces.push_back(CurRegLo);
6964 ReadlanePieces.push_back(CurRegHi);
6965
6966 // Comparison is to be done as 64-bit.
6967 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6968 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6969 .addReg(CurRegLo)
6970 .addImm(AMDGPU::sub0)
6971 .addReg(CurRegHi)
6972 .addImm(AMDGPU::sub1);
6973
6974 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6975 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6976 NewCondReg)
6977 .addReg(CurReg);
6978 if (NumSubRegs <= 2)
6979 Cmp.addReg(VScalarOp);
6980 else
6981 Cmp.addReg(VScalarOp, VScalarOpUndef,
6982 TRI->getSubRegFromChannel(Idx, 2));
6983
6984 // Combine the comparison results with AND.
6985 if (!CondReg) // First.
6986 CondReg = NewCondReg;
6987 else { // If not the first, we create an AND.
6988 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6989 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
6990 .addReg(CondReg)
6991 .addReg(NewCondReg);
6992 CondReg = AndReg;
6993 }
6994 } // End for loop.
6995
6996 const auto *SScalarOpRC =
6997 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6998 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6999
7000 // Build scalar ScalarOp.
7001 auto Merge =
7002 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
7003 unsigned Channel = 0;
7004 for (Register Piece : ReadlanePieces) {
7005 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
7006 }
7007
7008 // Update ScalarOp operand to use the SGPR ScalarOp.
7009 ScalarOp->setReg(SScalarOp);
7010 ScalarOp->setIsKill();
7011 }
7012 }
7013
7014 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7015 MRI.setSimpleHint(SaveExec, CondReg);
7016
7017 // Update EXEC to matching lanes, saving original to SaveExec.
7018 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
7019 .addReg(CondReg, RegState::Kill);
7020
7021 // The original instruction is here; we insert the terminators after it.
7022 I = BodyBB.end();
7023
7024 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7025 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
7026 .addReg(LMC.ExecReg)
7027 .addReg(SaveExec);
7028
7029 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
7030}
7031
7032// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
7033// with SGPRs by iterating over all unique values across all lanes.
7034// Returns the loop basic block that now contains \p MI.
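// The original block is split into MBB -> LoopBB -> BodyBB -> RemainderBB.
// SCC (if still live) and the EXEC mask are saved before the loop and restored
// in RemainderBB, and the dominator tree is updated when MDT is provided.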
7035static MachineBasicBlock *
7036loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
7037 ArrayRef<MachineOperand *> ScalarOps,
7038 MachineDominatorTree *MDT,
7039 MachineBasicBlock::iterator Begin = nullptr,
7040 MachineBasicBlock::iterator End = nullptr) {
7041 MachineBasicBlock &MBB = *MI.getParent();
7042 MachineFunction &MF = *MBB.getParent();
7043 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7044 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7045 MachineRegisterInfo &MRI = MF.getRegInfo();
7046 if (!Begin.isValid())
7047 Begin = &MI;
7048 if (!End.isValid()) {
7049 End = &MI;
7050 ++End;
7051 }
7052 const DebugLoc &DL = MI.getDebugLoc();
7054 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7055
7056 // Save SCC. Waterfall Loop may overwrite SCC.
7057 Register SaveSCCReg;
7058
7059 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7060 // rather than doing an unlimited scan everywhere.
7061 bool SCCNotDead =
7062 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7063 std::numeric_limits<unsigned>::max()) !=
7065 if (SCCNotDead) {
7066 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7067 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7068 .addImm(1)
7069 .addImm(0);
7070 }
7071
7072 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7073
7074 // Save the EXEC mask
7075 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7076
7077 // Killed uses in the instruction we are waterfalling around will be
7078 // incorrect due to the added control-flow.
7079 MachineBasicBlock::iterator AfterMI = MI;
7080 ++AfterMI;
7081 for (auto I = Begin; I != AfterMI; I++) {
7082 for (auto &MO : I->all_uses())
7083 MRI.clearKillFlags(MO.getReg());
7084 }
7085
7086 // To insert the loop we need to split the block. Move everything after this
7087 // point to a new block, and insert a new empty block between the two.
7088 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
7089 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
7090 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7091 MachineFunction::iterator MBBI(MBB);
7092 ++MBBI;
7093
7094 MF.insert(MBBI, LoopBB);
7095 MF.insert(MBBI, BodyBB);
7096 MF.insert(MBBI, RemainderBB);
7097
7098 LoopBB->addSuccessor(BodyBB);
7099 BodyBB->addSuccessor(LoopBB);
7100 BodyBB->addSuccessor(RemainderBB);
7101
7102 // Move the instructions from Begin to MI into BodyBB, and the remainder of
7103 // the block to RemainderBB.
7104 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7105 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7106 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7107
7108 MBB.addSuccessor(LoopBB);
7109
7110 // Update dominators. We know that MBB immediately dominates LoopBB, that
7111 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7112 // RemainderBB. RemainderBB immediately dominates all of the successors
7113 // transferred to it from MBB that MBB used to properly dominate.
7114 if (MDT) {
7115 MDT->addNewBlock(LoopBB, &MBB);
7116 MDT->addNewBlock(BodyBB, LoopBB);
7117 MDT->addNewBlock(RemainderBB, BodyBB);
7118 for (auto &Succ : RemainderBB->successors()) {
7119 if (MDT->properlyDominates(&MBB, Succ)) {
7120 MDT->changeImmediateDominator(Succ, RemainderBB);
7121 }
7122 }
7123 }
7124
7125 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
7126
7127 MachineBasicBlock::iterator First = RemainderBB->begin();
7128 // Restore SCC
7129 if (SCCNotDead) {
7130 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7131 .addReg(SaveSCCReg, RegState::Kill)
7132 .addImm(0);
7133 }
7134
7135 // Restore the EXEC mask
7136 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7137 .addReg(SaveExec);
7138 return BodyBB;
7139}
7140
7141// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
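// The 64-bit base pointer is pulled out of sub0_sub1 of the 128-bit resource
// descriptor, and the replacement descriptor is built from a zero 64-bit base
// plus the default RSRC_DATA_FORMAT in the upper dwords, so the subsequent
// ADDR64 access applies the pointer through VAddr instead.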
7142static std::tuple<unsigned, unsigned>
7143extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
7144 MachineBasicBlock &MBB = *MI.getParent();
7145 MachineFunction &MF = *MBB.getParent();
7146 MachineRegisterInfo &MRI = MF.getRegInfo();
7147
7148 // Extract the ptr from the resource descriptor.
7149 unsigned RsrcPtr =
7150 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7151 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7152
7153 // Create an empty resource descriptor
7154 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7155 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7156 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7157 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7158 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7159
7160 // Zero64 = 0
7161 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7162 .addImm(0);
7163
7164 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7165 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7166 .addImm(Lo_32(RsrcDataFormat));
7167
7168 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7169 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7170 .addImm(Hi_32(RsrcDataFormat));
7171
7172 // NewSRsrc = {Zero64, SRsrcFormat}
7173 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7174 .addReg(Zero64)
7175 .addImm(AMDGPU::sub0_sub1)
7176 .addReg(SRsrcFormatLo)
7177 .addImm(AMDGPU::sub2)
7178 .addReg(SRsrcFormatHi)
7179 .addImm(AMDGPU::sub3);
7180
7181 return std::tuple(RsrcPtr, NewSRsrc);
7182}
7183
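// Top-level operand legalization. Dispatches to the VOP2/VOP3/SMRD/FLAT
// helpers, fixes up PHI/REG_SEQUENCE/INSERT_SUBREG register classes, and falls
// back to a waterfall loop for divergent resource, sampler, or soffset
// operands on buffer and image instructions. Returns the block created for a
// waterfall loop, if any.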
7184MachineBasicBlock *
7185SIInstrInfo::legalizeOperands(MachineInstr &MI,
7186 MachineDominatorTree *MDT) const {
7187 MachineFunction &MF = *MI.getParent()->getParent();
7188 MachineRegisterInfo &MRI = MF.getRegInfo();
7189 MachineBasicBlock *CreatedBB = nullptr;
7190
7191 // Legalize VOP2
7192 if (isVOP2(MI) || isVOPC(MI)) {
7193 legalizeOperandsVOP2(MRI, MI);
7194 return CreatedBB;
7195 }
7196
7197 // Legalize VOP3
7198 if (isVOP3(MI)) {
7199 legalizeOperandsVOP3(MRI, MI);
7200 return CreatedBB;
7201 }
7202
7203 // Legalize SMRD
7204 if (isSMRD(MI)) {
7205 legalizeOperandsSMRD(MRI, MI);
7206 return CreatedBB;
7207 }
7208
7209 // Legalize FLAT
7210 if (isFLAT(MI)) {
7211 legalizeOperandsFLAT(MRI, MI);
7212 return CreatedBB;
7213 }
7214
7215 // Legalize REG_SEQUENCE and PHI
7216 // The register class of the operands must be the same type as the register
7217 // class of the output.
7218 if (MI.getOpcode() == AMDGPU::PHI) {
7219 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
7220 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
7221 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
7222 continue;
7223 const TargetRegisterClass *OpRC =
7224 MRI.getRegClass(MI.getOperand(i).getReg());
7225 if (RI.hasVectorRegisters(OpRC)) {
7226 VRC = OpRC;
7227 } else {
7228 SRC = OpRC;
7229 }
7230 }
7231
7232 // If any of the operands are VGPR registers, then they all must be VGPRs;
7233 // otherwise we will create illegal VGPR->SGPR copies when legalizing
7234 // them.
7235 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
7236 if (!VRC) {
7237 assert(SRC);
7238 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
7239 VRC = &AMDGPU::VReg_1RegClass;
7240 } else
7241 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7242 ? RI.getEquivalentAGPRClass(SRC)
7243 : RI.getEquivalentVGPRClass(SRC);
7244 } else {
7245 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7246 ? RI.getEquivalentAGPRClass(VRC)
7247 : RI.getEquivalentVGPRClass(VRC);
7248 }
7249 RC = VRC;
7250 } else {
7251 RC = SRC;
7252 }
7253
7254 // Update all the operands so they have the same type.
7255 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7256 MachineOperand &Op = MI.getOperand(I);
7257 if (!Op.isReg() || !Op.getReg().isVirtual())
7258 continue;
7259
7260 // MI is a PHI instruction.
7261 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7262 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
7263
7264 // Avoid creating no-op copies with the same src and dst reg class. These
7265 // confuse some of the machine passes.
7266 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
7267 }
7268 }
7269
7270 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7271 // VGPR dest type and SGPR sources, insert copies so all operands are
7272 // VGPRs. This seems to help operand folding / the register coalescer.
7273 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7274 MachineBasicBlock *MBB = MI.getParent();
7275 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7276 if (RI.hasVGPRs(DstRC)) {
7277 // Update all the operands so they are VGPR register classes. These may
7278 // not be the same register class because REG_SEQUENCE supports mixing
7279 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7280 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7281 MachineOperand &Op = MI.getOperand(I);
7282 if (!Op.isReg() || !Op.getReg().isVirtual())
7283 continue;
7284
7285 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7286 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7287 if (VRC == OpRC)
7288 continue;
7289
7290 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7291 Op.setIsKill();
7292 }
7293 }
7294
7295 return CreatedBB;
7296 }
7297
7298 // Legalize INSERT_SUBREG
7299 // src0 must have the same register class as dst
7300 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7301 Register Dst = MI.getOperand(0).getReg();
7302 Register Src0 = MI.getOperand(1).getReg();
7303 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7304 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7305 if (DstRC != Src0RC) {
7306 MachineBasicBlock *MBB = MI.getParent();
7307 MachineOperand &Op = MI.getOperand(1);
7308 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7309 }
7310 return CreatedBB;
7311 }
7312
7313 // Legalize SI_INIT_M0
7314 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7315 MachineOperand &Src = MI.getOperand(0);
7316 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7317 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7318 return CreatedBB;
7319 }
7320
7321 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7322 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7323 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7324 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7325 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7326 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7327 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7328 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7329 MachineOperand &Src = MI.getOperand(1);
7330 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7331 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7332 return CreatedBB;
7333 }
7334
7335 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7336 //
7337 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7338 // scratch memory access. In both cases, the legalization never involves
7339 // conversion to the addr64 form.
7340 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
7341 (isMUBUF(MI) || isMTBUF(MI)))) {
7342 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7343 ? AMDGPU::OpName::rsrc
7344 : AMDGPU::OpName::srsrc;
7345 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7346 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7347 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7348
7349 AMDGPU::OpName SampOpName =
7350 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7351 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7352 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7353 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7354
7355 return CreatedBB;
7356 }
7357
7358 // Legalize SI_CALL
7359 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7360 MachineOperand *Dest = &MI.getOperand(0);
7361 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7362 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and the
7363 // following copies into the loop block; copies from and to physical
7364 // registers also need to be moved there.
7365 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7366 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7367
7368 // Also move the copies to physical registers into the loop block
7369 MachineBasicBlock &MBB = *MI.getParent();
7370 MachineBasicBlock::iterator Start(&MI);
7371 while (Start->getOpcode() != FrameSetupOpcode)
7372 --Start;
7373 MachineBasicBlock::iterator End(&MI);
7374 while (End->getOpcode() != FrameDestroyOpcode)
7375 ++End;
7376 // Also include following copies of the return value
7377 ++End;
7378 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7379 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7380 ++End;
7381 CreatedBB =
7382 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7383 }
7384 }
7385
7386 // Legalize s_sleep_var.
7387 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7388 const DebugLoc &DL = MI.getDebugLoc();
7389 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7390 int Src0Idx =
7391 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7392 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7393 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7394 .add(Src0);
7395 Src0.ChangeToRegister(Reg, false);
7396 return nullptr;
7397 }
7398
7399 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7400 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7401 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7402 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7403 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7404 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7405 for (MachineOperand &Src : MI.explicit_operands()) {
7406 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7407 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7408 }
7409 return CreatedBB;
7410 }
7411
7412 // Legalize MUBUF instructions.
7413 bool isSoffsetLegal = true;
7414 int SoffsetIdx =
7415 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7416 if (SoffsetIdx != -1) {
7417 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7418 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7419 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7420 isSoffsetLegal = false;
7421 }
7422 }
7423
7424 bool isRsrcLegal = true;
7425 int RsrcIdx =
7426 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7427 if (RsrcIdx != -1) {
7428 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7429 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7430 isRsrcLegal = false;
7431 }
7432
7433 // The operands are legal.
7434 if (isRsrcLegal && isSoffsetLegal)
7435 return CreatedBB;
7436
7437 if (!isRsrcLegal) {
7438 // Legalize a VGPR Rsrc
7439 //
7440 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7441 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7442 // a zero-value SRsrc.
7443 //
7444 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7445 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7446 // above.
7447 //
7448 // Otherwise we are on non-ADDR64 hardware, and/or we have
7449 // idxen/offen/bothen and we fall back to a waterfall loop.
7450
7451 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7452 MachineBasicBlock &MBB = *MI.getParent();
7453
7454 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7455 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7456 // This is already an ADDR64 instruction so we need to add the pointer
7457 // extracted from the resource descriptor to the current value of VAddr.
7458 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7459 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7460 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7461
7462 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7463 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7464 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7465
7466 unsigned RsrcPtr, NewSRsrc;
7467 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7468
7469 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7470 const DebugLoc &DL = MI.getDebugLoc();
7471 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7472 .addDef(CondReg0)
7473 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7474 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
7475 .addImm(0);
7476
7477 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7478 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7479 .addDef(CondReg1, RegState::Dead)
7480 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7481 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
7482 .addReg(CondReg0, RegState::Kill)
7483 .addImm(0);
7484
7485 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7486 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7487 .addReg(NewVAddrLo)
7488 .addImm(AMDGPU::sub0)
7489 .addReg(NewVAddrHi)
7490 .addImm(AMDGPU::sub1);
7491
7492 VAddr->setReg(NewVAddr);
7493 Rsrc->setReg(NewSRsrc);
7494 } else if (!VAddr && ST.hasAddr64()) {
7495 // This instruction is the _OFFSET variant, so we need to convert it to
7496 // ADDR64.
7497 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7498 "FIXME: Need to emit flat atomics here");
7499
7500 unsigned RsrcPtr, NewSRsrc;
7501 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7502
7503 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7504 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7505 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7506 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7507 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7508
7509 // Atomics with return have an additional tied operand and are
7510 // missing some of the special bits.
7511 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7512 MachineInstr *Addr64;
7513
7514 if (!VDataIn) {
7515 // Regular buffer load / store.
7516 MachineInstrBuilder MIB =
7517 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7518 .add(*VData)
7519 .addReg(NewVAddr)
7520 .addReg(NewSRsrc)
7521 .add(*SOffset)
7522 .add(*Offset);
7523
7524 if (const MachineOperand *CPol =
7525 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7526 MIB.addImm(CPol->getImm());
7527 }
7528
7529 if (const MachineOperand *TFE =
7530 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7531 MIB.addImm(TFE->getImm());
7532 }
7533
7534 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7535
7536 MIB.cloneMemRefs(MI);
7537 Addr64 = MIB;
7538 } else {
7539 // Atomics with return.
7540 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7541 .add(*VData)
7542 .add(*VDataIn)
7543 .addReg(NewVAddr)
7544 .addReg(NewSRsrc)
7545 .add(*SOffset)
7546 .add(*Offset)
7547 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7548 .cloneMemRefs(MI);
7549 }
7550
7551 MI.removeFromParent();
7552
7553 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7554 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7555 NewVAddr)
7556 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7557 .addImm(AMDGPU::sub0)
7558 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7559 .addImm(AMDGPU::sub1);
7560 } else {
7561 // Legalize a VGPR Rsrc and soffset together.
7562 if (!isSoffsetLegal) {
7563 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7564 CreatedBB =
7565 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7566 return CreatedBB;
7567 }
7568 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7569 return CreatedBB;
7570 }
7571 }
7572
7573 // Legalize a VGPR soffset.
7574 if (!isSoffsetLegal) {
7575 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7576 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7577 return CreatedBB;
7578 }
7579 return CreatedBB;
7580}
7581
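// Worklist insertion: instructions that carry a buffer resource (srsrc) are
// also recorded in the deferred list, so they are lowered only after the rest
// of the worklist has been processed.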
7582void SIInstrWorklist::insert(MachineInstr *MI) {
7583 InstrList.insert(MI);
7584 // Add MBUF instructions to the deferred list.
7585 int RsrcIdx =
7586 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7587 if (RsrcIdx != -1) {
7588 DeferredList.insert(MI);
7589 }
7590}
7591
7592bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7593 return DeferredList.contains(MI);
7594}
7595
7596// Legalize size mismatches between 16bit and 32bit registers in v2s copy
7597// lowering (change sgpr to vgpr).
7598// This is mainly caused by 16bit SALU and 16bit VALU using registers of
7599// different sizes. The operand sizes need to be legalized during the vgpr
7600// lowering chain. This can be removed after we have sgpr16 in place.
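// Two cases are handled: if the operand's 32-bit VGPR covers the expected
// 16-bit class, it simply takes a lo16 subregister index; if instead a 16-bit
// value feeds a 32-bit expectation, it is widened with a REG_SEQUENCE whose
// hi16 half is undef.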
7601void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
7602 MachineRegisterInfo &MRI) const {
7603 if (!ST.useRealTrue16Insts())
7604 return;
7605
7606 unsigned Opcode = MI.getOpcode();
7607 MachineBasicBlock *MBB = MI.getParent();
7608 // Legalize operands and check for size mismatch
7609 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7610 OpIdx >= get(Opcode).getNumOperands() ||
7611 get(Opcode).operands()[OpIdx].RegClass == -1)
7612 return;
7613
7614 MachineOperand &Op = MI.getOperand(OpIdx);
7615 if (!Op.isReg() || !Op.getReg().isVirtual())
7616 return;
7617
7618 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7619 if (!RI.isVGPRClass(CurrRC))
7620 return;
7621
7622 unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
7623 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7624 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7625 Op.setSubReg(AMDGPU::lo16);
7626 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7627 const DebugLoc &DL = MI.getDebugLoc();
7628 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7629 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7630 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7631 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7632 .addReg(Op.getReg())
7633 .addImm(AMDGPU::lo16)
7634 .addReg(Undef)
7635 .addImm(AMDGPU::hi16);
7636 Op.setReg(NewDstReg);
7637 }
7638}
7639void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7640 MachineRegisterInfo &MRI) const {
7641 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7642 legalizeOperandsVALUt16(MI, OpIdx, MRI);
7643}
7644
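// Drain the worklist, lowering each scalar instruction to the VALU. Deferred
// (MBUF) instructions are processed only after the main worklist is empty.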
7645void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7646 MachineDominatorTree *MDT) const {
7647
7648 while (!Worklist.empty()) {
7649 MachineInstr &Inst = *Worklist.top();
7650 Worklist.erase_top();
7651 // Skip MachineInstr in the deferred list.
7652 if (Worklist.isDeferred(&Inst))
7653 continue;
7654 moveToVALUImpl(Worklist, MDT, Inst);
7655 }
7656
7657 // Deferred list of instructions will be processed once
7658 // all the MachineInstr in the worklist are done.
7659 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7660 moveToVALUImpl(Worklist, MDT, *Inst);
7661 assert(Worklist.empty() &&
7662 "Deferred MachineInstr are not supposed to re-populate worklist");
7663 }
7664}
7665
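// Lower a single scalar instruction to its VALU equivalent. Special cases
// include 64-bit ops split into 32-bit halves, add/sub with carry, scalar
// compares whose SCC result is replaced by a VCC-style condition register,
// and plain SGPR copies rewritten as VGPR copies.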
7666void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7667 MachineDominatorTree *MDT,
7668 MachineInstr &Inst) const {
7669
7670 MachineBasicBlock *MBB = Inst.getParent();
7671 if (!MBB)
7672 return;
7673 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7674 unsigned Opcode = Inst.getOpcode();
7675 unsigned NewOpcode = getVALUOp(Inst);
7676 // Handle some special cases
7677 switch (Opcode) {
7678 default:
7679 break;
7680 case AMDGPU::S_ADD_I32:
7681 case AMDGPU::S_SUB_I32: {
7682 // FIXME: The u32 versions currently selected use the carry.
7683 bool Changed;
7684 MachineBasicBlock *CreatedBBTmp = nullptr;
7685 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7686 if (Changed)
7687 return;
7688
7689 // Default handling
7690 break;
7691 }
7692
7693 case AMDGPU::S_MUL_U64:
7694 if (ST.hasVectorMulU64()) {
7695 NewOpcode = AMDGPU::V_MUL_U64_e64;
7696 break;
7697 }
7698 // Split s_mul_u64 in 32-bit vector multiplications.
7699 splitScalarSMulU64(Worklist, Inst, MDT);
7700 Inst.eraseFromParent();
7701 return;
7702
7703 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7704 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7705 // This is a special case of s_mul_u64 where all the operands are either
7706 // zero extended or sign extended.
7707 splitScalarSMulPseudo(Worklist, Inst, MDT);
7708 Inst.eraseFromParent();
7709 return;
7710
7711 case AMDGPU::S_AND_B64:
7712 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7713 Inst.eraseFromParent();
7714 return;
7715
7716 case AMDGPU::S_OR_B64:
7717 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7718 Inst.eraseFromParent();
7719 return;
7720
7721 case AMDGPU::S_XOR_B64:
7722 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7723 Inst.eraseFromParent();
7724 return;
7725
7726 case AMDGPU::S_NAND_B64:
7727 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7728 Inst.eraseFromParent();
7729 return;
7730
7731 case AMDGPU::S_NOR_B64:
7732 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7733 Inst.eraseFromParent();
7734 return;
7735
7736 case AMDGPU::S_XNOR_B64:
7737 if (ST.hasDLInsts())
7738 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7739 else
7740 splitScalar64BitXnor(Worklist, Inst, MDT);
7741 Inst.eraseFromParent();
7742 return;
7743
7744 case AMDGPU::S_ANDN2_B64:
7745 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7746 Inst.eraseFromParent();
7747 return;
7748
7749 case AMDGPU::S_ORN2_B64:
7750 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7751 Inst.eraseFromParent();
7752 return;
7753
7754 case AMDGPU::S_BREV_B64:
7755 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7756 Inst.eraseFromParent();
7757 return;
7758
7759 case AMDGPU::S_NOT_B64:
7760 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7761 Inst.eraseFromParent();
7762 return;
7763
7764 case AMDGPU::S_BCNT1_I32_B64:
7765 splitScalar64BitBCNT(Worklist, Inst);
7766 Inst.eraseFromParent();
7767 return;
7768
7769 case AMDGPU::S_BFE_I64:
7770 splitScalar64BitBFE(Worklist, Inst);
7771 Inst.eraseFromParent();
7772 return;
7773
7774 case AMDGPU::S_FLBIT_I32_B64:
7775 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7776 Inst.eraseFromParent();
7777 return;
7778 case AMDGPU::S_FF1_I32_B64:
7779 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7780 Inst.eraseFromParent();
7781 return;
7782
7783 case AMDGPU::S_LSHL_B32:
7784 if (ST.hasOnlyRevVALUShifts()) {
7785 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7786 swapOperands(Inst);
7787 }
7788 break;
7789 case AMDGPU::S_ASHR_I32:
7790 if (ST.hasOnlyRevVALUShifts()) {
7791 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7792 swapOperands(Inst);
7793 }
7794 break;
7795 case AMDGPU::S_LSHR_B32:
7796 if (ST.hasOnlyRevVALUShifts()) {
7797 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7798 swapOperands(Inst);
7799 }
7800 break;
7801 case AMDGPU::S_LSHL_B64:
7802 if (ST.hasOnlyRevVALUShifts()) {
7803 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7804 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7805 : AMDGPU::V_LSHLREV_B64_e64;
7806 swapOperands(Inst);
7807 }
7808 break;
7809 case AMDGPU::S_ASHR_I64:
7810 if (ST.hasOnlyRevVALUShifts()) {
7811 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7812 swapOperands(Inst);
7813 }
7814 break;
7815 case AMDGPU::S_LSHR_B64:
7816 if (ST.hasOnlyRevVALUShifts()) {
7817 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7818 swapOperands(Inst);
7819 }
7820 break;
7821
7822 case AMDGPU::S_ABS_I32:
7823 lowerScalarAbs(Worklist, Inst);
7824 Inst.eraseFromParent();
7825 return;
7826
7827 case AMDGPU::S_CBRANCH_SCC0:
7828 case AMDGPU::S_CBRANCH_SCC1: {
7829 // Clear unused bits of vcc
7830 Register CondReg = Inst.getOperand(1).getReg();
7831 bool IsSCC = CondReg == AMDGPU::SCC;
7833 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
7834 .addReg(LMC.ExecReg)
7835 .addReg(IsSCC ? LMC.VccReg : CondReg);
7836 Inst.removeOperand(1);
7837 } break;
7838
7839 case AMDGPU::S_BFE_U64:
7840 case AMDGPU::S_BFM_B64:
7841 llvm_unreachable("Moving this op to VALU not implemented");
7842
7843 case AMDGPU::S_PACK_LL_B32_B16:
7844 case AMDGPU::S_PACK_LH_B32_B16:
7845 case AMDGPU::S_PACK_HL_B32_B16:
7846 case AMDGPU::S_PACK_HH_B32_B16:
7847 movePackToVALU(Worklist, MRI, Inst);
7848 Inst.eraseFromParent();
7849 return;
7850
7851 case AMDGPU::S_XNOR_B32:
7852 lowerScalarXnor(Worklist, Inst);
7853 Inst.eraseFromParent();
7854 return;
7855
7856 case AMDGPU::S_NAND_B32:
7857 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7858 Inst.eraseFromParent();
7859 return;
7860
7861 case AMDGPU::S_NOR_B32:
7862 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7863 Inst.eraseFromParent();
7864 return;
7865
7866 case AMDGPU::S_ANDN2_B32:
7867 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7868 Inst.eraseFromParent();
7869 return;
7870
7871 case AMDGPU::S_ORN2_B32:
7872 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7873 Inst.eraseFromParent();
7874 return;
7875
7876 // TODO: remove as soon as everything is ready
7877 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7878 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7879 // can only be selected from the uniform SDNode.
7880 case AMDGPU::S_ADD_CO_PSEUDO:
7881 case AMDGPU::S_SUB_CO_PSEUDO: {
7882 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7883 ? AMDGPU::V_ADDC_U32_e64
7884 : AMDGPU::V_SUBB_U32_e64;
7885 const auto *CarryRC = RI.getWaveMaskRegClass();
7886
7887 Register CarryInReg = Inst.getOperand(4).getReg();
7888 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7889 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7890 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7891 .addReg(CarryInReg);
7892 }
7893
7894 Register CarryOutReg = Inst.getOperand(1).getReg();
7895
7896 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7897 MRI.getRegClass(Inst.getOperand(0).getReg())));
7898 MachineInstr *CarryOp =
7899 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7900 .addReg(CarryOutReg, RegState::Define)
7901 .add(Inst.getOperand(2))
7902 .add(Inst.getOperand(3))
7903 .addReg(CarryInReg)
7904 .addImm(0);
7905 legalizeOperands(*CarryOp);
7906 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7907 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7908 Inst.eraseFromParent();
7909 }
7910 return;
7911 case AMDGPU::S_UADDO_PSEUDO:
7912 case AMDGPU::S_USUBO_PSEUDO: {
7913 const DebugLoc &DL = Inst.getDebugLoc();
7914 MachineOperand &Dest0 = Inst.getOperand(0);
7915 MachineOperand &Dest1 = Inst.getOperand(1);
7916 MachineOperand &Src0 = Inst.getOperand(2);
7917 MachineOperand &Src1 = Inst.getOperand(3);
7918
7919 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7920 ? AMDGPU::V_ADD_CO_U32_e64
7921 : AMDGPU::V_SUB_CO_U32_e64;
7922 const TargetRegisterClass *NewRC =
7923 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7924 Register DestReg = MRI.createVirtualRegister(NewRC);
7925 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7926 .addReg(Dest1.getReg(), RegState::Define)
7927 .add(Src0)
7928 .add(Src1)
7929 .addImm(0); // clamp bit
7930
7931 legalizeOperands(*NewInstr, MDT);
7932 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7933 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7934 Worklist);
7935 Inst.eraseFromParent();
7936 }
7937 return;
7938
7939 case AMDGPU::S_CSELECT_B32:
7940 case AMDGPU::S_CSELECT_B64:
7941 lowerSelect(Worklist, Inst, MDT);
7942 Inst.eraseFromParent();
7943 return;
7944 case AMDGPU::S_CMP_EQ_I32:
7945 case AMDGPU::S_CMP_LG_I32:
7946 case AMDGPU::S_CMP_GT_I32:
7947 case AMDGPU::S_CMP_GE_I32:
7948 case AMDGPU::S_CMP_LT_I32:
7949 case AMDGPU::S_CMP_LE_I32:
7950 case AMDGPU::S_CMP_EQ_U32:
7951 case AMDGPU::S_CMP_LG_U32:
7952 case AMDGPU::S_CMP_GT_U32:
7953 case AMDGPU::S_CMP_GE_U32:
7954 case AMDGPU::S_CMP_LT_U32:
7955 case AMDGPU::S_CMP_LE_U32:
7956 case AMDGPU::S_CMP_EQ_U64:
7957 case AMDGPU::S_CMP_LG_U64:
7958 case AMDGPU::S_CMP_LT_F32:
7959 case AMDGPU::S_CMP_EQ_F32:
7960 case AMDGPU::S_CMP_LE_F32:
7961 case AMDGPU::S_CMP_GT_F32:
7962 case AMDGPU::S_CMP_LG_F32:
7963 case AMDGPU::S_CMP_GE_F32:
7964 case AMDGPU::S_CMP_O_F32:
7965 case AMDGPU::S_CMP_U_F32:
7966 case AMDGPU::S_CMP_NGE_F32:
7967 case AMDGPU::S_CMP_NLG_F32:
7968 case AMDGPU::S_CMP_NGT_F32:
7969 case AMDGPU::S_CMP_NLE_F32:
7970 case AMDGPU::S_CMP_NEQ_F32:
7971 case AMDGPU::S_CMP_NLT_F32: {
7972 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7973 auto NewInstr =
7974 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7975 .setMIFlags(Inst.getFlags());
7976 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
7977 0) {
7978 NewInstr
7979 .addImm(0) // src0_modifiers
7980 .add(Inst.getOperand(0)) // src0
7981 .addImm(0) // src1_modifiers
7982 .add(Inst.getOperand(1)) // src1
7983 .addImm(0); // clamp
7984 } else {
7985 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
7986 }
7987 legalizeOperands(*NewInstr, MDT);
7988 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7989 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7990 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7991 Inst.eraseFromParent();
7992 return;
7993 }
7994 case AMDGPU::S_CMP_LT_F16:
7995 case AMDGPU::S_CMP_EQ_F16:
7996 case AMDGPU::S_CMP_LE_F16:
7997 case AMDGPU::S_CMP_GT_F16:
7998 case AMDGPU::S_CMP_LG_F16:
7999 case AMDGPU::S_CMP_GE_F16:
8000 case AMDGPU::S_CMP_O_F16:
8001 case AMDGPU::S_CMP_U_F16:
8002 case AMDGPU::S_CMP_NGE_F16:
8003 case AMDGPU::S_CMP_NLG_F16:
8004 case AMDGPU::S_CMP_NGT_F16:
8005 case AMDGPU::S_CMP_NLE_F16:
8006 case AMDGPU::S_CMP_NEQ_F16:
8007 case AMDGPU::S_CMP_NLT_F16: {
8008 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8009 auto NewInstr =
8010 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8011 .setMIFlags(Inst.getFlags());
8012 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
8013 NewInstr
8014 .addImm(0) // src0_modifiers
8015 .add(Inst.getOperand(0)) // src0
8016 .addImm(0) // src1_modifiers
8017 .add(Inst.getOperand(1)) // src1
8018 .addImm(0); // clamp
8019 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8020 NewInstr.addImm(0); // op_sel0
8021 } else {
8022 NewInstr
8023 .add(Inst.getOperand(0))
8024 .add(Inst.getOperand(1));
8025 }
8026 legalizeOperandsVALUt16(*NewInstr, MRI);
8027 legalizeOperands(*NewInstr, MDT);
8028 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8029 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
8030 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8031 Inst.eraseFromParent();
8032 return;
8033 }
8034 case AMDGPU::S_CVT_HI_F32_F16: {
8035 const DebugLoc &DL = Inst.getDebugLoc();
8036 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8037 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8038 if (ST.useRealTrue16Insts()) {
8039 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
8040 .add(Inst.getOperand(1));
8041 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8042 .addImm(0) // src0_modifiers
8043 .addReg(TmpReg, 0, AMDGPU::hi16)
8044 .addImm(0) // clamp
8045 .addImm(0) // omod
8046 .addImm(0); // op_sel0
8047 } else {
8048 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8049 .addImm(16)
8050 .add(Inst.getOperand(1));
8051 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8052 .addImm(0) // src0_modifiers
8053 .addReg(TmpReg)
8054 .addImm(0) // clamp
8055 .addImm(0); // omod
8056 }
8057
8058 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8059 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8060 Inst.eraseFromParent();
8061 return;
8062 }
8063 case AMDGPU::S_MINIMUM_F32:
8064 case AMDGPU::S_MAXIMUM_F32: {
8065 const DebugLoc &DL = Inst.getDebugLoc();
8066 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8067 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8068 .addImm(0) // src0_modifiers
8069 .add(Inst.getOperand(1))
8070 .addImm(0) // src1_modifiers
8071 .add(Inst.getOperand(2))
8072 .addImm(0) // clamp
8073 .addImm(0); // omod
8074 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8075
8076 legalizeOperands(*NewInstr, MDT);
8077 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8078 Inst.eraseFromParent();
8079 return;
8080 }
8081 case AMDGPU::S_MINIMUM_F16:
8082 case AMDGPU::S_MAXIMUM_F16: {
8083 const DebugLoc &DL = Inst.getDebugLoc();
8084 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8085 ? &AMDGPU::VGPR_16RegClass
8086 : &AMDGPU::VGPR_32RegClass);
8087 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8088 .addImm(0) // src0_modifiers
8089 .add(Inst.getOperand(1))
8090 .addImm(0) // src1_modifiers
8091 .add(Inst.getOperand(2))
8092 .addImm(0) // clamp
8093 .addImm(0) // omod
8094 .addImm(0); // opsel0
8095 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8096 legalizeOperandsVALUt16(*NewInstr, MRI);
8097 legalizeOperands(*NewInstr, MDT);
8098 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8099 Inst.eraseFromParent();
8100 return;
8101 }
8102 case AMDGPU::V_S_EXP_F16_e64:
8103 case AMDGPU::V_S_LOG_F16_e64:
8104 case AMDGPU::V_S_RCP_F16_e64:
8105 case AMDGPU::V_S_RSQ_F16_e64:
8106 case AMDGPU::V_S_SQRT_F16_e64: {
8107 const DebugLoc &DL = Inst.getDebugLoc();
8108 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8109 ? &AMDGPU::VGPR_16RegClass
8110 : &AMDGPU::VGPR_32RegClass);
8111 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8112 .add(Inst.getOperand(1)) // src0_modifiers
8113 .add(Inst.getOperand(2))
8114 .add(Inst.getOperand(3)) // clamp
8115 .add(Inst.getOperand(4)) // omod
8116 .setMIFlags(Inst.getFlags());
8117 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8118 NewInstr.addImm(0); // opsel0
8119 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8120 legalizeOperandsVALUt16(*NewInstr, MRI);
8121 legalizeOperands(*NewInstr, MDT);
8122 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8123 Inst.eraseFromParent();
8124 return;
8125 }
8126 }
8127
8128 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8129 // We cannot move this instruction to the VALU, so we should try to
8130 // legalize its operands instead.
8131 legalizeOperands(Inst, MDT);
8132 return;
8133 }
8134 // Handle converting generic instructions like COPY-to-SGPR into
8135 // COPY-to-VGPR.
8136 if (NewOpcode == Opcode) {
8137 Register DstReg = Inst.getOperand(0).getReg();
8138 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8139
8140 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8141 // hope for the best.
8142 if (Inst.isCopy() && DstReg.isPhysical() &&
8143 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8144 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8145 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8146 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
8147 .add(Inst.getOperand(1));
8148 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
8149 DstReg)
8150 .addReg(NewDst);
8151
8152 Inst.eraseFromParent();
8153 return;
8154 }
8155
8156 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
8157 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
8158 // Instead of creating a copy where src and dst are the same register
8159 // class, we just replace all uses of dst with src. These kinds of
8160 // copies interfere with the heuristics MachineSink uses to decide
8161 // whether or not to split a critical edge, since the pass assumes
8162 // that copies will end up as machine instructions and not be
8163 // eliminated.
8164 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8165 Register NewDstReg = Inst.getOperand(1).getReg();
8166 MRI.replaceRegWith(DstReg, NewDstReg);
8167 MRI.clearKillFlags(NewDstReg);
8168 Inst.getOperand(0).setReg(DstReg);
8169 Inst.eraseFromParent();
8170 // Legalize t16 operand since replaceReg is called after addUsersToVALU
8171 for (MachineOperand &MO :
8172 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8173 legalizeOperandsVALUt16(*MO.getParent(), MRI);
8174 }
8175 return;
8176 }
8177
8178 // If this is a v2s copy between a 16bit and a 32bit reg,
8179 // replace the vgpr copy with a reg_sequence/extract_subreg.
8180 // This can be removed after we have sgpr16 in place.
8181 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8182 Inst.getOperand(1).getReg().isVirtual() &&
8183 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8184 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8185 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8186 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8187 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8188 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8189 get(AMDGPU::IMPLICIT_DEF), Undef);
8190 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8191 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8192 .addReg(Inst.getOperand(1).getReg())
8193 .addImm(AMDGPU::lo16)
8194 .addReg(Undef)
8195 .addImm(AMDGPU::hi16);
8196 Inst.eraseFromParent();
8197 MRI.replaceRegWith(DstReg, NewDstReg);
8198 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8199 return;
8200 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8201 AMDGPU::lo16)) {
8202 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8203 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8204 MRI.replaceRegWith(DstReg, NewDstReg);
8205 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8206 return;
8207 }
8208 }
8209
8210 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8211 MRI.replaceRegWith(DstReg, NewDstReg);
8212 legalizeOperands(Inst, MDT);
8213 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8214 return;
8215 }
8216
8217 // Use the new VALU Opcode.
8218 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8219 .setMIFlags(Inst.getFlags());
8220 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8221 // Intersperse VOP3 modifiers among the SALU operands.
8222 NewInstr->addOperand(Inst.getOperand(0));
8223 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8224 AMDGPU::OpName::src0_modifiers) >= 0)
8225 NewInstr.addImm(0);
8226 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8227 MachineOperand Src = Inst.getOperand(1);
8228 NewInstr->addOperand(Src);
8229 }
8230
8231 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8232 // We are converting these to a BFE, so we need to add the missing
8233 // operands for the size and offset.
8234 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8235 NewInstr.addImm(0);
8236 NewInstr.addImm(Size);
8237 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8238 // The VALU version adds the second operand to the result, so insert an
8239 // extra 0 operand.
8240 NewInstr.addImm(0);
8241 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8242 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8243 // If we need to move this to VGPRs, we need to unpack the second
8244 // operand back into the 2 separate ones for bit offset and width.
8245 assert(OffsetWidthOp.isImm() &&
8246 "Scalar BFE is only implemented for constant width and offset");
8247 uint32_t Imm = OffsetWidthOp.getImm();
8248
8249 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8250 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
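 // For example, an encoded operand of 0x100008 unpacks to offset 8
 // (bits [5:0]) and width 16 (bits [22:16]).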
8251 NewInstr.addImm(Offset);
8252 NewInstr.addImm(BitWidth);
8253 } else {
8254 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8255 AMDGPU::OpName::src1_modifiers) >= 0)
8256 NewInstr.addImm(0);
8257 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8258 NewInstr->addOperand(Inst.getOperand(2));
8259 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8260 AMDGPU::OpName::src2_modifiers) >= 0)
8261 NewInstr.addImm(0);
8262 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8263 NewInstr->addOperand(Inst.getOperand(3));
8264 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8265 NewInstr.addImm(0);
8266 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8267 NewInstr.addImm(0);
8268 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8269 NewInstr.addImm(0);
8270 }
8271 } else {
8272 // Just copy the SALU operands.
8273 for (const MachineOperand &Op : Inst.explicit_operands())
8274 NewInstr->addOperand(Op);
8275 }
8276
8277 // Remove any references to SCC. Vector instructions can't read from it, and
8278 // we're just about to add the implicit use / defs of VCC, and we don't want
8279 // both.
8280 for (MachineOperand &Op : Inst.implicit_operands()) {
8281 if (Op.getReg() == AMDGPU::SCC) {
8282 // Only propagate through live-def of SCC.
8283 if (Op.isDef() && !Op.isDead())
8284 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8285 if (Op.isUse())
8286 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8287 }
8288 }
8289 Inst.eraseFromParent();
8290 Register NewDstReg;
8291 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8292 Register DstReg = NewInstr->getOperand(0).getReg();
8293 assert(DstReg.isVirtual());
8294 // Update the destination register class.
8295 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8296 assert(NewDstRC);
8297 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8298 MRI.replaceRegWith(DstReg, NewDstReg);
8299 }
8300 fixImplicitOperands(*NewInstr);
8301
8302 legalizeOperandsVALUt16(*NewInstr, MRI);
8303
8304 // Legalize the operands
8305 legalizeOperands(*NewInstr, MDT);
8306 if (NewDstReg)
8307 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8308}
8309
8310// Add/sub require special handling to deal with carry outs.
8311std::pair<bool, MachineBasicBlock *>
8312SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8313 MachineDominatorTree *MDT) const {
8314 if (ST.hasAddNoCarry()) {
8315 // Assume there is no user of scc since we don't select this in that case.
8316 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8317 // is used.
8318
8319 MachineBasicBlock &MBB = *Inst.getParent();
8320 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8321
8322 Register OldDstReg = Inst.getOperand(0).getReg();
8323 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8324
8325 unsigned Opc = Inst.getOpcode();
8326 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8327
8328 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8329 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8330
8331 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8332 Inst.removeOperand(3);
8333
8334 Inst.setDesc(get(NewOpc));
8335 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8336 Inst.addImplicitDefUseOperands(*MBB.getParent());
8337 MRI.replaceRegWith(OldDstReg, ResultReg);
8338 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8339
8340 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8341 return std::pair(true, NewBB);
8342 }
8343
8344 return std::pair(false, nullptr);
8345}
8346
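// Lower S_CSELECT_B32/B64 to V_CNDMASK. When the condition is SCC, the
// nearest SCC def is traced back: a copy to SCC lets us reuse its source,
// otherwise the wave mask is re-materialized with S_CSELECT -1, 0.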
8347void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8348 MachineDominatorTree *MDT) const {
8349
8350 MachineBasicBlock &MBB = *Inst.getParent();
8351 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8352 MachineBasicBlock::iterator MII = Inst;
8353 DebugLoc DL = Inst.getDebugLoc();
8354
8355 MachineOperand &Dest = Inst.getOperand(0);
8356 MachineOperand &Src0 = Inst.getOperand(1);
8357 MachineOperand &Src1 = Inst.getOperand(2);
8358 MachineOperand &Cond = Inst.getOperand(3);
8359
8360 Register CondReg = Cond.getReg();
8361 bool IsSCC = (CondReg == AMDGPU::SCC);
8362
8363 // If this is a trivial select where the condition is effectively not SCC
8364 // (CondReg is a source of copy to SCC), then the select is semantically
8365 // equivalent to copying CondReg. Hence, there is no need to create
8366 // V_CNDMASK, we can just use that and bail out.
8367 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8368 (Src1.getImm() == 0)) {
8369 MRI.replaceRegWith(Dest.getReg(), CondReg);
8370 return;
8371 }
8372
8373 Register NewCondReg = CondReg;
8374 if (IsSCC) {
8375 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8376 NewCondReg = MRI.createVirtualRegister(TC);
8377
8378 // Now look for the closest SCC def if it is a copy
8379 // replacing the CondReg with the COPY source register
8380 bool CopyFound = false;
8381 for (MachineInstr &CandI :
8382 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
8383 Inst.getParent()->rend())) {
8384 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8385 -1) {
8386 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8387 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8388 .addReg(CandI.getOperand(1).getReg());
8389 CopyFound = true;
8390 }
8391 break;
8392 }
8393 }
8394 if (!CopyFound) {
8395 // SCC def is not a copy
8396 // Insert a trivial select instead of creating a copy, because a copy from
8397 // SCC would semantically mean just copying a single bit, but we may need
8398 // the result to be a vector condition mask that needs preserving.
8399 unsigned Opcode =
8400 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8401 auto NewSelect =
8402 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8403 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8404 }
8405 }
8406
8407 Register NewDestReg = MRI.createVirtualRegister(
8408 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8409 MachineInstr *NewInst;
8410 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8411 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8412 .addImm(0)
8413 .add(Src1) // False
8414 .addImm(0)
8415 .add(Src0) // True
8416 .addReg(NewCondReg);
8417 } else {
8418 NewInst =
8419 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8420 .add(Src1) // False
8421 .add(Src0) // True
8422 .addReg(NewCondReg);
8423 }
8424 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8425 legalizeOperands(*NewInst, MDT);
8426 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8427}
8428
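// Lower S_ABS_I32 on the VALU as max(x, 0 - x), using V_SUB and V_MAX_I32.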
8429void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8430 MachineInstr &Inst) const {
8431 MachineBasicBlock &MBB = *Inst.getParent();
8432 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8433 MachineBasicBlock::iterator MII = Inst;
8434 DebugLoc DL = Inst.getDebugLoc();
8435
8436 MachineOperand &Dest = Inst.getOperand(0);
8437 MachineOperand &Src = Inst.getOperand(1);
8438 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8439 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8440
8441 unsigned SubOp = ST.hasAddNoCarry() ?
8442 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8443
8444 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8445 .addImm(0)
8446 .addReg(Src.getReg());
8447
8448 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8449 .addReg(Src.getReg())
8450 .addReg(TmpReg);
8451
8452 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8453 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8454}
8455
8456void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8457 MachineInstr &Inst) const {
8458 MachineBasicBlock &MBB = *Inst.getParent();
8459 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8460 MachineBasicBlock::iterator MII = Inst;
8461 const DebugLoc &DL = Inst.getDebugLoc();
8462
8463 MachineOperand &Dest = Inst.getOperand(0);
8464 MachineOperand &Src0 = Inst.getOperand(1);
8465 MachineOperand &Src1 = Inst.getOperand(2);
8466
8467 if (ST.hasDLInsts()) {
8468 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8469 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8470 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8471
8472 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8473 .add(Src0)
8474 .add(Src1);
8475
8476 MRI.replaceRegWith(Dest.getReg(), NewDest);
8477 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8478 } else {
8479 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8480 // invert either source and then perform the XOR. If either source is a
8481 // scalar register, then we can leave the inversion on the scalar unit to
8482 // achieve a better distribution of scalar and vector instructions.
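// A quick per-bit check of the identity used above (~ acts on a single bit,
// so ~0 = 1 and ~1 = 0):
//   x y | ~(x^y)  (~x)^y  x^(~y)
//   0 0 |   1       1       1
//   0 1 |   0       0       0
//   1 0 |   0       0       0
//   1 1 |   1       1       1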
8483 bool Src0IsSGPR = Src0.isReg() &&
8484 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8485 bool Src1IsSGPR = Src1.isReg() &&
8486 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8487 MachineInstr *Xor;
8488 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8489 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8490
8491 // Build a pair of scalar instructions and add them to the work list.
8492 // The next iteration over the work list will lower these to the vector
8493 // unit as necessary.
8494 if (Src0IsSGPR) {
8495 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8496 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8497 .addReg(Temp)
8498 .add(Src1);
8499 } else if (Src1IsSGPR) {
8500 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8501 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8502 .add(Src0)
8503 .addReg(Temp);
8504 } else {
8505 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8506 .add(Src0)
8507 .add(Src1);
8508 MachineInstr *Not =
8509 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8510 Worklist.insert(Not);
8511 }
8512
8513 MRI.replaceRegWith(Dest.getReg(), NewDest);
8514
8515 Worklist.insert(Xor);
8516
8517 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8518 }
8519}
8520
8521void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8522 MachineInstr &Inst,
8523 unsigned Opcode) const {
8524 MachineBasicBlock &MBB = *Inst.getParent();
8525 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8526 MachineBasicBlock::iterator MII = Inst;
8527 const DebugLoc &DL = Inst.getDebugLoc();
8528
8529 MachineOperand &Dest = Inst.getOperand(0);
8530 MachineOperand &Src0 = Inst.getOperand(1);
8531 MachineOperand &Src1 = Inst.getOperand(2);
8532
8533 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8534 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8535
8536 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8537 .add(Src0)
8538 .add(Src1);
8539
8540 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8541 .addReg(Interm);
8542
8543 Worklist.insert(&Op);
8544 Worklist.insert(&Not);
8545
8546 MRI.replaceRegWith(Dest.getReg(), NewDest);
8547 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8548}
8549
8550void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8551 MachineInstr &Inst,
8552 unsigned Opcode) const {
8553 MachineBasicBlock &MBB = *Inst.getParent();
8554 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8555 MachineBasicBlock::iterator MII = Inst;
8556 const DebugLoc &DL = Inst.getDebugLoc();
8557
8558 MachineOperand &Dest = Inst.getOperand(0);
8559 MachineOperand &Src0 = Inst.getOperand(1);
8560 MachineOperand &Src1 = Inst.getOperand(2);
8561
8562 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8563 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8564
8565 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8566 .add(Src1);
8567
8568 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8569 .add(Src0)
8570 .addReg(Interm);
8571
8572 Worklist.insert(&Not);
8573 Worklist.insert(&Op);
8574
8575 MRI.replaceRegWith(Dest.getReg(), NewDest);
8576 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8577}
8578
8579void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8580 MachineInstr &Inst, unsigned Opcode,
8581 bool Swap) const {
8582 MachineBasicBlock &MBB = *Inst.getParent();
8583 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8584
8585 MachineOperand &Dest = Inst.getOperand(0);
8586 MachineOperand &Src0 = Inst.getOperand(1);
8587 DebugLoc DL = Inst.getDebugLoc();
8588
8589 MachineBasicBlock::iterator MII = Inst;
8590
8591 const MCInstrDesc &InstDesc = get(Opcode);
8592 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8593 MRI.getRegClass(Src0.getReg()) :
8594 &AMDGPU::SGPR_32RegClass;
8595
8596 const TargetRegisterClass *Src0SubRC =
8597 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8598
8599 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8600 AMDGPU::sub0, Src0SubRC);
8601
8602 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8603 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8604 const TargetRegisterClass *NewDestSubRC =
8605 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8606
8607 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8608 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8609
8610 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8611 AMDGPU::sub1, Src0SubRC);
8612
8613 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8614 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8615
8616 if (Swap)
8617 std::swap(DestSub0, DestSub1);
8618
8619 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8620 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8621 .addReg(DestSub0)
8622 .addImm(AMDGPU::sub0)
8623 .addReg(DestSub1)
8624 .addImm(AMDGPU::sub1);
8625
8626 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8627
8628 Worklist.insert(&LoHalf);
8629 Worklist.insert(&HiHalf);
8630
8631 // We don't need to legalizeOperands here because for a single operand, src0
8632 // will support any kind of input.
8633
8634 // Move all users of this moved value.
8635 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8636}
8637
8638// There is no vector equivalent of s_mul_u64. For this reason, we need to
8639// split the s_mul_u64 into 32-bit vector multiplications.
8640void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8641 MachineInstr &Inst,
8642 MachineDominatorTree *MDT) const {
8643 MachineBasicBlock &MBB = *Inst.getParent();
8644 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8645
8646 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8647 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8648 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8649
8650 MachineOperand &Dest = Inst.getOperand(0);
8651 MachineOperand &Src0 = Inst.getOperand(1);
8652 MachineOperand &Src1 = Inst.getOperand(2);
8653 const DebugLoc &DL = Inst.getDebugLoc();
8654 MachineBasicBlock::iterator MII = Inst;
8655
8656 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8657 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8658 const TargetRegisterClass *Src0SubRC =
8659 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8660 if (RI.isSGPRClass(Src0SubRC))
8661 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8662 const TargetRegisterClass *Src1SubRC =
8663 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8664 if (RI.isSGPRClass(Src1SubRC))
8665 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8666
8667 // First, we extract the low 32-bit and high 32-bit values from each of the
8668 // operands.
8669 MachineOperand Op0L =
8670 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8671 MachineOperand Op1L =
8672 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8673 MachineOperand Op0H =
8674 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8675 MachineOperand Op1H =
8676 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8677
8678 // The multiplication is done as follows:
8679 //
8680 // Op1H Op1L
8681 // * Op0H Op0L
8682 // --------------------
8683 // Op1H*Op0L Op1L*Op0L
8684 // + Op1H*Op0H Op1L*Op0H
8685 // -----------------------------------------
8686 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8687 //
8688 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
8689 // value and that would overflow.
8690 // The low 32-bit value is Op1L*Op0L.
8691 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
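// Worked example (operand values chosen purely for illustration):
//   Src0 = 0x0000000100000002 (Op0H = 1, Op0L = 2)
//   Src1 = 0x0000000300000004 (Op1H = 3, Op1L = 4)
//   Op1L*Op0L = 8, carry = mul_hi(4, 2) = 0     -> low 32 bits  = 0x00000008
//   Op1H*Op0L + Op1L*Op0H + carry = 6 + 4 + 0   -> high 32 bits = 0x0000000A
//   Result = 0x0000000A00000008, the low 64 bits of the full 128-bit product.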
8692
8693 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8694 MachineInstr *Op1L_Op0H =
8695 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8696 .add(Op1L)
8697 .add(Op0H);
8698
8699 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8700 MachineInstr *Op1H_Op0L =
8701 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8702 .add(Op1H)
8703 .add(Op0L);
8704
8705 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8706 MachineInstr *Carry =
8707 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8708 .add(Op1L)
8709 .add(Op0L);
8710
8711 MachineInstr *LoHalf =
8712 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8713 .add(Op1L)
8714 .add(Op0L);
8715
8716 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8717 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8718 .addReg(Op1L_Op0H_Reg)
8719 .addReg(Op1H_Op0L_Reg);
8720
8721 MachineInstr *HiHalf =
8722 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8723 .addReg(AddReg)
8724 .addReg(CarryReg);
8725
8726 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8727 .addReg(DestSub0)
8728 .addImm(AMDGPU::sub0)
8729 .addReg(DestSub1)
8730 .addImm(AMDGPU::sub1);
8731
8732 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8733
8734 // Try to legalize the operands in case we need to swap the order to keep it
8735 // valid.
8736 legalizeOperands(*Op1L_Op0H, MDT);
8737 legalizeOperands(*Op1H_Op0L, MDT);
8738 legalizeOperands(*Carry, MDT);
8739 legalizeOperands(*LoHalf, MDT);
8740 legalizeOperands(*Add, MDT);
8741 legalizeOperands(*HiHalf, MDT);
8742
8743 // Move all users of this moved value.
8744 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8745}
8746
8747// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8748// multiplications.
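// Only the low halves of the operands are read below: these pseudos assume the
// high halves are just the zero/sign extension of the low 32 bits, so
//   lo32(result) = V_MUL_LO_U32(Op0L, Op1L)
//   hi32(result) = V_MUL_HI_U32(Op0L, Op1L)   (V_MUL_HI_I32 for the signed form)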
8749void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8750 MachineInstr &Inst,
8751 MachineDominatorTree *MDT) const {
8752 MachineBasicBlock &MBB = *Inst.getParent();
8753 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8754
8755 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8756 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8757 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8758
8759 MachineOperand &Dest = Inst.getOperand(0);
8760 MachineOperand &Src0 = Inst.getOperand(1);
8761 MachineOperand &Src1 = Inst.getOperand(2);
8762 const DebugLoc &DL = Inst.getDebugLoc();
8763 MachineBasicBlock::iterator MII = Inst;
8764
8765 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8766 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8767 const TargetRegisterClass *Src0SubRC =
8768 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8769 if (RI.isSGPRClass(Src0SubRC))
8770 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8771 const TargetRegisterClass *Src1SubRC =
8772 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8773 if (RI.isSGPRClass(Src1SubRC))
8774 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8775
8776 // First, we extract the low 32-bit and high 32-bit values from each of the
8777 // operands.
8778 MachineOperand Op0L =
8779 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8780 MachineOperand Op1L =
8781 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8782
8783 unsigned Opc = Inst.getOpcode();
8784 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8785 ? AMDGPU::V_MUL_HI_U32_e64
8786 : AMDGPU::V_MUL_HI_I32_e64;
8787 MachineInstr *HiHalf =
8788 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8789
8790 MachineInstr *LoHalf =
8791 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8792 .add(Op1L)
8793 .add(Op0L);
8794
8795 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8796 .addReg(DestSub0)
8797 .addImm(AMDGPU::sub0)
8798 .addReg(DestSub1)
8799 .addImm(AMDGPU::sub1);
8800
8801 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8802
8803 // Try to legalize the operands in case we need to swap the order to keep it
8804 // valid.
8805 legalizeOperands(*HiHalf, MDT);
8806 legalizeOperands(*LoHalf, MDT);
8807
8808 // Move all users of this moved value.
8809 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8810}
8811
8812void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8813 MachineInstr &Inst, unsigned Opcode,
8814 MachineDominatorTree *MDT) const {
8815 MachineBasicBlock &MBB = *Inst.getParent();
8816 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8817
8818 MachineOperand &Dest = Inst.getOperand(0);
8819 MachineOperand &Src0 = Inst.getOperand(1);
8820 MachineOperand &Src1 = Inst.getOperand(2);
8821 DebugLoc DL = Inst.getDebugLoc();
8822
8823 MachineBasicBlock::iterator MII = Inst;
8824
8825 const MCInstrDesc &InstDesc = get(Opcode);
8826 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8827 MRI.getRegClass(Src0.getReg()) :
8828 &AMDGPU::SGPR_32RegClass;
8829
8830 const TargetRegisterClass *Src0SubRC =
8831 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8832 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8833 MRI.getRegClass(Src1.getReg()) :
8834 &AMDGPU::SGPR_32RegClass;
8835
8836 const TargetRegisterClass *Src1SubRC =
8837 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8838
8839 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8840 AMDGPU::sub0, Src0SubRC);
8841 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8842 AMDGPU::sub0, Src1SubRC);
8843 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8844 AMDGPU::sub1, Src0SubRC);
8845 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8846 AMDGPU::sub1, Src1SubRC);
8847
8848 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8849 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8850 const TargetRegisterClass *NewDestSubRC =
8851 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8852
8853 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8854 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8855 .add(SrcReg0Sub0)
8856 .add(SrcReg1Sub0);
8857
8858 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8859 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8860 .add(SrcReg0Sub1)
8861 .add(SrcReg1Sub1);
8862
8863 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8864 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8865 .addReg(DestSub0)
8866 .addImm(AMDGPU::sub0)
8867 .addReg(DestSub1)
8868 .addImm(AMDGPU::sub1);
8869
8870 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8871
8872 Worklist.insert(&LoHalf);
8873 Worklist.insert(&HiHalf);
8874
8875 // Move all users of this moved value.
8876 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8877}
8878
8879void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8880 MachineInstr &Inst,
8881 MachineDominatorTree *MDT) const {
8882 MachineBasicBlock &MBB = *Inst.getParent();
8883 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8884
8885 MachineOperand &Dest = Inst.getOperand(0);
8886 MachineOperand &Src0 = Inst.getOperand(1);
8887 MachineOperand &Src1 = Inst.getOperand(2);
8888 const DebugLoc &DL = Inst.getDebugLoc();
8889
8890 MachineBasicBlock::iterator MII = Inst;
8891
8892 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8893
8894 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8895
8896 MachineOperand* Op0;
8897 MachineOperand* Op1;
8898
8899 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8900 Op0 = &Src0;
8901 Op1 = &Src1;
8902 } else {
8903 Op0 = &Src1;
8904 Op1 = &Src0;
8905 }
8906
8907 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8908 .add(*Op0);
8909
8910 Register NewDest = MRI.createVirtualRegister(DestRC);
8911
8912 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8913 .addReg(Interm)
8914 .add(*Op1);
8915
8916 MRI.replaceRegWith(Dest.getReg(), NewDest);
8917
8918 Worklist.insert(&Xor);
8919}
8920
8921void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8922 MachineInstr &Inst) const {
8923 MachineBasicBlock &MBB = *Inst.getParent();
8924 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8925
8926 MachineBasicBlock::iterator MII = Inst;
8927 const DebugLoc &DL = Inst.getDebugLoc();
8928
8929 MachineOperand &Dest = Inst.getOperand(0);
8930 MachineOperand &Src = Inst.getOperand(1);
8931
8932 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8933 const TargetRegisterClass *SrcRC = Src.isReg() ?
8934 MRI.getRegClass(Src.getReg()) :
8935 &AMDGPU::SGPR_32RegClass;
8936
8937 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8938 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8939
8940 const TargetRegisterClass *SrcSubRC =
8941 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8942
8943 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8944 AMDGPU::sub0, SrcSubRC);
8945 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8946 AMDGPU::sub1, SrcSubRC);
8947
8948 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8949
8950 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8951
8952 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8953
8954 // We don't need to legalize operands here. src0 for either instruction can be
8955 // an SGPR, and the second input is unused or determined here.
8956 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8957}
8958
8959void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8960 MachineInstr &Inst) const {
8961 MachineBasicBlock &MBB = *Inst.getParent();
8962 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8963 MachineBasicBlock::iterator MII = Inst;
8964 const DebugLoc &DL = Inst.getDebugLoc();
8965
8966 MachineOperand &Dest = Inst.getOperand(0);
8967 uint32_t Imm = Inst.getOperand(2).getImm();
8968 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8969 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
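// For instance, a sign-extension of the low 16 bits (the S_BFE_I64 form of
// sext_inreg ... i16) is encoded as Imm = 16 << 16 = 0x100000, i.e. Offset = 0
// and BitWidth = 16.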
8970
8971 (void) Offset;
8972
8973 // Only sext_inreg cases handled.
8974 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8975 Offset == 0 && "Not implemented");
8976
8977 if (BitWidth < 32) {
8978 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8979 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8980 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8981
8982 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8983 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8984 .addImm(0)
8985 .addImm(BitWidth);
8986
8987 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8988 .addImm(31)
8989 .addReg(MidRegLo);
8990
8991 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8992 .addReg(MidRegLo)
8993 .addImm(AMDGPU::sub0)
8994 .addReg(MidRegHi)
8995 .addImm(AMDGPU::sub1);
8996
8997 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8998 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8999 return;
9000 }
9001
9002 MachineOperand &Src = Inst.getOperand(1);
9003 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9004 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9005
9006 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
9007 .addImm(31)
9008 .addReg(Src.getReg(), 0, AMDGPU::sub0);
9009
9010 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9011 .addReg(Src.getReg(), 0, AMDGPU::sub0)
9012 .addImm(AMDGPU::sub0)
9013 .addReg(TmpReg)
9014 .addImm(AMDGPU::sub1);
9015
9016 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9017 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9018}
9019
9020void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
9021 MachineInstr &Inst, unsigned Opcode,
9022 MachineDominatorTree *MDT) const {
9023 // (S_FLBIT_I32_B64 hi:lo) ->
9024 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
9025 // (S_FF1_I32_B64 hi:lo) ->
9026 // -> (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
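// Quick check of the ctlz pattern when the high half is zero and the low half
// is not: V_FFBH_U32(hi) returns -1 (no bit found), while the clamped add
// yields V_FFBH_U32(lo) + 32, so the umin selects 32 + ctlz(lo), the correct
// count for the 64-bit value.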
9027
9028 MachineBasicBlock &MBB = *Inst.getParent();
9029 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9030 MachineBasicBlock::iterator MII = Inst;
9031 const DebugLoc &DL = Inst.getDebugLoc();
9032
9033 MachineOperand &Dest = Inst.getOperand(0);
9034 MachineOperand &Src = Inst.getOperand(1);
9035
9036 const MCInstrDesc &InstDesc = get(Opcode);
9037
9038 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9039 unsigned OpcodeAdd =
9040 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
9041
9042 const TargetRegisterClass *SrcRC =
9043 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9044 const TargetRegisterClass *SrcSubRC =
9045 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9046
9047 MachineOperand SrcRegSub0 =
9048 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
9049 MachineOperand SrcRegSub1 =
9050 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
9051
9052 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9053 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9054 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9055 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9056
9057 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
9058
9059 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
9060
9061 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
9062 .addReg(IsCtlz ? MidReg1 : MidReg2)
9063 .addImm(32)
9064 .addImm(1); // enable clamp
9065
9066 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9067 .addReg(MidReg3)
9068 .addReg(IsCtlz ? MidReg2 : MidReg1);
9069
9070 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9071
9072 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9073}
9074
9075void SIInstrInfo::addUsersToMoveToVALUWorklist(
9077 SIInstrWorklist &Worklist) const {
9078 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9079 MachineInstr &UseMI = *MO.getParent();
9080
9081 unsigned OpNo = 0;
9082
9083 switch (UseMI.getOpcode()) {
9084 case AMDGPU::COPY:
9085 case AMDGPU::WQM:
9086 case AMDGPU::SOFT_WQM:
9087 case AMDGPU::STRICT_WWM:
9088 case AMDGPU::STRICT_WQM:
9089 case AMDGPU::REG_SEQUENCE:
9090 case AMDGPU::PHI:
9091 case AMDGPU::INSERT_SUBREG:
9092 break;
9093 default:
9094 OpNo = MO.getOperandNo();
9095 break;
9096 }
9097
9098 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9099 MRI.constrainRegClass(DstReg, OpRC);
9100
9101 if (!RI.hasVectorRegisters(OpRC))
9102 Worklist.insert(&UseMI);
9103 else
9104 // Legalization could change user list.
9106 }
9107}
9108
9109void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9111 MachineInstr &Inst) const {
9112 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9113 MachineBasicBlock *MBB = Inst.getParent();
9114 MachineOperand &Src0 = Inst.getOperand(1);
9115 MachineOperand &Src1 = Inst.getOperand(2);
9116 const DebugLoc &DL = Inst.getDebugLoc();
9117
9118 switch (Inst.getOpcode()) {
9119 case AMDGPU::S_PACK_LL_B32_B16: {
9120 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9121 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9122
9123 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9124 // 0.
9125 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9126 .addImm(0xffff);
9127
9128 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9129 .addReg(ImmReg, RegState::Kill)
9130 .add(Src0);
9131
9132 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9133 .add(Src1)
9134 .addImm(16)
9135 .addReg(TmpReg, RegState::Kill);
9136 break;
9137 }
9138 case AMDGPU::S_PACK_LH_B32_B16: {
9139 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9140 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9141 .addImm(0xffff);
9142 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9143 .addReg(ImmReg, RegState::Kill)
9144 .add(Src0)
9145 .add(Src1);
9146 break;
9147 }
9148 case AMDGPU::S_PACK_HL_B32_B16: {
9149 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9150 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9151 .addImm(16)
9152 .add(Src0);
9153 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9154 .add(Src1)
9155 .addImm(16)
9156 .addReg(TmpReg, RegState::Kill);
9157 break;
9158 }
9159 case AMDGPU::S_PACK_HH_B32_B16: {
9160 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9161 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9162 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9163 .addImm(16)
9164 .add(Src0);
9165 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9166 .addImm(0xffff0000);
9167 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9168 .add(Src1)
9169 .addReg(ImmReg, RegState::Kill)
9170 .addReg(TmpReg, RegState::Kill);
9171 break;
9172 }
9173 default:
9174 llvm_unreachable("unhandled s_pack_* instruction");
9175 }
9176
9177 MachineOperand &Dest = Inst.getOperand(0);
9178 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9179 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9180}
9181
9182void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
9183 MachineInstr &SCCDefInst,
9184 SIInstrWorklist &Worklist,
9185 Register NewCond) const {
9186
9187 // Ensure that def inst defines SCC, which is still live.
9188 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9189 !Op.isDead() && Op.getParent() == &SCCDefInst);
9190 SmallVector<MachineInstr *, 4> CopyToDelete;
9191 // This assumes that all the users of SCC are in the same block
9192 // as the SCC def.
9193 for (MachineInstr &MI : // Skip the def inst itself.
9194 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9195 SCCDefInst.getParent()->end())) {
9196 // Check if SCC is used first.
9197 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9198 if (SCCIdx != -1) {
9199 if (MI.isCopy()) {
9200 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9201 Register DestReg = MI.getOperand(0).getReg();
9202
9203 MRI.replaceRegWith(DestReg, NewCond);
9204 CopyToDelete.push_back(&MI);
9205 } else {
9206
9207 if (NewCond.isValid())
9208 MI.getOperand(SCCIdx).setReg(NewCond);
9209
9210 Worklist.insert(&MI);
9211 }
9212 }
9213 // Exit if we find another SCC def.
9214 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9215 break;
9216 }
9217 for (auto &Copy : CopyToDelete)
9218 Copy->eraseFromParent();
9219}
9220
9221// Instructions that use SCC may be converted to VALU instructions. When that
9222// happens, the SCC register is changed to VCC_LO. The instruction that defines
9223// SCC must be changed to an instruction that defines VCC. This function makes
9224// sure that the instruction that defines SCC is added to the moveToVALU
9225// worklist.
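// For example, once an S_CSELECT that read SCC has been rewritten to V_CNDMASK
// (see lowerSelect above), the S_CMP that produced SCC must also be moved to
// the VALU so that it becomes a V_CMP writing a wave-wide condition register
// instead of SCC.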
9226void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9227 SIInstrWorklist &Worklist) const {
9228 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9229 // then there is nothing to do because the defining instruction has been
9230 // converted to a VALU already. If SCC then that instruction needs to be
9231 // converted to a VALU.
9232 for (MachineInstr &MI :
9233 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9234 SCCUseInst->getParent()->rend())) {
9235 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9236 break;
9237 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9238 Worklist.insert(&MI);
9239 break;
9240 }
9241 }
9242}
9243
9244const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9245 const MachineInstr &Inst) const {
9246 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9247
9248 switch (Inst.getOpcode()) {
9249 // For target instructions, getOpRegClass just returns the virtual register
9250 // class associated with the operand, so we need to find an equivalent VGPR
9251 // register class in order to move the instruction to the VALU.
9252 case AMDGPU::COPY:
9253 case AMDGPU::PHI:
9254 case AMDGPU::REG_SEQUENCE:
9255 case AMDGPU::INSERT_SUBREG:
9256 case AMDGPU::WQM:
9257 case AMDGPU::SOFT_WQM:
9258 case AMDGPU::STRICT_WWM:
9259 case AMDGPU::STRICT_WQM: {
9260 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9261 if (RI.isAGPRClass(SrcRC)) {
9262 if (RI.isAGPRClass(NewDstRC))
9263 return nullptr;
9264
9265 switch (Inst.getOpcode()) {
9266 case AMDGPU::PHI:
9267 case AMDGPU::REG_SEQUENCE:
9268 case AMDGPU::INSERT_SUBREG:
9269 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9270 break;
9271 default:
9272 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9273 }
9274
9275 if (!NewDstRC)
9276 return nullptr;
9277 } else {
9278 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9279 return nullptr;
9280
9281 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9282 if (!NewDstRC)
9283 return nullptr;
9284 }
9285
9286 return NewDstRC;
9287 }
9288 default:
9289 return NewDstRC;
9290 }
9291}
9292
9293// Find the one SGPR operand we are allowed to use.
9294Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9295 int OpIndices[3]) const {
9296 const MCInstrDesc &Desc = MI.getDesc();
9297
9298 // Find the one SGPR operand we are allowed to use.
9299 //
9300 // First we need to consider the instruction's operand requirements before
9301 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9302 // of VCC, but we are still bound by the constant bus requirement to only use
9303 // one.
9304 //
9305 // If the operand's class is an SGPR, we can never move it.
9306
9307 Register SGPRReg = findImplicitSGPRRead(MI);
9308 if (SGPRReg)
9309 return SGPRReg;
9310
9311 Register UsedSGPRs[3] = {Register()};
9312 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9313
9314 for (unsigned i = 0; i < 3; ++i) {
9315 int Idx = OpIndices[i];
9316 if (Idx == -1)
9317 break;
9318
9319 const MachineOperand &MO = MI.getOperand(Idx);
9320 if (!MO.isReg())
9321 continue;
9322
9323 // Is this operand statically required to be an SGPR based on the operand
9324 // constraints?
9325 const TargetRegisterClass *OpRC =
9326 RI.getRegClass(Desc.operands()[Idx].RegClass);
9327 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9328 if (IsRequiredSGPR)
9329 return MO.getReg();
9330
9331 // If this could be a VGPR or an SGPR, check the dynamic register class.
9332 Register Reg = MO.getReg();
9333 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9334 if (RI.isSGPRClass(RegRC))
9335 UsedSGPRs[i] = Reg;
9336 }
9337
9338 // We don't have a required SGPR operand, so we have a bit more freedom in
9339 // selecting operands to move.
9340
9341 // Try to select the most used SGPR. If an SGPR is equal to one of the
9342 // others, we choose that.
9343 //
9344 // e.g.
9345 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9346 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9347
9348 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9349 // prefer those.
9350
9351 if (UsedSGPRs[0]) {
9352 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9353 SGPRReg = UsedSGPRs[0];
9354 }
9355
9356 if (!SGPRReg && UsedSGPRs[1]) {
9357 if (UsedSGPRs[1] == UsedSGPRs[2])
9358 SGPRReg = UsedSGPRs[1];
9359 }
9360
9361 return SGPRReg;
9362}
9363
9365 AMDGPU::OpName OperandName) const {
9366 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9367 return nullptr;
9368
9369 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9370 if (Idx == -1)
9371 return nullptr;
9372
9373 return &MI.getOperand(Idx);
9374}
9375
9377 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9378 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9381 return (Format << 44) |
9382 (1ULL << 56) | // RESOURCE_LEVEL = 1
9383 (3ULL << 60); // OOB_SELECT = 3
9384 }
9385
9386 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9387 if (ST.isAmdHsaOS()) {
9388 // Set ATC = 1. GFX9 doesn't have this bit.
9389 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9390 RsrcDataFormat |= (1ULL << 56);
9391
9392 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9393 // BTW, it disables TC L2 and therefore decreases performance.
9394 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9395 RsrcDataFormat |= (2ULL << 59);
9396 }
9397
9398 return RsrcDataFormat;
9399}
9400
9404 0xffffffff; // Size;
9405
9406 // GFX9 doesn't have ELEMENT_SIZE.
9407 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9408 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9409 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9410 }
9411
9412 // IndexStride = 64 for wave64 or 32 for wave32, encoded below as 3 or 2.
9413 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9414 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9415
9416 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9417 // Clear them unless we want a huge stride.
9418 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9419 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9420 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9421
9422 return Rsrc23;
9423}
9424
9426 unsigned Opc = MI.getOpcode();
9427
9428 return isSMRD(Opc);
9429}
9430
9432 return get(Opc).mayLoad() &&
9433 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9434}
9435
9437 int &FrameIndex) const {
9438 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9439 if (!Addr || !Addr->isFI())
9440 return Register();
9441
9442 assert(!MI.memoperands_empty() &&
9443 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9444
9445 FrameIndex = Addr->getIndex();
9446 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
9447}
9448
9450 int &FrameIndex) const {
9451 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9452 assert(Addr && Addr->isFI());
9453 FrameIndex = Addr->getIndex();
9454 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
9455}
9456
9458 int &FrameIndex) const {
9459 if (!MI.mayLoad())
9460 return Register();
9461
9462 if (isMUBUF(MI) || isVGPRSpill(MI))
9463 return isStackAccess(MI, FrameIndex);
9464
9465 if (isSGPRSpill(MI))
9466 return isSGPRStackAccess(MI, FrameIndex);
9467
9468 return Register();
9469}
9470
9472 int &FrameIndex) const {
9473 if (!MI.mayStore())
9474 return Register();
9475
9476 if (isMUBUF(MI) || isVGPRSpill(MI))
9477 return isStackAccess(MI, FrameIndex);
9478
9479 if (isSGPRSpill(MI))
9480 return isSGPRStackAccess(MI, FrameIndex);
9481
9482 return Register();
9483}
9484
9486 unsigned Size = 0;
9488 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9489 while (++I != E && I->isInsideBundle()) {
9490 assert(!I->isBundle() && "No nested bundle!");
9492 }
9493
9494 return Size;
9495}
9496
9498 unsigned Opc = MI.getOpcode();
9500 unsigned DescSize = Desc.getSize();
9501
9502 // If we have a definitive size, we can use it. Otherwise we need to inspect
9503 // the operands to know the size.
9504 if (isFixedSize(MI)) {
9505 unsigned Size = DescSize;
9506
9507 // If we hit the buggy offset, an extra nop will be inserted in MC, so
9508 // estimate the worst case.
9509 if (MI.isBranch() && ST.hasOffset3fBug())
9510 Size += 4;
9511
9512 return Size;
9513 }
9514
9515 // Instructions may have a 32-bit literal encoded after them. Check
9516 // operands that could ever be literals.
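// For instance, a VOP2 V_ADD_F32_e32 whose source is the literal 0x12345678
// (not an inline constant) takes the 4-byte instruction word plus a 4-byte
// literal dword, i.e. 8 bytes in total.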
9517 if (isVALU(MI) || isSALU(MI)) {
9518 if (isDPP(MI))
9519 return DescSize;
9520 bool HasLiteral = false;
9521 unsigned LiteralSize = 4;
9522 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9523 const MachineOperand &Op = MI.getOperand(I);
9524 const MCOperandInfo &OpInfo = Desc.operands()[I];
9525 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9526 HasLiteral = true;
9527 if (ST.has64BitLiterals()) {
9528 switch (OpInfo.OperandType) {
9529 default:
9530 break;
9532 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9533 LiteralSize = 8;
9534 break;
9536 if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
9537 LiteralSize = 8;
9538 break;
9539 }
9540 }
9541 break;
9542 }
9543 }
9544 return HasLiteral ? DescSize + LiteralSize : DescSize;
9545 }
9546
9547 // Check whether we have extra NSA words.
9548 if (isMIMG(MI)) {
9549 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9550 if (VAddr0Idx < 0)
9551 return 8;
9552
9553 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9554 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9555 }
9556
9557 switch (Opc) {
9558 case TargetOpcode::BUNDLE:
9559 return getInstBundleSize(MI);
9560 case TargetOpcode::INLINEASM:
9561 case TargetOpcode::INLINEASM_BR: {
9562 const MachineFunction *MF = MI.getParent()->getParent();
9563 const char *AsmStr = MI.getOperand(0).getSymbolName();
9564 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9565 }
9566 default:
9567 if (MI.isMetaInstruction())
9568 return 0;
9569
9570 // If D16 Pseudo inst, get correct MC code size
9571 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9572 if (D16Info) {
9573 // Assume the d16_lo and d16_hi variants are always the same size.
9574 unsigned LoInstOpcode = D16Info->LoOp;
9575 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9576 DescSize = Desc.getSize();
9577 }
9578
9579 // If FMA Pseudo inst, get correct MC code size
9580 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9581 // All potential lowerings are the same size; arbitrarily pick one.
9582 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
9583 DescSize = Desc.getSize();
9584 }
9585
9586 return DescSize;
9587 }
9588}
9589
9591 if (!isFLAT(MI))
9592 return false;
9593
9594 if (MI.memoperands_empty())
9595 return true;
9596
9597 for (const MachineMemOperand *MMO : MI.memoperands()) {
9598 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9599 return true;
9600 }
9601 return false;
9602}
9603
9606 static const std::pair<int, const char *> TargetIndices[] = {
9607 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9608 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9609 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9610 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9611 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9612 return ArrayRef(TargetIndices);
9613}
9614
9615/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
9616/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9622
9623/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9624/// pass.
9629
9630// Called during:
9631// - pre-RA scheduling and post-RA scheduling
9634 const ScheduleDAGMI *DAG) const {
9635 // Borrowed from Arm Target
9636 // We would like to restrict this hazard recognizer to only
9637 // post-RA scheduling; we can tell that we're post-RA because we don't
9638 // track VRegLiveness.
9639 if (!DAG->hasVRegLiveness())
9640 return new GCNHazardRecognizer(DAG->MF);
9642}
9643
9644std::pair<unsigned, unsigned>
9646 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9647}
9648
9651 static const std::pair<unsigned, const char *> TargetFlags[] = {
9652 {MO_GOTPCREL, "amdgpu-gotprel"},
9653 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9654 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9655 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9656 {MO_REL32_LO, "amdgpu-rel32-lo"},
9657 {MO_REL32_HI, "amdgpu-rel32-hi"},
9658 {MO_REL64, "amdgpu-rel64"},
9659 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9660 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9661 {MO_ABS64, "amdgpu-abs64"},
9662 };
9663
9664 return ArrayRef(TargetFlags);
9665}
9666
9669 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9670 {
9671 {MONoClobber, "amdgpu-noclobber"},
9672 {MOLastUse, "amdgpu-last-use"},
9673 {MOCooperative, "amdgpu-cooperative"},
9674 };
9675
9676 return ArrayRef(TargetFlags);
9677}
9678
9680 const MachineFunction &MF) const {
9682 assert(SrcReg.isVirtual());
9683 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9684 return AMDGPU::WWM_COPY;
9685
9686 return AMDGPU::COPY;
9687}
9688
9690 Register Reg) const {
9691 // We need to handle instructions which may be inserted during register
9692 // allocation to handle the prolog. The initial prolog instruction may have
9693 // been separated from the start of the block by spills and copies inserted
9694 // for the prolog. However, the insertions for scalar registers can
9695 // always be placed at the BB top as they are independent of the exec mask
9696 // value.
9697 const MachineFunction *MF = MI.getParent()->getParent();
9698 bool IsNullOrVectorRegister = true;
9699 if (Reg) {
9700 const MachineRegisterInfo &MRI = MF->getRegInfo();
9701 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9702 }
9703
9704 uint16_t Opcode = MI.getOpcode();
9706 return IsNullOrVectorRegister &&
9707 (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
9708 (Opcode == AMDGPU::IMPLICIT_DEF &&
9709 MFI->isWWMReg(MI.getOperand(0).getReg())) ||
9710 (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
9711 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9712}
9713
9717 const DebugLoc &DL,
9718 Register DestReg) const {
9719 if (ST.hasAddNoCarry())
9720 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9721
9722 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9723 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9724 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9725
9726 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9727 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9728}
9729
9732 const DebugLoc &DL,
9733 Register DestReg,
9734 RegScavenger &RS) const {
9735 if (ST.hasAddNoCarry())
9736 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9737
9738 // If available, prefer to use vcc.
9739 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9740 ? Register(RI.getVCC())
9742 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9743 0, /* AllowSpill */ false);
9744
9745 // TODO: Users need to deal with this.
9746 if (!UnusedCarry.isValid())
9747 return MachineInstrBuilder();
9748
9749 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9750 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9751}
9752
9753bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9754 switch (Opcode) {
9755 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9756 case AMDGPU::SI_KILL_I1_TERMINATOR:
9757 return true;
9758 default:
9759 return false;
9760 }
9761}
9762
9764 switch (Opcode) {
9765 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9766 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9767 case AMDGPU::SI_KILL_I1_PSEUDO:
9768 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9769 default:
9770 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9771 }
9772}
9773
9774bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9775 return Imm <= getMaxMUBUFImmOffset(ST);
9776}
9777
9779 // The GFX12 field is a 24-bit signed byte offset, but only non-negative values are legal here, so 23 bits are usable.
9780 const unsigned OffsetBits =
9781 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9782 return (1 << OffsetBits) - 1;
9783}
9784
9786 if (!ST.isWave32())
9787 return;
9788
9789 if (MI.isInlineAsm())
9790 return;
9791
9792 for (auto &Op : MI.implicit_operands()) {
9793 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9794 Op.setReg(AMDGPU::VCC_LO);
9795 }
9796}
9797
9799 if (!isSMRD(MI))
9800 return false;
9801
9802 // Check that it is using a buffer resource.
9803 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9804 if (Idx == -1) // e.g. s_memtime
9805 return false;
9806
9807 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
9808 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9809}
9810
9811// Given Imm, split it into the values to put into the SOffset and ImmOffset
9812// fields in an MUBUF instruction. Return false if it is not possible (due to a
9813// hardware bug needing a workaround).
9814//
9815// The required alignment ensures that individual address components remain
9816// aligned if they are aligned to begin with. It also ensures that additional
9817// offsets within the given alignment can be added to the resulting ImmOffset.
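// Worked example, assuming a 12-bit immediate field (MaxOffset = 4095) and
// Align(4): for Imm = 5000, MaxImm = 4092 and 5000 > 4092 + 64, so
//   High = (5000 + 4) & ~4095 = 4096,  Low = (5000 + 4) & 4095 = 908,
// giving ImmOffset = 908 and SOffset = 4096 - 4 = 4092 (908 + 4092 == 5000).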
9819 uint32_t &ImmOffset, Align Alignment) const {
9820 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9821 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9822 uint32_t Overflow = 0;
9823
9824 if (Imm > MaxImm) {
9825 if (Imm <= MaxImm + 64) {
9826 // Use an SOffset inline constant for 4..64
9827 Overflow = Imm - MaxImm;
9828 Imm = MaxImm;
9829 } else {
9830 // Try to keep the same value in SOffset for adjacent loads, so that
9831 // the corresponding register contents can be re-used.
9832 //
9833 // Load values with all low-bits (except for alignment bits) set into
9834 // SOffset, so that a larger range of values can be covered using
9835 // s_movk_i32.
9836 //
9837 // Atomic operations fail to work correctly when individual address
9838 // components are unaligned, even if their sum is aligned.
9839 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9840 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9841 Imm = Low;
9842 Overflow = High - Alignment.value();
9843 }
9844 }
9845
9846 if (Overflow > 0) {
9847 // There is a hardware bug in SI and CI which prevents address clamping in
9848 // MUBUF instructions from working correctly with SOffsets. The immediate
9849 // offset is unaffected.
9850 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
9851 return false;
9852
9853 // It is not possible to set immediate in SOffset field on some targets.
9854 if (ST.hasRestrictedSOffset())
9855 return false;
9856 }
9857
9858 ImmOffset = Imm;
9859 SOffset = Overflow;
9860 return true;
9861}
9862
9863// Depending on the used address space and instructions, some immediate offsets
9864// are allowed and some are not.
9865// Pre-GFX12, flat instruction offsets can only be non-negative, global and
9866// scratch instruction offsets can also be negative. On GFX12, offsets can be
9867// negative for all variants.
9868//
9869// There are several bugs related to these offsets:
9870// On gfx10.1, flat instructions that go into the global address space cannot
9871// use an offset.
9872//
9873// For scratch instructions, the address can be either an SGPR or a VGPR.
9874// The following offsets can be used, depending on the architecture (x means
9875// cannot be used):
9876// +----------------------------+------+------+
9877// | Address-Mode | SGPR | VGPR |
9878// +----------------------------+------+------+
9879// | gfx9 | | |
9880// | negative, 4-aligned offset | x | ok |
9881// | negative, unaligned offset | x | ok |
9882// +----------------------------+------+------+
9883// | gfx10 | | |
9884// | negative, 4-aligned offset | ok | ok |
9885// | negative, unaligned offset | ok | x |
9886// +----------------------------+------+------+
9887// | gfx10.3 | | |
9888// | negative, 4-aligned offset | ok | ok |
9889// | negative, unaligned offset | ok | ok |
9890// +----------------------------+------+------+
9891//
9892// This function ignores the addressing mode, so if an offset cannot be used in
9893// one addressing mode, it is considered illegal.
9894bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9895 uint64_t FlatVariant) const {
9896 // TODO: Should 0 be special cased?
9897 if (!ST.hasFlatInstOffsets())
9898 return false;
9899
9900 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9901 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9902 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9903 return false;
9904
9905 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9906 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9907 (Offset % 4) != 0) {
9908 return false;
9909 }
9910
9911 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9912 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9913 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9914}
9915
9916// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
9917std::pair<int64_t, int64_t>
9918SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9919 uint64_t FlatVariant) const {
9920 int64_t RemainderOffset = COffsetVal;
9921 int64_t ImmField = 0;
9922
9923 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9924 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9925
9926 if (AllowNegative) {
9927 // Use signed division by a power of two to truncate towards 0.
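// For example, if NumBits were 12: COffsetVal = -5000 gives D = 4096,
// RemainderOffset = (-5000 / 4096) * 4096 = -4096 and ImmField = -904, which
// fits in the signed immediate field and sums back to the original offset.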
9928 int64_t D = 1LL << NumBits;
9929 RemainderOffset = (COffsetVal / D) * D;
9930 ImmField = COffsetVal - RemainderOffset;
9931
9932 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9933 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9934 (ImmField % 4) != 0) {
9935 // Make ImmField a multiple of 4
9936 RemainderOffset += ImmField % 4;
9937 ImmField -= ImmField % 4;
9938 }
9939 } else if (COffsetVal >= 0) {
9940 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9941 RemainderOffset = COffsetVal - ImmField;
9942 }
9943
9944 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9945 assert(RemainderOffset + ImmField == COffsetVal);
9946 return {ImmField, RemainderOffset};
9947}
9948
9950 if (ST.hasNegativeScratchOffsetBug() &&
9951 FlatVariant == SIInstrFlags::FlatScratch)
9952 return false;
9953
9954 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9955}
9956
9957static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9958 switch (ST.getGeneration()) {
9959 default:
9960 break;
9963 return SIEncodingFamily::SI;
9966 return SIEncodingFamily::VI;
9972 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
9974 }
9975 llvm_unreachable("Unknown subtarget generation!");
9976}
9977
9978bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9979 switch(MCOp) {
9980 // These opcodes use indirect register addressing so
9981 // they need special handling by codegen (currently missing).
9982 // Therefore it is too risky to allow these opcodes
9983 // to be selected by the DPP combiner or the SDWA peephole pass.
9984 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9985 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9986 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9987 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9988 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9989 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9990 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9991 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9992 return true;
9993 default:
9994 return false;
9995 }
9996}
9997
9998#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
9999 case OPCODE##_dpp: \
10000 case OPCODE##_e32: \
10001 case OPCODE##_e64: \
10002 case OPCODE##_e64_dpp: \
10003 case OPCODE##_sdwa:
10004
10005static bool isRenamedInGFX9(int Opcode) {
10006 switch (Opcode) {
10007 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10008 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10009 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10010 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10011 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10012 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10013 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10014 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10015 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10016 //
10017 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10018 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10019 case AMDGPU::V_FMA_F16_gfx9_e64:
10020 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10021 case AMDGPU::V_INTERP_P2_F16:
10022 case AMDGPU::V_MAD_F16_e64:
10023 case AMDGPU::V_MAD_U16_e64:
10024 case AMDGPU::V_MAD_I16_e64:
10025 return true;
10026 default:
10027 return false;
10028 }
10029}
10030
10031int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10032 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
10033
10034 unsigned Gen = subtargetEncodingFamily(ST);
10035
10036 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10038
10039 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10040 // subtarget has UnpackedD16VMem feature.
10041 // TODO: remove this when we discard GFX80 encoding.
10042 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10044
10045 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10046 switch (ST.getGeneration()) {
10047 default:
10049 break;
10052 break;
10055 break;
10056 }
10057 }
10058
10059 if (isMAI(Opcode)) {
10060 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10061 if (MFMAOp != -1)
10062 Opcode = MFMAOp;
10063 }
10064
10065 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10066
10067 if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
10069
10070 // -1 means that Opcode is already a native instruction.
10071 if (MCOp == -1)
10072 return Opcode;
10073
10074 if (ST.hasGFX90AInsts()) {
10075 uint16_t NMCOp = (uint16_t)-1;
10076 if (ST.hasGFX940Insts())
10078 if (NMCOp == (uint16_t)-1)
10080 if (NMCOp == (uint16_t)-1)
10082 if (NMCOp != (uint16_t)-1)
10083 MCOp = NMCOp;
10084 }
10085
10086 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
10087 // no encoding in the given subtarget generation.
10088 if (MCOp == (uint16_t)-1)
10089 return -1;
10090
10091 if (isAsmOnlyOpcode(MCOp))
10092 return -1;
10093
10094 return MCOp;
10095}
10096
10097static
10099 assert(RegOpnd.isReg());
10100 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10101 getRegSubRegPair(RegOpnd);
10102}
10103
10106 assert(MI.isRegSequence());
10107 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10108 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10109 auto &RegOp = MI.getOperand(1 + 2 * I);
10110 return getRegOrUndef(RegOp);
10111 }
10113}
10114
10115// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10116// Following a subreg of reg:subreg isn't supported
10119 if (!RSR.SubReg)
10120 return false;
10121 switch (MI.getOpcode()) {
10122 default: break;
10123 case AMDGPU::REG_SEQUENCE:
10124 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10125 return true;
10126 // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
10127 case AMDGPU::INSERT_SUBREG:
10128 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10129 // inserted the subreg we're looking for
10130 RSR = getRegOrUndef(MI.getOperand(2));
10131 else { // the subreg in the rest of the reg
10132 auto R1 = getRegOrUndef(MI.getOperand(1));
10133 if (R1.SubReg) // subreg of subreg isn't supported
10134 return false;
10135 RSR.Reg = R1.Reg;
10136 }
10137 return true;
10138 }
10139 return false;
10140}
10141
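// Resolve the defining instruction of a virtual reg:subreg pair in SSA form,
// looking through COPY, V_MOV_B32 and the pseudos handled by followSubRegDef.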
10142 MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
10143 MachineRegisterInfo &MRI) {
10144 assert(MRI.isSSA());
10145 if (!P.Reg.isVirtual())
10146 return nullptr;
10147
10148 auto RSR = P;
10149 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10150 while (auto *MI = DefInst) {
10151 DefInst = nullptr;
10152 switch (MI->getOpcode()) {
10153 case AMDGPU::COPY:
10154 case AMDGPU::V_MOV_B32_e32: {
10155 auto &Op1 = MI->getOperand(1);
10156 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10157 if (Op1.isUndef())
10158 return nullptr;
10159 RSR = getRegSubRegPair(Op1);
10160 DefInst = MRI.getVRegDef(RSR.Reg);
10161 }
10162 break;
10163 }
10164 default:
10165 if (followSubRegDef(*MI, RSR)) {
10166 if (!RSR.Reg)
10167 return nullptr;
10168 DefInst = MRI.getVRegDef(RSR.Reg);
10169 }
10170 }
10171 if (!DefInst)
10172 return MI;
10173 }
10174 return nullptr;
10175}
10176
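// Returns true (EXEC may change) unless DefMI and UseMI are in the same block
// and a bounded scan between them finds no write to EXEC.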
10177 bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
10178 Register VReg,
10179 const MachineInstr &DefMI,
10180 const MachineInstr &UseMI) {
10181 assert(MRI.isSSA() && "Must be run on SSA");
10182
10183 auto *TRI = MRI.getTargetRegisterInfo();
10184 auto *DefBB = DefMI.getParent();
10185
10186 // Don't bother searching between blocks, although it is possible this block
10187 // doesn't modify exec.
10188 if (UseMI.getParent() != DefBB)
10189 return true;
10190
10191 const int MaxInstScan = 20;
10192 int NumInst = 0;
10193
10194 // Stop scan at the use.
10195 auto E = UseMI.getIterator();
10196 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10197 if (I->isDebugInstr())
10198 continue;
10199
10200 if (++NumInst > MaxInstScan)
10201 return true;
10202
10203 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10204 return true;
10205 }
10206
10207 return false;
10208}
10209
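// As above, but all uses of VReg must be reached by a bounded scan of the
// defining block before EXEC can be assumed unmodified.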
10210 bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
10211 Register VReg,
10212 const MachineInstr &DefMI) {
10213 assert(MRI.isSSA() && "Must be run on SSA");
10214
10215 auto *TRI = MRI.getTargetRegisterInfo();
10216 auto *DefBB = DefMI.getParent();
10217
10218 const int MaxUseScan = 10;
10219 int NumUse = 0;
10220
10221 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10222 auto &UseInst = *Use.getParent();
10223 // Don't bother searching between blocks, although it is possible this block
10224 // doesn't modify exec.
10225 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10226 return true;
10227
10228 if (++NumUse > MaxUseScan)
10229 return true;
10230 }
10231
10232 if (NumUse == 0)
10233 return false;
10234
10235 const int MaxInstScan = 20;
10236 int NumInst = 0;
10237
10238 // Stop scan when we have seen all the uses.
10239 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10240 assert(I != DefBB->end());
10241
10242 if (I->isDebugInstr())
10243 continue;
10244
10245 if (++NumInst > MaxInstScan)
10246 return true;
10247
10248 for (const MachineOperand &Op : I->operands()) {
10249 // We don't check reg masks here as they're used only on calls:
10250 // 1. EXEC is only considered const within one BB
10251 // 2. Call should be a terminator instruction if present in a BB
10252
10253 if (!Op.isReg())
10254 continue;
10255
10256 Register Reg = Op.getReg();
10257 if (Op.isUse()) {
10258 if (Reg == VReg && --NumUse == 0)
10259 return false;
10260 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10261 return true;
10262 }
10263 }
10264}
10265
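// Place the PHI destination copy before the first non-PHI reader of Dst in
// this block, if one appears before LastPHIIt; otherwise use the default
// placement.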
10266 MachineInstr *SIInstrInfo::createPHIDestinationCopy(
10267 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
10268 const DebugLoc &DL, Register Src, Register Dst) const {
10269 auto Cur = MBB.begin();
10270 if (Cur != MBB.end())
10271 do {
10272 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10273 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10274 ++Cur;
10275 } while (Cur != MBB.end() && Cur != LastPHIIt);
10276
10277 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10278 Dst);
10279}
10280
10281 MachineInstr *SIInstrInfo::createPHISourceCopy(
10282 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
10283 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10284 if (InsPt != MBB.end() &&
10285 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10286 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10287 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10288 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10289 InsPt++;
10290 return BuildMI(MBB, InsPt, DL,
10291 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10292 .addReg(Src, 0, SrcSubReg)
10293 .addReg(AMDGPU::EXEC, RegState::Implicit);
10294 }
10295 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10296 Dst);
10297}
10298
10299bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10300
10301 MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
10302 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
10303 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10304 VirtRegMap *VRM) const {
10305 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10306 //
10307 // %0:sreg_32 = COPY $m0
10308 //
10309 // We explicitly chose SReg_32 for the virtual register so such a copy might
10310 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10311 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10312 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10313 // TargetInstrInfo::foldMemoryOperand() is going to try.
10314 // A similar issue also exists with spilling and reloading $exec registers.
10315 //
10316 // To prevent that, constrain the %0 register class here.
10317 if (isFullCopyInstr(MI)) {
10318 Register DstReg = MI.getOperand(0).getReg();
10319 Register SrcReg = MI.getOperand(1).getReg();
10320 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10321 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10322 MachineRegisterInfo &MRI = MF.getRegInfo();
10323 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10324 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10325 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10326 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10327 return nullptr;
10328 }
10329 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10330 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10331 return nullptr;
10332 }
10333 }
10334 }
10335
10336 return nullptr;
10337}
10338
10339 unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
10340 const MachineInstr &MI,
10341 unsigned *PredCost) const {
10342 if (MI.isBundle()) {
10343 MachineBasicBlock::const_instr_iterator I(MI.getIterator());
10344 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10345 unsigned Lat = 0, Count = 0;
10346 for (++I; I != E && I->isBundledWithPred(); ++I) {
10347 ++Count;
10348 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10349 }
10350 return Lat + Count - 1;
10351 }
10352
10353 return SchedModel.computeInstrLatency(&MI);
10354}
10355
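// Uniformity classification for generic (pre-selection) MIR opcodes: address
// space casts, intrinsics, loads and atomics get special handling below.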
10356 InstructionUniformity
10357 SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
10358 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10359 unsigned Opcode = MI.getOpcode();
10360
10361 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10362 Register Dst = MI.getOperand(0).getReg();
10363 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10364 : MI.getOperand(1).getReg();
10365 LLT DstTy = MRI.getType(Dst);
10366 LLT SrcTy = MRI.getType(Src);
10367 unsigned DstAS = DstTy.getAddressSpace();
10368 unsigned SrcAS = SrcTy.getAddressSpace();
10369 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10370 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10371 ST.hasGloballyAddressableScratch()
10372 ? InstructionUniformity::NeverUniform
10373 : InstructionUniformity::Default;
10374 };
10375
10376 // If the target supports globally addressable scratch, the mapping from
10377 // scratch memory to the flat aperture changes, and therefore an address space cast
10378 // is no longer uniform.
10379 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10380 return HandleAddrSpaceCast(MI);
10381
10382 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10383 auto IID = GI->getIntrinsicID();
10384 if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
10385 return InstructionUniformity::NeverUniform;
10386 if (AMDGPU::isIntrinsicAlwaysUniform(IID))
10387 return InstructionUniformity::AlwaysUniform;
10388
10389 switch (IID) {
10390 case Intrinsic::amdgcn_addrspacecast_nonnull:
10391 return HandleAddrSpaceCast(MI);
10392 case Intrinsic::amdgcn_if:
10393 case Intrinsic::amdgcn_else:
10394 // FIXME: Uniform if second result
10395 break;
10396 }
10397
10398 return InstructionUniformity::Default;
10399 }
10400
10401 // Loads from the private and flat address spaces are divergent, because
10402 // threads can execute the load instruction with the same inputs and get
10403 // different results.
10404 //
10405 // All other loads are not divergent, because if threads issue loads with the
10406 // same arguments, they will always get the same result.
10407 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10408 Opcode == AMDGPU::G_SEXTLOAD) {
10409 if (MI.memoperands_empty())
10410 return InstructionUniformity::NeverUniform; // conservative assumption
10411
10412 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10413 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10414 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10415 })) {
10416 // At least one MMO in a non-global address space.
10417 return InstructionUniformity::NeverUniform;
10418 }
10419 return InstructionUniformity::Default;
10420 }
10421
10422 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10423 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10424 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10425 AMDGPU::isGenericAtomic(Opcode)) {
10426 return InstructionUniformity::NeverUniform;
10427 }
10428 return InstructionUniformity::Default;
10429}
10430
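// Target opcodes: handle special cases first (readlane/readfirstlane, copies
// from physical SGPRs, atomics, flat/private loads), then fall back to the
// register banks of the operands.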
10431 InstructionUniformity
10432 SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
10433
10434 if (isNeverUniform(MI))
10435 return InstructionUniformity::NeverUniform;
10436
10437 unsigned opcode = MI.getOpcode();
10438 if (opcode == AMDGPU::V_READLANE_B32 ||
10439 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10440 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10441 return InstructionUniformity::AlwaysUniform;
10442
10443 if (isCopyInstr(MI)) {
10444 const MachineOperand &srcOp = MI.getOperand(1);
10445 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10446 const TargetRegisterClass *regClass =
10447 RI.getPhysRegBaseClass(srcOp.getReg());
10448 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10449 : InstructionUniformity::NeverUniform;
10450 }
10451 return InstructionUniformity::Default;
10452 }
10453
10454 // GMIR handling
10455 if (MI.isPreISelOpcode())
10456 return SIInstrInfo::getGenericInstructionUniformity(MI);
10457
10458 // Atomics are divergent because they are executed sequentially: when an
10459 // atomic operation refers to the same address in each thread, then each
10460 // thread after the first sees the value written by the previous thread as
10461 // the original value.
10462
10463 if (isAtomic(MI))
10464 return InstructionUniformity::NeverUniform;
10465
10466 // Loads from the private and flat address spaces are divergent, because
10467 // threads can execute the load instruction with the same inputs and get
10468 // different results.
10469 if (isFLAT(MI) && MI.mayLoad()) {
10470 if (MI.memoperands_empty())
10471 return InstructionUniformity::NeverUniform; // conservative assumption
10472
10473 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10474 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10475 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10476 })) {
10477 // At least one MMO in a non-global address space.
10478 return InstructionUniformity::NeverUniform;
10479 }
10480
10481 return InstructionUniformity::Default;
10482 }
10483
10484 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
10485 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10486
10487 // FIXME: It's conceptually broken to report this for an instruction, and not
10488 // a specific def operand. For inline asm in particular, there could be mixed
10489 // uniform and divergent results.
10490 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10491 const MachineOperand &SrcOp = MI.getOperand(I);
10492 if (!SrcOp.isReg())
10493 continue;
10494
10495 Register Reg = SrcOp.getReg();
10496 if (!Reg || !SrcOp.readsReg())
10497 continue;
10498
10499 // If RegBank is null, this is unassigned or an unallocatable special
10500 // register, which are all scalars.
10501 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10502 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10503 return InstructionUniformity::NeverUniform;
10504 }
10505
10506 // TODO: Uniformity check conditions above can be rearranged for more
10507 // readability.
10508
10509 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10510 // currently turned into no-op COPYs by SelectionDAG ISel and are
10511 // therefore no longer recognizable.
10512
10513 return InstructionUniformity::AlwaysUniform;
10514}
10515
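// Shader-type field value used by ds_ordered_count: 1 for PS, 2 for VS,
// 3 for GS, 0 for compute and other callable conventions.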
10516 unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
10517 switch (MF.getFunction().getCallingConv()) {
10518 case CallingConv::AMDGPU_PS:
10519 return 1;
10520 case CallingConv::AMDGPU_VS:
10521 return 2;
10522 case CallingConv::AMDGPU_GS:
10523 return 3;
10524 case CallingConv::AMDGPU_HS:
10525 case CallingConv::AMDGPU_LS:
10526 case CallingConv::AMDGPU_ES: {
10527 const Function &F = MF.getFunction();
10528 F.getContext().diagnose(DiagnosticInfoUnsupported(
10529 F, "ds_ordered_count unsupported for this calling conv"));
10530 [[fallthrough]];
10531 }
10532 case CallingConv::AMDGPU_CS:
10533 case CallingConv::AMDGPU_KERNEL:
10534 case CallingConv::C:
10535 case CallingConv::Fast:
10536 default:
10537 // Assume other calling conventions are various compute callable functions
10538 return 0;
10539 }
10540}
10541
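// Decompose a scalar compare into SrcReg, SrcReg2, CmpMask and CmpValue so
// that optimizeCompareInstr below can try to fold it into its defining
// instruction.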
10542 bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
10543 Register &SrcReg2, int64_t &CmpMask,
10544 int64_t &CmpValue) const {
10545 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10546 return false;
10547
10548 switch (MI.getOpcode()) {
10549 default:
10550 break;
10551 case AMDGPU::S_CMP_EQ_U32:
10552 case AMDGPU::S_CMP_EQ_I32:
10553 case AMDGPU::S_CMP_LG_U32:
10554 case AMDGPU::S_CMP_LG_I32:
10555 case AMDGPU::S_CMP_LT_U32:
10556 case AMDGPU::S_CMP_LT_I32:
10557 case AMDGPU::S_CMP_GT_U32:
10558 case AMDGPU::S_CMP_GT_I32:
10559 case AMDGPU::S_CMP_LE_U32:
10560 case AMDGPU::S_CMP_LE_I32:
10561 case AMDGPU::S_CMP_GE_U32:
10562 case AMDGPU::S_CMP_GE_I32:
10563 case AMDGPU::S_CMP_EQ_U64:
10564 case AMDGPU::S_CMP_LG_U64:
10565 SrcReg = MI.getOperand(0).getReg();
10566 if (MI.getOperand(1).isReg()) {
10567 if (MI.getOperand(1).getSubReg())
10568 return false;
10569 SrcReg2 = MI.getOperand(1).getReg();
10570 CmpValue = 0;
10571 } else if (MI.getOperand(1).isImm()) {
10572 SrcReg2 = Register();
10573 CmpValue = MI.getOperand(1).getImm();
10574 } else {
10575 return false;
10576 }
10577 CmpMask = ~0;
10578 return true;
10579 case AMDGPU::S_CMPK_EQ_U32:
10580 case AMDGPU::S_CMPK_EQ_I32:
10581 case AMDGPU::S_CMPK_LG_U32:
10582 case AMDGPU::S_CMPK_LG_I32:
10583 case AMDGPU::S_CMPK_LT_U32:
10584 case AMDGPU::S_CMPK_LT_I32:
10585 case AMDGPU::S_CMPK_GT_U32:
10586 case AMDGPU::S_CMPK_GT_I32:
10587 case AMDGPU::S_CMPK_LE_U32:
10588 case AMDGPU::S_CMPK_LE_I32:
10589 case AMDGPU::S_CMPK_GE_U32:
10590 case AMDGPU::S_CMPK_GE_I32:
10591 SrcReg = MI.getOperand(0).getReg();
10592 SrcReg2 = Register();
10593 CmpValue = MI.getOperand(1).getImm();
10594 CmpMask = ~0;
10595 return true;
10596 }
10597
10598 return false;
10599}
10600
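// Fold an S_CMP/S_CMPK of a single-bit S_AND result: reuse the AND's SCC
// output, or rewrite the AND as S_BITCMP0/S_BITCMP1 when its result is
// otherwise unused.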
10601 bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
10602 Register SrcReg2, int64_t CmpMask,
10603 int64_t CmpValue,
10604 const MachineRegisterInfo *MRI) const {
10605 if (!SrcReg || SrcReg.isPhysical())
10606 return false;
10607
10608 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
10609 return false;
10610
10611 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
10612 this](int64_t ExpectedValue, unsigned SrcSize,
10613 bool IsReversible, bool IsSigned) -> bool {
10614 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10615 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10616 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10617 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10618 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
10619 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10620 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10621 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10622 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10623 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
10624 //
10625 // Signed ge/gt are not used for the sign bit.
10626 //
10627 // If result of the AND is unused except in the compare:
10628 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
10629 //
10630 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10631 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10632 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
10633 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10634 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10635 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
10636
10637 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
10638 if (!Def || Def->getParent() != CmpInstr.getParent())
10639 return false;
10640
10641 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
10642 Def->getOpcode() != AMDGPU::S_AND_B64)
10643 return false;
10644
10645 int64_t Mask;
10646 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
10647 if (MO->isImm())
10648 Mask = MO->getImm();
10649 else if (!getFoldableImm(MO, Mask))
10650 return false;
10651 Mask &= maxUIntN(SrcSize);
10652 return isPowerOf2_64(Mask);
10653 };
10654
10655 MachineOperand *SrcOp = &Def->getOperand(1);
10656 if (isMask(SrcOp))
10657 SrcOp = &Def->getOperand(2);
10658 else if (isMask(&Def->getOperand(2)))
10659 SrcOp = &Def->getOperand(1);
10660 else
10661 return false;
10662
10663 // A valid Mask is required to have a single bit set, hence a non-zero and
10664 // power-of-two value. This verifies that we will not do 64-bit shift below.
10665 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
10666 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
10667 if (IsSigned && BitNo == SrcSize - 1)
10668 return false;
10669
10670 ExpectedValue <<= BitNo;
10671
10672 bool IsReversedCC = false;
10673 if (CmpValue != ExpectedValue) {
10674 if (!IsReversible)
10675 return false;
10676 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
10677 if (!IsReversedCC)
10678 return false;
10679 }
10680
10681 Register DefReg = Def->getOperand(0).getReg();
10682 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
10683 return false;
10684
10685 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
10686 I != E; ++I) {
10687 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
10688 I->killsRegister(AMDGPU::SCC, &RI))
10689 return false;
10690 }
10691
10692 MachineOperand *SccDef =
10693 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
10694 SccDef->setIsDead(false);
10695 CmpInstr.eraseFromParent();
10696
10697 if (!MRI->use_nodbg_empty(DefReg)) {
10698 assert(!IsReversedCC);
10699 return true;
10700 }
10701
10702 // Replace AND with unused result with a S_BITCMP.
10703 MachineBasicBlock *MBB = Def->getParent();
10704
10705 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
10706 : AMDGPU::S_BITCMP1_B32
10707 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
10708 : AMDGPU::S_BITCMP1_B64;
10709
10710 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
10711 .add(*SrcOp)
10712 .addImm(BitNo);
10713 Def->eraseFromParent();
10714
10715 return true;
10716 };
10717
10718 switch (CmpInstr.getOpcode()) {
10719 default:
10720 break;
10721 case AMDGPU::S_CMP_EQ_U32:
10722 case AMDGPU::S_CMP_EQ_I32:
10723 case AMDGPU::S_CMPK_EQ_U32:
10724 case AMDGPU::S_CMPK_EQ_I32:
10725 return optimizeCmpAnd(1, 32, true, false);
10726 case AMDGPU::S_CMP_GE_U32:
10727 case AMDGPU::S_CMPK_GE_U32:
10728 return optimizeCmpAnd(1, 32, false, false);
10729 case AMDGPU::S_CMP_GE_I32:
10730 case AMDGPU::S_CMPK_GE_I32:
10731 return optimizeCmpAnd(1, 32, false, true);
10732 case AMDGPU::S_CMP_EQ_U64:
10733 return optimizeCmpAnd(1, 64, true, false);
10734 case AMDGPU::S_CMP_LG_U32:
10735 case AMDGPU::S_CMP_LG_I32:
10736 case AMDGPU::S_CMPK_LG_U32:
10737 case AMDGPU::S_CMPK_LG_I32:
10738 return optimizeCmpAnd(0, 32, true, false);
10739 case AMDGPU::S_CMP_GT_U32:
10740 case AMDGPU::S_CMPK_GT_U32:
10741 return optimizeCmpAnd(0, 32, false, false);
10742 case AMDGPU::S_CMP_GT_I32:
10743 case AMDGPU::S_CMPK_GT_I32:
10744 return optimizeCmpAnd(0, 32, false, true);
10745 case AMDGPU::S_CMP_LG_U64:
10746 return optimizeCmpAnd(0, 64, true, false);
10747 }
10748
10749 return false;
10750}
10751
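// On subtargets that require even-aligned VGPR/AGPR tuples, wrap a 32-bit
// data operand into an aligned 64-bit super-register to force alignment.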
10752 void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
10753 AMDGPU::OpName OpName) const {
10754 if (!ST.needsAlignedVGPRs())
10755 return;
10756
10757 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
10758 if (OpNo < 0)
10759 return;
10760 MachineOperand &Op = MI.getOperand(OpNo);
10761 if (getOpSize(MI, OpNo) > 4)
10762 return;
10763
10764 // Add implicit aligned super-reg to force alignment on the data operand.
10765 const DebugLoc &DL = MI.getDebugLoc();
10766 MachineBasicBlock *BB = MI.getParent();
10767 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
10768 Register DataReg = Op.getReg();
10769 bool IsAGPR = RI.isAGPR(MRI, DataReg);
10770 Register Undef = MRI.createVirtualRegister(
10771 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
10772 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
10773 Register NewVR =
10774 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
10775 : &AMDGPU::VReg_64_Align2RegClass);
10776 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
10777 .addReg(DataReg, 0, Op.getSubReg())
10778 .addImm(AMDGPU::sub0)
10779 .addReg(Undef)
10780 .addImm(AMDGPU::sub1);
10781 Op.setReg(NewVR);
10782 Op.setSubReg(AMDGPU::sub0);
10783 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
10784}
10785
10786 bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
10787 if (isIGLP(*MI))
10788 return false;
10789
10790 return TargetInstrInfo::isGlobalMemoryObject(MI);
10791}
10792
10793 bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
10794 if (!isWMMA(MI) && !isSWMMAC(MI))
10795 return false;
10796
10797 if (AMDGPU::isGFX1250(ST))
10798 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
10799
10800 return true;
10801}
10802
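// Whether MI executes on the XDL pipe: DOT/WMMA on GFX12+, otherwise MFMA
// excluding DGEMM and ACCVGPR read/write (per-opcode table on GFX940+).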
10803 bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
10804 unsigned Opcode = MI.getOpcode();
10805
10806 if (AMDGPU::isGFX12Plus(ST))
10807 return isDOT(MI) || isXDLWMMA(MI);
10808
10809 if (!isMAI(MI) || isDGEMM(Opcode) ||
10810 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
10811 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
10812 return false;
10813
10814 if (!ST.hasGFX940Insts())
10815 return true;
10816
10817 return AMDGPU::getMAIIsGFX940XDL(Opcode);
10818}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MCInstrDesc &TID, unsigned RCID)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
#define LLVM_DEBUG(...)
Definition Debug.h:114
static const LaneMaskConstants & get(const GCNSubtarget &ST)
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1562
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition ArrayRef.h:150
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasAddNoCarry() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
bool mayStore() const
Return true if this instruction could possibly modify memory.
bool mayLoad() const
Return true if this instruction could possibly read memory.
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:87
uint8_t OperandType
Information about the type of the operand.
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:96
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void enterBasicBlockEnd(MachineBasicBlock &MBB)
Start tracking liveness from the end of basic block MBB.
bool isRegUsed(Register Reg, bool includeReserved=true) const
Return if a specific register is currently used.
void setRegUsed(Register Reg, LaneBitmask LaneMask=LaneBitmask::getAll())
Tell the scavenger a register is used.
void backward()
Update internal register state and move MBB iterator backwards.
void enterBasicBlock(MachineBasicBlock &MBB)
Start tracking liveness from the begin of basic block MBB.
Register scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill=true)
Make a register of the specific register class available from the current position backwards to the p...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:102
constexpr bool isValid() const
Definition Register.h:107
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:74
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given.
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instructions opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
bool mayAccessScratchThroughFlat(const MachineInstr &MI) const
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
bool isSpill(uint16_t Opcode) const
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of a s_trap 2 instructions for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI) const override
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is a instruction that moves/copies value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
static bool usesLGKM_CNT(const MachineInstr &MI)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst to fix 16bit SALU to VALU lowering.
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by the assembler.
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
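A minimal usage sketch (not code from this file), assuming the FLAT-global encoding variant from SIInstrFlags and the global address space constant from AMDGPUAS; a pass would make this query before folding an address offset into a global load or store.
#include "SIInstrInfo.h" // target-internal AMDGPU headers assumed on the include path
// Sketch: only fold Offset into the addressing mode when the subtarget accepts
// it for a FLAT global encoding.
bool canFoldGlobalOffset(const llvm::SIInstrInfo &TII, int64_t Offset) {
  return TII.isLegalFLATOffset(Offset, llvm::AMDGPUAS::GLOBAL_ADDRESS,
                               llvm::SIInstrFlags::FlatGlobal);
}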
static bool isWWMRegSpillOpcode(uint16_t Opcode)
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change SADDR form of a FLAT Inst to its VADDR form if saddr operand was moved to VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
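A minimal sketch (a hypothetical helper, not code from this file) showing the by-name operand access this provides, so callers do not depend on an operand's positional index across encodings.
#include "SIInstrInfo.h"
#include <optional>
// Hypothetical helper: return the src0 immediate of MI, if it has one.
std::optional<int64_t> getSrc0Imm(const llvm::SIInstrInfo &TII,
                                  llvm::MachineInstr &MI) {
  if (const llvm::MachineOperand *Src0 =
          TII.getNamedOperand(MI, llvm::AMDGPU::OpName::src0))
    if (Src0->isImm())
      return Src0->getImm();
  return std::nullopt;
}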
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were the OpIdx operand of MI.
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
const TargetRegisterClass * getRegClass(unsigned RCID) const
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
const TargetRegisterClass * getProperlyAlignedRC(const TargetRegisterClass *RC) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
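A sketch of the typical query, assuming a GCNSubtarget is available and that hasInv2PiInlineImm() is the accessor supplying the HasInv2Pi flag; neither name is taken from this file.
#include "SIInstrInfo.h" // pulls in GCNSubtarget and AMDGPUBaseInfo for the sketch
// Prefer an immediate only when it encodes as an inline constant, i.e. it does
// not require an extra literal dword in the instruction encoding.
bool encodesInline(int32_t Imm, const llvm::GCNSubtarget &ST) {
  return llvm::AMDGPU::isInlinableLiteral32(Imm, ST.hasInv2PiInlineImm());
}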
const uint64_t RSRC_TID_ENABLE
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point operands.
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST)
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:231
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:202
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:209
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:219
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:224
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:206
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:201
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:218
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:216
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:205
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:236
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:237
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:212
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:248
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:204
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:217
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:223
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:242
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:213
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:238
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:220
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:203
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:228
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:569
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:571
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:568
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:570
@ TI_CONSTDATA_START
Definition AMDGPU.h:567
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool isGFX1250(const MCSubtargetInfo &STI)
int getMCOpcode(uint16_t Opcode, unsigned Gen)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:72
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:70
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:71
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:62
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:73
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:477
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
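A self-contained example of the range form, which avoids spelling out begin()/end() explicitly; any_of (listed below) is the dual wrapper.
#include "llvm/ADT/STLExtras.h"
#include <array>
#include <cassert>
void allOfExample() {
  std::array<int, 4> Vals = {2, 4, 6, 8};
  // True: every element satisfies the predicate.
  assert(llvm::all_of(Vals, [](int V) { return V % 2 == 0; }));
  // False: no element is greater than 10.
  assert(!llvm::any_of(Vals, [](int V) { return V > 10; }));
}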
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition MathExtras.h:216
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
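A minimal sketch (not code from this file) of the builder pattern: create an instruction from its MCInstrDesc, then append operands; getKillRegState (listed further below) converts a bool into the corresponding register-operand flag.
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
// Hypothetical helper: emit a 32-bit VGPR-to-VGPR move before iterator I.
void emitVGPRMove(const llvm::SIInstrInfo &TII, llvm::MachineBasicBlock &MBB,
                  llvm::MachineBasicBlock::iterator I, const llvm::DebugLoc &DL,
                  llvm::Register Dst, llvm::Register Src, bool KillSrc) {
  llvm::BuildMI(MBB, I, DL, TII.get(llvm::AMDGPU::V_MOV_B32_e32), Dst)
      .addReg(Src, llvm::getKillRegState(KillSrc));
}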
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
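Self-contained compile-time checks illustrating the signed/unsigned range helpers (isInt, isUInt, maxUIntN) that branch-offset and immediate legality tests are built from.
#include "llvm/Support/MathExtras.h"
static_assert(llvm::isInt<16>(32767), "fits in a signed 16-bit field");
static_assert(!llvm::isInt<16>(32768), "one past the signed 16-bit maximum");
static_assert(llvm::isUInt<20>(0xFFFFF), "fits in an unsigned 20-bit field");
static_assert(llvm::maxUIntN(12) == 4095, "largest 12-bit unsigned value");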
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2452
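A self-contained example: enumerate pairs each element with its index, replacing manual index bookkeeping in loops.
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <array>
void enumerateExample() {
  std::array<const char *, 3> Names = {"s0", "s1", "s2"};
  for (auto En : llvm::enumerate(Names))
    llvm::outs() << En.index() << ": " << En.value() << "\n"; // 0: s0, 1: s1, 2: s2
}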
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
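A minimal sketch (not code from this file) of the canonical use: deleting the current instruction while walking a block, which would invalidate the iterator of a plain range-based for loop.
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
// Hypothetical cleanup: erase trivial self-copies from MBB.
void eraseIdentityCopies(llvm::MachineBasicBlock &MBB) {
  for (llvm::MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
    if (MI.isCopy() &&
        MI.getOperand(0).getReg() == MI.getOperand(1).getReg())
      MI.eraseFromParent(); // safe: the iterator was advanced beforehand
  }
}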
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:557
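A self-contained compile-time example; rounding an offset down to a dword or block boundary is the common use.
#include "llvm/Support/MathExtras.h"
static_assert(llvm::alignDown(1030u, 256u) == 1024u,
              "largest multiple of 256 that is <= 1030");
static_assert(llvm::alignDown(64u, 64u) == 64u, "aligned values are unchanged");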
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
Op::Description Desc
int countr_zero(T Val)
Count the number of 0's from the least significant bit towards the most significant bit, stopping at the first 1.
Definition bit.h:186
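Self-contained examples of the bit utilities listed here, pairing countr_zero with isPowerOf2_64 from just above.
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
void bitUtilExample() {
  assert(llvm::countr_zero(0x50u) == 4 && "0x50 has four trailing zero bits");
  static_assert(llvm::isPowerOf2_64(4096), "4096 is a power of two");
  static_assert(!llvm::isPowerOf2_64(4097), "4097 is not a power of two");
}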
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
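A self-contained example of splitting a 64-bit value into halves with Hi_32/Lo_32, the usual first step when a 64-bit immediate must be materialized as two 32-bit pieces.
#include "llvm/Support/MathExtras.h"
static_assert(llvm::Hi_32(0x123456789ABCDEF0ULL) == 0x12345678u, "upper half");
static_assert(llvm::Lo_32(0x123456789ABCDEF0ULL) == 0x9ABCDEF0u, "lower half");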
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
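A self-contained example, e.g. computing how many 32-bit registers a value of a given bit width occupies.
#include "llvm/Support/MathExtras.h"
static_assert(llvm::divideCeil(96, 32) == 3, "96 bits need three 32-bit registers");
static_assert(llvm::divideCeil(70, 32) == 3, "a partially used register still counts");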
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:257
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:127
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1877
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:583
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:86
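Self-contained examples of maskTrailingOnes together with SignExtend64 listed just above; extracting and then sign-extending an N-bit immediate field is the typical pairing.
#include "llvm/Support/MathExtras.h"
static_assert(llvm::maskTrailingOnes<uint32_t>(12) == 0xFFFu,
              "mask covering a 12-bit field");
static_assert(llvm::SignExtend64<21>(0x1FFFFF) == -1,
              "all ones in a 21-bit field is -1 after sign extension");
static_assert(llvm::SignExtend64<21>(0x0FFFFF) == 0x0FFFFF,
              "positive values are unchanged");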
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
GenericCycleInfo< MachineSSAContext > MachineCycleInfo
MachineCycleInfo::CycleT MachineCycle
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:219
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is live all the way through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:56
MachineInstr * top() const
Definition SIInstrInfo.h:61
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:80
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.