1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "GCNHazardRecognizer.h"
18#include "GCNSubtarget.h"
21#include "llvm/ADT/STLExtras.h"
32#include "llvm/IR/IntrinsicsAMDGPU.h"
33#include "llvm/MC/MCContext.h"
36
37using namespace llvm;
38
39#define DEBUG_TYPE "si-instr-info"
40
41#define GET_INSTRINFO_CTOR_DTOR
42#include "AMDGPUGenInstrInfo.inc"
43
44namespace llvm::AMDGPU {
45#define GET_D16ImageDimIntrinsics_IMPL
46#define GET_ImageDimIntrinsicTable_IMPL
47#define GET_RsrcIntrinsics_IMPL
48#include "AMDGPUGenSearchableTables.inc"
49} // namespace llvm::AMDGPU
50
51// Must be at least 4 to be able to branch over minimum unconditional branch
52// code. This is only for making it possible to write reasonably small tests for
53// long branches.
54static cl::opt<unsigned>
55BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
56 cl::desc("Restrict range of branch instructions (DEBUG)"));
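// Illustrative usage (not from this file): long-branch tests typically shrink
// the range with something like `llc ... -amdgpu-s-branch-bits=4` so the
// branch-relaxation paths can be exercised with small functions.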
57
58static cl::opt<bool> Fix16BitCopies(
59 "amdgpu-fix-16-bit-physreg-copies",
60 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
61 cl::init(true),
62 cl::ReallyHidden);
63
64SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
65 : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
66 RI(ST), ST(ST) {
67 SchedModel.init(&ST);
68}
69
70//===----------------------------------------------------------------------===//
71// TargetInstrInfo callbacks
72//===----------------------------------------------------------------------===//
73
74static unsigned getNumOperandsNoGlue(SDNode *Node) {
75 unsigned N = Node->getNumOperands();
76 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
77 --N;
78 return N;
79}
80
81/// Returns true if both nodes have the same value for the given
82/// operand \p Op, or if both nodes do not have this operand.
83static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
84 AMDGPU::OpName OpName) {
85 unsigned Opc0 = N0->getMachineOpcode();
86 unsigned Opc1 = N1->getMachineOpcode();
87
88 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
89 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
90
91 if (Op0Idx == -1 && Op1Idx == -1)
92 return true;
93
94
95 if ((Op0Idx == -1 && Op1Idx != -1) ||
96 (Op1Idx == -1 && Op0Idx != -1))
97 return false;
98
99 // getNamedOperandIdx returns the index for the MachineInstr's operands,
100 // which includes the result as the first operand. We are indexing into the
101 // MachineSDNode's operands, so we need to skip the result operand to get
102 // the real index.
103 --Op0Idx;
104 --Op1Idx;
105
106 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
107}
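// Worked example of the index adjustment above (illustrative): if the named
// operand sits at MachineInstr operand index 3 for both opcodes, the
// corresponding MachineSDNode operand is index 2 once the result operand is
// skipped, so getOperand(2) is what gets compared on both nodes.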
108
109static bool canRemat(const MachineInstr &MI) {
110
111 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
112 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
113 SIInstrInfo::isSALU(MI))
114 return true;
115
116 if (SIInstrInfo::isSMRD(MI)) {
117 return !MI.memoperands_empty() &&
118 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
119 return MMO->isLoad() && MMO->isInvariant();
120 });
121 }
122
123 return false;
124}
125
126bool SIInstrInfo::isReallyTriviallyReMaterializable(
127 const MachineInstr &MI) const {
128
129 if (canRemat(MI)) {
130 // Normally a VALU use of exec would block rematerialization, but an
131 // implicit exec read is fine here, since every VALU instruction has one.
132 // We really want all of the generic logic except for that restriction.
133
134 // Another potential implicit use is the mode register. The core logic of
135 // the RA will not attempt rematerialization if the mode is set anywhere
136 // in the function; otherwise it is safe, since the mode is not changed.
137
138 // This differs from the generic method, which does not allow
139 // rematerialization when there are virtual register uses. We allow those,
140 // which is why this method also covers SOP instructions.
141 if (!MI.hasImplicitDef() &&
142 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
143 !MI.mayRaiseFPException())
144 return true;
145 }
146
147 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
148}
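// Illustrative case (not exhaustive): a V_MOV_B32_e32 with an immediate source
// is a VOP1, carries only its implicit exec read, has no implicit defs and
// raises no FP exception, so it is accepted here even though the generic
// TargetInstrInfo logic would be more conservative about the exec use.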
149
150// Returns true if the scalar result of a VALU instruction depends on exec.
151bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
152 // Ignore comparisons which are only used masked with exec.
153 // This allows some hoisting/sinking of VALU comparisons.
154 if (MI.isCompare()) {
155 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
156 if (!Dst)
157 return true;
158
159 Register DstReg = Dst->getReg();
160 if (!DstReg.isVirtual())
161 return true;
162
163 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
164 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
165 switch (Use.getOpcode()) {
166 case AMDGPU::S_AND_SAVEEXEC_B32:
167 case AMDGPU::S_AND_SAVEEXEC_B64:
168 break;
169 case AMDGPU::S_AND_B32:
170 case AMDGPU::S_AND_B64:
171 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
172 return true;
173 break;
174 default:
175 return true;
176 }
177 }
178 return false;
179 }
180
181 switch (MI.getOpcode()) {
182 default:
183 break;
184 case AMDGPU::V_READFIRSTLANE_B32:
185 return true;
186 }
187
188 return false;
189}
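// Example of the intended pattern (sketch): a V_CMP_* writing a virtual
// lane-mask register that is only consumed by S_AND_B32/B64 together with exec
// (or by S_AND_SAVEEXEC_*) is treated as not depending on exec, which lets the
// compare be hoisted or sunk like any other VALU result.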
190
191bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
192 // Any implicit use of exec by VALU is not a real register read.
193 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
194 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
195}
196
197bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
198 MachineBasicBlock *SuccToSinkTo,
199 MachineCycleInfo *CI) const {
200 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
201 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
202 return true;
203
204 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
205 // Check if sinking of MI would create temporal divergent use.
206 for (auto Op : MI.uses()) {
207 if (Op.isReg() && Op.getReg().isVirtual() &&
208 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
209 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
210
211 // SgprDef defined inside cycle
212 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
213 if (FromCycle == nullptr)
214 continue;
215
216 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
217 // Check if there is a FromCycle that contains SgprDef's basic block but
218 // does not contain SuccToSinkTo and also has divergent exit condition.
219 while (FromCycle && !FromCycle->contains(ToCycle)) {
220 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
221 FromCycle->getExitingBlocks(ExitingBlocks);
222
223 // FromCycle has divergent exit condition.
224 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
225 if (hasDivergentBranch(ExitingBlock))
226 return false;
227 }
228
229 FromCycle = FromCycle->getParentCycle();
230 }
231 }
232 }
233
234 return true;
235}
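// Intuition (sketch): an SGPR value defined inside a cycle whose exit is
// divergent can take a different value on the iteration in which each lane
// leaves the loop. Sinking a use of it past such an exit would therefore
// create a temporal-divergent use, so the sink is refused above.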
236
237bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
238 int64_t &Offset0,
239 int64_t &Offset1) const {
240 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
241 return false;
242
243 unsigned Opc0 = Load0->getMachineOpcode();
244 unsigned Opc1 = Load1->getMachineOpcode();
245
246 // Make sure both are actually loads.
247 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
248 return false;
249
250 // A mayLoad instruction without a def is not a load. Likely a prefetch.
251 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
252 return false;
253
254 if (isDS(Opc0) && isDS(Opc1)) {
255
256 // FIXME: Handle this case:
257 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
258 return false;
259
260 // Check base reg.
261 if (Load0->getOperand(0) != Load1->getOperand(0))
262 return false;
263
264 // Skip read2 / write2 variants for simplicity.
265 // TODO: We should report true if the used offsets are adjacent (excluding
266 // the st64 versions).
267 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
268 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
269 if (Offset0Idx == -1 || Offset1Idx == -1)
270 return false;
271
272 // XXX - be careful of dataless loads
273 // getNamedOperandIdx returns the index for MachineInstrs. Since they
274 // include the output in the operand list, but SDNodes don't, we need to
275 // subtract the index by one.
276 Offset0Idx -= get(Opc0).NumDefs;
277 Offset1Idx -= get(Opc1).NumDefs;
278 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
279 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
280 return true;
281 }
282
283 if (isSMRD(Opc0) && isSMRD(Opc1)) {
284 // Skip time and cache invalidation instructions.
285 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
286 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
287 return false;
288
289 unsigned NumOps = getNumOperandsNoGlue(Load0);
290 if (NumOps != getNumOperandsNoGlue(Load1))
291 return false;
292
293 // Check base reg.
294 if (Load0->getOperand(0) != Load1->getOperand(0))
295 return false;
296
297 // Match register offsets, if both register and immediate offsets are present.
298 assert(NumOps == 4 || NumOps == 5);
299 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
300 return false;
301
302 const ConstantSDNode *Load0Offset =
303 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
304 const ConstantSDNode *Load1Offset =
305 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
306
307 if (!Load0Offset || !Load1Offset)
308 return false;
309
310 Offset0 = Load0Offset->getZExtValue();
311 Offset1 = Load1Offset->getZExtValue();
312 return true;
313 }
314
315 // MUBUF and MTBUF can access the same addresses.
316 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
317
318 // MUBUF and MTBUF have vaddr at different indices.
319 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
320 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
321 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
322 return false;
323
324 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
325 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
326
327 if (OffIdx0 == -1 || OffIdx1 == -1)
328 return false;
329
330 // getNamedOperandIdx returns the index for MachineInstrs. Since they
331 // include the output in the operand list, but SDNodes don't, we need to
332 // subtract the index by one.
333 OffIdx0 -= get(Opc0).NumDefs;
334 OffIdx1 -= get(Opc1).NumDefs;
335
336 SDValue Off0 = Load0->getOperand(OffIdx0);
337 SDValue Off1 = Load1->getOperand(OffIdx1);
338
339 // The offset might be a FrameIndexSDNode.
340 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
341 return false;
342
343 Offset0 = Off0->getAsZExtVal();
344 Offset1 = Off1->getAsZExtVal();
345 return true;
346 }
347
348 return false;
349}
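// Example (illustrative): two DS read nodes that share the same base operand
// and carry immediate offsets 0 and 4 report Offset0 = 0 and Offset1 = 4 here,
// which the scheduler can use to decide whether to keep them adjacent.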
350
351static bool isStride64(unsigned Opc) {
352 switch (Opc) {
353 case AMDGPU::DS_READ2ST64_B32:
354 case AMDGPU::DS_READ2ST64_B64:
355 case AMDGPU::DS_WRITE2ST64_B32:
356 case AMDGPU::DS_WRITE2ST64_B64:
357 return true;
358 default:
359 return false;
360 }
361}
362
363bool SIInstrInfo::getMemOperandsWithOffsetWidth(
364 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
365 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
366 const TargetRegisterInfo *TRI) const {
367 if (!LdSt.mayLoadOrStore())
368 return false;
369
370 unsigned Opc = LdSt.getOpcode();
371 OffsetIsScalable = false;
372 const MachineOperand *BaseOp, *OffsetOp;
373 int DataOpIdx;
374
375 if (isDS(LdSt)) {
376 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
377 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
378 if (OffsetOp) {
379 // Normal, single offset LDS instruction.
380 if (!BaseOp) {
381 // DS_CONSUME/DS_APPEND use M0 for the base address.
382 // TODO: find the implicit use operand for M0 and use that as BaseOp?
383 return false;
384 }
385 BaseOps.push_back(BaseOp);
386 Offset = OffsetOp->getImm();
387 // Get appropriate operand, and compute width accordingly.
388 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
389 if (DataOpIdx == -1)
390 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
391 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
392 Width = LocationSize::precise(64);
393 else
394 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
395 } else {
396 // The two-offset instructions use offset0 and offset1 instead. We can
397 // treat these as a single-offset load if the two offsets are consecutive.
398 // We will use this for some partially aligned loads.
399 const MachineOperand *Offset0Op =
400 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
401 const MachineOperand *Offset1Op =
402 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
403
404 unsigned Offset0 = Offset0Op->getImm() & 0xff;
405 unsigned Offset1 = Offset1Op->getImm() & 0xff;
406 if (Offset0 + 1 != Offset1)
407 return false;
408
409 // Each of these offsets is in element-sized units, so we need to convert
410 // them to the byte offsets of the individual reads.
411
412 unsigned EltSize;
413 if (LdSt.mayLoad())
414 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
415 else {
416 assert(LdSt.mayStore());
417 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
418 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
419 }
420
421 if (isStride64(Opc))
422 EltSize *= 64;
423
424 BaseOps.push_back(BaseOp);
425 Offset = EltSize * Offset0;
426 // Get appropriate operand(s), and compute width accordingly.
427 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
428 if (DataOpIdx == -1) {
429 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
430 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
431 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
432 Width = LocationSize::precise(
433 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
434 } else {
435 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
436 }
437 }
438 return true;
439 }
440
441 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
442 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
443 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
444 return false;
445 BaseOps.push_back(RSrc);
446 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
447 if (BaseOp && !BaseOp->isFI())
448 BaseOps.push_back(BaseOp);
449 const MachineOperand *OffsetImm =
450 getNamedOperand(LdSt, AMDGPU::OpName::offset);
451 Offset = OffsetImm->getImm();
452 const MachineOperand *SOffset =
453 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
454 if (SOffset) {
455 if (SOffset->isReg())
456 BaseOps.push_back(SOffset);
457 else
458 Offset += SOffset->getImm();
459 }
460 // Get appropriate operand, and compute width accordingly.
461 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
462 if (DataOpIdx == -1)
463 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
464 if (DataOpIdx == -1) // LDS DMA
465 return false;
466 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
467 return true;
468 }
469
470 if (isImage(LdSt)) {
471 auto RsrcOpName =
472 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
473 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
474 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
475 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
476 if (VAddr0Idx >= 0) {
477 // GFX10 possible NSA encoding.
478 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
479 BaseOps.push_back(&LdSt.getOperand(I));
480 } else {
481 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
482 }
483 Offset = 0;
484 // Get appropriate operand, and compute width accordingly.
485 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
486 if (DataOpIdx == -1)
487 return false; // no return sampler
488 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
489 return true;
490 }
491
492 if (isSMRD(LdSt)) {
493 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
494 if (!BaseOp) // e.g. S_MEMTIME
495 return false;
496 BaseOps.push_back(BaseOp);
497 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
498 Offset = OffsetOp ? OffsetOp->getImm() : 0;
499 // Get appropriate operand, and compute width accordingly.
500 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
501 if (DataOpIdx == -1)
502 return false;
503 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
504 return true;
505 }
506
507 if (isFLAT(LdSt)) {
508 // Instructions have either vaddr or saddr or both or none.
509 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
510 if (BaseOp)
511 BaseOps.push_back(BaseOp);
512 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
513 if (BaseOp)
514 BaseOps.push_back(BaseOp);
515 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
516 // Get appropriate operand, and compute width accordingly.
517 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
518 if (DataOpIdx == -1)
519 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
520 if (DataOpIdx == -1) // LDS DMA
521 return false;
522 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
523 return true;
524 }
525
526 return false;
527}
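// Worked example for the two-offset DS path above (illustrative): a
// ds_read2_b32 with offset0 = 4 and offset1 = 5 has a 4-byte element size, so
// it is reported as a single access at Offset = 16 bytes with a combined Width
// of 8 bytes; the st64 variants scale the element size by 64 first.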
528
529static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
530 ArrayRef<const MachineOperand *> BaseOps1,
531 const MachineInstr &MI2,
532 ArrayRef<const MachineOperand *> BaseOps2) {
533 // Only examine the first "base" operand of each instruction, on the
534 // assumption that it represents the real base address of the memory access.
535 // Other operands are typically offsets or indices from this base address.
536 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
537 return true;
538
539 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
540 return false;
541
542 auto *MO1 = *MI1.memoperands_begin();
543 auto *MO2 = *MI2.memoperands_begin();
544 if (MO1->getAddrSpace() != MO2->getAddrSpace())
545 return false;
546
547 const auto *Base1 = MO1->getValue();
548 const auto *Base2 = MO2->getValue();
549 if (!Base1 || !Base2)
550 return false;
551 Base1 = getUnderlyingObject(Base1);
552 Base2 = getUnderlyingObject(Base2);
553
554 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
555 return false;
556
557 return Base1 == Base2;
558}
559
560bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
561 int64_t Offset1, bool OffsetIsScalable1,
562 ArrayRef<const MachineOperand *> BaseOps2,
563 int64_t Offset2, bool OffsetIsScalable2,
564 unsigned ClusterSize,
565 unsigned NumBytes) const {
566 // If the mem ops (to be clustered) do not have the same base ptr, then they
567 // should not be clustered
568 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
569 if (!BaseOps1.empty() && !BaseOps2.empty()) {
570 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
571 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
572 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
573 return false;
574
575 const SIMachineFunctionInfo *MFI =
576 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
577 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
578 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
579 // If only one base op is empty, they do not have the same base ptr
580 return false;
581 }
582
583 // In order to avoid register pressure, on average, the number of DWORDs
584 // loaded together by all clustered mem ops should not exceed
585 // MaxMemoryClusterDWords. This is an empirical value based on certain
586 // observations and performance related experiments.
587 // The good thing about this heuristic is that it avoids clustering too many
588 // sub-word loads and also avoids clustering wide loads. Below is a brief
589 // summary of how the heuristic behaves for various `LoadSize` values when
590 // MaxMemoryClusterDWords is 8.
591 //
592 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
593 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
594 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
595 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
596 // (5) LoadSize >= 17: do not cluster
597 const unsigned LoadSize = NumBytes / ClusterSize;
598 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
599 return NumDWords <= MaxMemoryClusterDWords;
600}
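// Worked example (illustrative): with ClusterSize = 4 and NumBytes = 32 the
// average LoadSize is 8 bytes, so NumDWords = 2 * 4 = 8, which is allowed at
// the default limit of 8 DWORDs; a fifth such load would raise the total to
// 10 DWORDs and stop the clustering.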
601
602// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
603// the first 16 loads will be interleaved with the stores, and the next 16 will
604// be clustered as expected. It should really split into 2 16 store batches.
605//
606// Loads are clustered until this returns false, rather than trying to schedule
607// groups of stores. This also means we have to deal with saying different
608// address space loads should be clustered, and ones which might cause bank
609// conflicts.
610//
611// This might be deprecated so it might not be worth that much effort to fix.
612bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
613 int64_t Offset0, int64_t Offset1,
614 unsigned NumLoads) const {
615 assert(Offset1 > Offset0 &&
616 "Second offset should be larger than first offset!");
617 // If we have 16 or fewer loads in a row and the offsets are within 64
618 // bytes, then schedule them together.
619
620 // A cacheline is 64 bytes (for global memory).
621 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
622}
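// Example (illustrative): loads at offsets 0 and 48 fall within one 64-byte
// global-memory cacheline and are scheduled together; offsets 0 and 80 are not.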
623
624static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
625 MachineBasicBlock::iterator MI,
626 const DebugLoc &DL, MCRegister DestReg,
627 MCRegister SrcReg, bool KillSrc,
628 const char *Msg = "illegal VGPR to SGPR copy") {
629 MachineFunction *MF = MBB.getParent();
630
631 LLVMContext &C = MF->getFunction().getContext();
632 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
633
634 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
635 .addReg(SrcReg, getKillRegState(KillSrc));
636}
637
638/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
639/// possible to have a direct copy in these cases on GFX908, so an intermediate
640/// VGPR copy is required.
641static void indirectCopyToAGPR(const SIInstrInfo &TII,
642 MachineBasicBlock &MBB,
643 MachineBasicBlock::iterator MI,
644 const DebugLoc &DL, MCRegister DestReg,
645 MCRegister SrcReg, bool KillSrc,
646 RegScavenger &RS, bool RegsOverlap,
647 Register ImpDefSuperReg = Register(),
648 Register ImpUseSuperReg = Register()) {
649 assert((TII.getSubtarget().hasMAIInsts() &&
650 !TII.getSubtarget().hasGFX90AInsts()) &&
651 "Expected GFX908 subtarget.");
652
653 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
654 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
655 "Source register of the copy should be either an SGPR or an AGPR.");
656
657 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
658 "Destination register of the copy should be an AGPR.");
659
660 const SIRegisterInfo &RI = TII.getRegisterInfo();
661
662 // First try to find a defining accvgpr_write to avoid temporary registers.
663 // In the case of copies of overlapping AGPRs, we conservatively do not
664 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
665 // an accvgpr_write used for this same copy due to implicit-defs.
666 if (!RegsOverlap) {
667 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
668 --Def;
669
670 if (!Def->modifiesRegister(SrcReg, &RI))
671 continue;
672
673 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
674 Def->getOperand(0).getReg() != SrcReg)
675 break;
676
677 MachineOperand &DefOp = Def->getOperand(1);
678 assert(DefOp.isReg() || DefOp.isImm());
679
680 if (DefOp.isReg()) {
681 bool SafeToPropagate = true;
682 // Check that register source operand is not clobbered before MI.
683 // Immediate operands are always safe to propagate.
684 for (auto I = Def; I != MI && SafeToPropagate; ++I)
685 if (I->modifiesRegister(DefOp.getReg(), &RI))
686 SafeToPropagate = false;
687
688 if (!SafeToPropagate)
689 break;
690
691 for (auto I = Def; I != MI; ++I)
692 I->clearRegisterKills(DefOp.getReg(), &RI);
693 }
694
695 MachineInstrBuilder Builder =
696 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
697 .add(DefOp);
698 if (ImpDefSuperReg)
699 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
700
701 if (ImpUseSuperReg) {
702 Builder.addReg(ImpUseSuperReg,
703 getKillRegState(KillSrc) | RegState::Implicit);
704 }
705
706 return;
707 }
708 }
709
710 RS.enterBasicBlockEnd(MBB);
711 RS.backward(std::next(MI));
712
713 // Ideally we want to have three registers for a long reg_sequence copy
714 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
715 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
716 *MBB.getParent());
717
718 // Registers in the sequence are allocated contiguously so we can just
719 // use register number to pick one of three round-robin temps.
720 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
721 Register Tmp =
722 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
723 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
724 "VGPR used for an intermediate copy should have been reserved.");
725
726 // Only loop through if there are any free registers left. We don't want to
727 // spill.
728 while (RegNo--) {
729 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
730 /* RestoreAfter */ false, 0,
731 /* AllowSpill */ false);
732 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
733 break;
734 Tmp = Tmp2;
735 RS.setRegUsed(Tmp);
736 }
737
738 // Insert copy to temporary VGPR.
739 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
740 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
741 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
742 } else {
743 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
744 }
745
746 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
747 .addReg(SrcReg, getKillRegState(KillSrc));
748 if (ImpUseSuperReg) {
749 UseBuilder.addReg(ImpUseSuperReg,
750 getKillRegState(KillSrc) | RegState::Implicit);
751 }
752
753 MachineInstrBuilder DefBuilder
754 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
755 .addReg(Tmp, RegState::Kill);
756
757 if (ImpDefSuperReg)
758 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
759}
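// Note (sketch): for a long reg_sequence copy the temporary VGPR rotates
// through up to three scavenged registers, selected by (DestReg - AGPR0) % 3,
// so the v_mov_b32 feeding one accvgpr_write can hide the two wait states
// required before the next accvgpr_write.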
760
761static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
762 MachineBasicBlock::iterator MI, const DebugLoc &DL,
763 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
764 const TargetRegisterClass *RC, bool Forward) {
765 const SIRegisterInfo &RI = TII.getRegisterInfo();
766 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
767 MachineBasicBlock::iterator I = MI;
768 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
769
770 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
771 int16_t SubIdx = BaseIndices[Idx];
772 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
773 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
774 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
775 unsigned Opcode = AMDGPU::S_MOV_B32;
776
777 // Is the SGPR aligned? If so, try to combine with the next one.
778 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
779 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
780 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
781 // Can use SGPR64 copy
782 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
783 SubIdx = RI.getSubRegFromChannel(Channel, 2);
784 DestSubReg = RI.getSubReg(DestReg, SubIdx);
785 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
786 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
787 Opcode = AMDGPU::S_MOV_B64;
788 Idx++;
789 }
790
791 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
792 .addReg(SrcSubReg)
793 .addReg(SrcReg, RegState::Implicit);
794
795 if (!FirstMI)
796 FirstMI = LastMI;
797
798 if (!Forward)
799 I--;
800 }
801
802 assert(FirstMI && LastMI);
803 if (!Forward)
804 std::swap(FirstMI, LastMI);
805
806 FirstMI->addOperand(
807 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
808
809 if (KillSrc)
810 LastMI->addRegisterKilled(SrcReg, &RI);
811}
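// Example (illustrative): copying an even-aligned 128-bit tuple such as
// s[4:7] to s[12:15] is emitted as two S_MOV_B64 instructions rather than four
// S_MOV_B32, because both halves of each pair are 64-bit aligned.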
812
813void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
814 MachineBasicBlock::iterator MI,
815 const DebugLoc &DL, Register DestReg,
816 Register SrcReg, bool KillSrc, bool RenamableDest,
817 bool RenamableSrc) const {
818 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
819 unsigned Size = RI.getRegSizeInBits(*RC);
820 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
821 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
822
823 // The rest of copyPhysReg assumes Src and Dst size are the same size.
824 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
825 // we remove Fix16BitCopies and this code block?
826 if (Fix16BitCopies) {
827 if (((Size == 16) != (SrcSize == 16))) {
828 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
829 assert(ST.useRealTrue16Insts());
830 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
831 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
832 RegToFix = SubReg;
833
834 if (DestReg == SrcReg) {
835 // Identity copy. Insert empty bundle since ExpandPostRA expects an
836 // instruction here.
837 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
838 return;
839 }
840 RC = RI.getPhysRegBaseClass(DestReg);
841 Size = RI.getRegSizeInBits(*RC);
842 SrcRC = RI.getPhysRegBaseClass(SrcReg);
843 SrcSize = RI.getRegSizeInBits(*SrcRC);
844 }
845 }
846
847 if (RC == &AMDGPU::VGPR_32RegClass) {
848 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
849 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
850 AMDGPU::AGPR_32RegClass.contains(SrcReg));
851 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
852 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
853 BuildMI(MBB, MI, DL, get(Opc), DestReg)
854 .addReg(SrcReg, getKillRegState(KillSrc));
855 return;
856 }
857
858 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
859 RC == &AMDGPU::SReg_32RegClass) {
860 if (SrcReg == AMDGPU::SCC) {
861 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
862 .addImm(1)
863 .addImm(0);
864 return;
865 }
866
867 if (DestReg == AMDGPU::VCC_LO) {
868 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
869 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
870 .addReg(SrcReg, getKillRegState(KillSrc));
871 } else {
872 // FIXME: Hack until VReg_1 removed.
873 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
874 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
875 .addImm(0)
876 .addReg(SrcReg, getKillRegState(KillSrc));
877 }
878
879 return;
880 }
881
882 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
883 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
884 return;
885 }
886
887 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
888 .addReg(SrcReg, getKillRegState(KillSrc));
889 return;
890 }
891
892 if (RC == &AMDGPU::SReg_64RegClass) {
893 if (SrcReg == AMDGPU::SCC) {
894 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
895 .addImm(1)
896 .addImm(0);
897 return;
898 }
899
900 if (DestReg == AMDGPU::VCC) {
901 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
902 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
903 .addReg(SrcReg, getKillRegState(KillSrc));
904 } else {
905 // FIXME: Hack until VReg_1 removed.
906 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
907 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
908 .addImm(0)
909 .addReg(SrcReg, getKillRegState(KillSrc));
910 }
911
912 return;
913 }
914
915 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
916 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
917 return;
918 }
919
920 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
921 .addReg(SrcReg, getKillRegState(KillSrc));
922 return;
923 }
924
925 if (DestReg == AMDGPU::SCC) {
926 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
927 // but SelectionDAG emits such copies for i1 sources.
928 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
929 // This copy can only be produced by patterns
930 // with explicit SCC, which are known to be enabled
931 // only for subtargets with S_CMP_LG_U64 present.
932 assert(ST.hasScalarCompareEq64());
933 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
934 .addReg(SrcReg, getKillRegState(KillSrc))
935 .addImm(0);
936 } else {
937 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
938 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
939 .addReg(SrcReg, getKillRegState(KillSrc))
940 .addImm(0);
941 }
942
943 return;
944 }
945
946 if (RC == &AMDGPU::AGPR_32RegClass) {
947 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
948 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
949 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
950 .addReg(SrcReg, getKillRegState(KillSrc));
951 return;
952 }
953
954 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
955 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
956 .addReg(SrcReg, getKillRegState(KillSrc));
957 return;
958 }
959
960 // FIXME: Pass should maintain scavenger to avoid scan through the block on
961 // every AGPR spill.
962 RegScavenger RS;
963 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
964 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
965 return;
966 }
967
968 if (Size == 16) {
969 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
970 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
971 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
972
973 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
974 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
975 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
976 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
977 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
978 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
979 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
980 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
981
982 if (IsSGPRDst) {
983 if (!IsSGPRSrc) {
984 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
985 return;
986 }
987
988 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
989 .addReg(NewSrcReg, getKillRegState(KillSrc));
990 return;
991 }
992
993 if (IsAGPRDst || IsAGPRSrc) {
994 if (!DstLow || !SrcLow) {
995 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
996 "Cannot use hi16 subreg with an AGPR!");
997 }
998
999 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
1000 return;
1001 }
1002
1003 if (ST.useRealTrue16Insts()) {
1004 if (IsSGPRSrc) {
1005 assert(SrcLow);
1006 SrcReg = NewSrcReg;
1007 }
1008 // Use the smaller instruction encoding if possible.
1009 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
1010 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1011 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1012 .addReg(SrcReg);
1013 } else {
1014 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1015 .addImm(0) // src0_modifiers
1016 .addReg(SrcReg)
1017 .addImm(0); // op_sel
1018 }
1019 return;
1020 }
1021
1022 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1023 if (!DstLow || !SrcLow) {
1024 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1025 "Cannot use hi16 subreg on VI!");
1026 }
1027
1028 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1029 .addReg(NewSrcReg, getKillRegState(KillSrc));
1030 return;
1031 }
1032
1033 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1034 .addImm(0) // src0_modifiers
1035 .addReg(NewSrcReg)
1036 .addImm(0) // clamp
1037 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1038 : AMDGPU::SDWA::SdwaSel::WORD_1)
1039 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1040 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1041 : AMDGPU::SDWA::SdwaSel::WORD_1)
1042 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1043 // First implicit operand is $exec.
1044 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1045 return;
1046 }
1047
1048 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1049 if (ST.hasMovB64()) {
1050 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1051 .addReg(SrcReg, getKillRegState(KillSrc));
1052 return;
1053 }
1054 if (ST.hasPkMovB32()) {
1055 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1056 .addImm(SISrcMods::OP_SEL_1)
1057 .addReg(SrcReg)
1058 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1059 .addReg(SrcReg)
1060 .addImm(0) // op_sel_lo
1061 .addImm(0) // op_sel_hi
1062 .addImm(0) // neg_lo
1063 .addImm(0) // neg_hi
1064 .addImm(0) // clamp
1065 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1066 return;
1067 }
1068 }
1069
1070 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1071 if (RI.isSGPRClass(RC)) {
1072 if (!RI.isSGPRClass(SrcRC)) {
1073 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1074 return;
1075 }
1076 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1077 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1078 Forward);
1079 return;
1080 }
1081
1082 unsigned EltSize = 4;
1083 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1084 if (RI.isAGPRClass(RC)) {
1085 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1086 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1087 else if (RI.hasVGPRs(SrcRC) ||
1088 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1089 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1090 else
1091 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1092 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1093 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1094 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1095 (RI.isProperlyAlignedRC(*RC) &&
1096 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1097 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1098 if (ST.hasMovB64()) {
1099 Opcode = AMDGPU::V_MOV_B64_e32;
1100 EltSize = 8;
1101 } else if (ST.hasPkMovB32()) {
1102 Opcode = AMDGPU::V_PK_MOV_B32;
1103 EltSize = 8;
1104 }
1105 }
1106
1107 // For the cases where we need an intermediate instruction/temporary register
1108 // (destination is an AGPR), we need a scavenger.
1109 //
1110 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1111 // whole block for every handled copy.
1112 std::unique_ptr<RegScavenger> RS;
1113 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1114 RS = std::make_unique<RegScavenger>();
1115
1116 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1117
1118 // If there is an overlap, we can't kill the super-register on the last
1119 // instruction, since it will also kill the components made live by this def.
1120 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1121 const bool CanKillSuperReg = KillSrc && !Overlap;
1122
1123 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1124 unsigned SubIdx;
1125 if (Forward)
1126 SubIdx = SubIndices[Idx];
1127 else
1128 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1129 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1130 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1131 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1132
1133 bool IsFirstSubreg = Idx == 0;
1134 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1135
1136 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1137 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1138 Register ImpUseSuper = SrcReg;
1139 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1140 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1141 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1142 MachineInstrBuilder MIB =
1143 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1144 .addImm(SISrcMods::OP_SEL_1)
1145 .addReg(SrcSubReg)
1146 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1147 .addReg(SrcSubReg)
1148 .addImm(0) // op_sel_lo
1149 .addImm(0) // op_sel_hi
1150 .addImm(0) // neg_lo
1151 .addImm(0) // neg_hi
1152 .addImm(0) // clamp
1153 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1154 if (IsFirstSubreg)
1155 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1156 } else {
1157 MachineInstrBuilder Builder =
1158 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1159 if (IsFirstSubreg)
1160 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1161
1162 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1163 }
1164 }
1165}
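// Summary example (illustrative): a 256-bit VGPR-to-VGPR copy is split into
// eight V_MOV_B32_e32 pieces (or four V_MOV_B64_e32 / V_PK_MOV_B32 pieces on
// subtargets that have them), with the destination super-register implicitly
// defined on the first piece and the source killed only on the last one.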
1166
1167int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1168 int NewOpc;
1169
1170 // Try to map original to commuted opcode
1171 NewOpc = AMDGPU::getCommuteRev(Opcode);
1172 if (NewOpc != -1)
1173 // Check if the commuted (REV) opcode exists on the target.
1174 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1175
1176 // Try to map commuted to original opcode
1177 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1178 if (NewOpc != -1)
1179 // Check if the original (non-REV) opcode exists on the target.
1180 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1181
1182 return Opcode;
1183}
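// Example (illustrative): commuting an opcode with asymmetric operand rules
// maps it to its _REV form and back, e.g. a sub <-> subrev pair, but only when
// pseudoToMCOpcode confirms the mapped opcode actually exists on the target.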
1184
1185const TargetRegisterClass *
1187 return &AMDGPU::VGPR_32RegClass;
1188}
1189
1190void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1191 MachineBasicBlock::iterator I,
1192 const DebugLoc &DL, Register DstReg,
1193 ArrayRef<MachineOperand> Cond,
1194 Register TrueReg,
1195 Register FalseReg) const {
1196 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1197 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1198 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1199 "Not a VGPR32 reg");
1200
1201 if (Cond.size() == 1) {
1202 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1203 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1204 .add(Cond[0]);
1205 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1206 .addImm(0)
1207 .addReg(FalseReg)
1208 .addImm(0)
1209 .addReg(TrueReg)
1210 .addReg(SReg);
1211 } else if (Cond.size() == 2) {
1212 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1213 switch (Cond[0].getImm()) {
1214 case SIInstrInfo::SCC_TRUE: {
1215 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1216 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1217 : AMDGPU::S_CSELECT_B64), SReg)
1218 .addImm(1)
1219 .addImm(0);
1220 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1221 .addImm(0)
1222 .addReg(FalseReg)
1223 .addImm(0)
1224 .addReg(TrueReg)
1225 .addReg(SReg);
1226 break;
1227 }
1228 case SIInstrInfo::SCC_FALSE: {
1229 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1230 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1231 : AMDGPU::S_CSELECT_B64), SReg)
1232 .addImm(0)
1233 .addImm(1);
1234 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1235 .addImm(0)
1236 .addReg(FalseReg)
1237 .addImm(0)
1238 .addReg(TrueReg)
1239 .addReg(SReg);
1240 break;
1241 }
1242 case SIInstrInfo::VCCNZ: {
1243 MachineOperand RegOp = Cond[1];
1244 RegOp.setImplicit(false);
1245 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1246 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1247 .add(RegOp);
1248 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1249 .addImm(0)
1250 .addReg(FalseReg)
1251 .addImm(0)
1252 .addReg(TrueReg)
1253 .addReg(SReg);
1254 break;
1255 }
1256 case SIInstrInfo::VCCZ: {
1257 MachineOperand RegOp = Cond[1];
1258 RegOp.setImplicit(false);
1259 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1260 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1261 .add(RegOp);
1262 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1263 .addImm(0)
1264 .addReg(TrueReg)
1265 .addImm(0)
1266 .addReg(FalseReg)
1267 .addReg(SReg);
1268 break;
1269 }
1270 case SIInstrInfo::EXECNZ: {
1271 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1272 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1273 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1274 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1275 .addImm(0);
1276 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1277 : AMDGPU::S_CSELECT_B64), SReg)
1278 .addImm(1)
1279 .addImm(0);
1280 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1281 .addImm(0)
1282 .addReg(FalseReg)
1283 .addImm(0)
1284 .addReg(TrueReg)
1285 .addReg(SReg);
1286 break;
1287 }
1288 case SIInstrInfo::EXECZ: {
1289 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1290 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1291 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1292 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1293 .addImm(0);
1294 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1295 : AMDGPU::S_CSELECT_B64), SReg)
1296 .addImm(0)
1297 .addImm(1);
1298 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1299 .addImm(0)
1300 .addReg(FalseReg)
1301 .addImm(0)
1302 .addReg(TrueReg)
1303 .addReg(SReg);
1304 llvm_unreachable("Unhandled branch predicate EXECZ");
1305 break;
1306 }
1307 default:
1308 llvm_unreachable("invalid branch predicate");
1309 }
1310 } else {
1311 llvm_unreachable("Can only handle Cond size 1 or 2");
1312 }
1313}
1314
1315Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1316 MachineBasicBlock::iterator I,
1317 const DebugLoc &DL,
1318 Register SrcReg, int Value) const {
1319 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1320 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1321 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1322 .addImm(Value)
1323 .addReg(SrcReg);
1324
1325 return Reg;
1326}
1327
1328Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1329 MachineBasicBlock::iterator I,
1330 const DebugLoc &DL,
1331 Register SrcReg, int Value) const {
1332 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1333 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1334 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1335 .addImm(Value)
1336 .addReg(SrcReg);
1337
1338 return Reg;
1339}
1340
1341bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1342 const Register Reg,
1343 int64_t &ImmVal) const {
1344 switch (MI.getOpcode()) {
1345 case AMDGPU::V_MOV_B32_e32:
1346 case AMDGPU::S_MOV_B32:
1347 case AMDGPU::S_MOVK_I32:
1348 case AMDGPU::S_MOV_B64:
1349 case AMDGPU::V_MOV_B64_e32:
1350 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1351 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1352 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1353 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1354 case AMDGPU::V_MOV_B64_PSEUDO: {
1355 const MachineOperand &Src0 = MI.getOperand(1);
1356 if (Src0.isImm()) {
1357 ImmVal = Src0.getImm();
1358 return MI.getOperand(0).getReg() == Reg;
1359 }
1360
1361 return false;
1362 }
1363 case AMDGPU::S_BREV_B32:
1364 case AMDGPU::V_BFREV_B32_e32:
1365 case AMDGPU::V_BFREV_B32_e64: {
1366 const MachineOperand &Src0 = MI.getOperand(1);
1367 if (Src0.isImm()) {
1368 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1369 return MI.getOperand(0).getReg() == Reg;
1370 }
1371
1372 return false;
1373 }
1374 case AMDGPU::S_NOT_B32:
1375 case AMDGPU::V_NOT_B32_e32:
1376 case AMDGPU::V_NOT_B32_e64: {
1377 const MachineOperand &Src0 = MI.getOperand(1);
1378 if (Src0.isImm()) {
1379 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1380 return MI.getOperand(0).getReg() == Reg;
1381 }
1382
1383 return false;
1384 }
1385 default:
1386 return false;
1387 }
1388}
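// Example (illustrative): S_MOV_B32 $sgpr0, 42 reports ImmVal = 42 for $sgpr0,
// while S_NOT_B32 and S_BREV_B32 with immediate sources report the bitwise
// complement and the bit-reversed value of the immediate, respectively.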
1389
1390unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1391
1392 if (RI.isAGPRClass(DstRC))
1393 return AMDGPU::COPY;
1394 if (RI.getRegSizeInBits(*DstRC) == 16) {
1395 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1396 // before RA.
1397 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1398 }
1399 if (RI.getRegSizeInBits(*DstRC) == 32)
1400 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1401 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1402 return AMDGPU::S_MOV_B64;
1403 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1404 return AMDGPU::V_MOV_B64_PSEUDO;
1405 return AMDGPU::COPY;
1406}
1407
1408const MCInstrDesc &
1409SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1410 bool IsIndirectSrc) const {
1411 if (IsIndirectSrc) {
1412 if (VecSize <= 32) // 4 bytes
1413 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1414 if (VecSize <= 64) // 8 bytes
1415 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1416 if (VecSize <= 96) // 12 bytes
1417 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1418 if (VecSize <= 128) // 16 bytes
1419 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1420 if (VecSize <= 160) // 20 bytes
1421 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1422 if (VecSize <= 256) // 32 bytes
1423 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1424 if (VecSize <= 288) // 36 bytes
1425 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1426 if (VecSize <= 320) // 40 bytes
1427 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1428 if (VecSize <= 352) // 44 bytes
1429 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1430 if (VecSize <= 384) // 48 bytes
1431 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1432 if (VecSize <= 512) // 64 bytes
1433 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1434 if (VecSize <= 1024) // 128 bytes
1435 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1436
1437 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1438 }
1439
1440 if (VecSize <= 32) // 4 bytes
1441 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1442 if (VecSize <= 64) // 8 bytes
1443 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1444 if (VecSize <= 96) // 12 bytes
1445 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1446 if (VecSize <= 128) // 16 bytes
1447 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1448 if (VecSize <= 160) // 20 bytes
1449 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1450 if (VecSize <= 256) // 32 bytes
1451 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1452 if (VecSize <= 288) // 36 bytes
1453 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1454 if (VecSize <= 320) // 40 bytes
1455 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1456 if (VecSize <= 352) // 44 bytes
1457 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1458 if (VecSize <= 384) // 48 bytes
1459 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1460 if (VecSize <= 512) // 64 bytes
1461 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1462 if (VecSize <= 1024) // 128 bytes
1463 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1464
1465 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1466}
1467
1468static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1469 if (VecSize <= 32) // 4 bytes
1470 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1471 if (VecSize <= 64) // 8 bytes
1472 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1473 if (VecSize <= 96) // 12 bytes
1474 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1475 if (VecSize <= 128) // 16 bytes
1476 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1477 if (VecSize <= 160) // 20 bytes
1478 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1479 if (VecSize <= 256) // 32 bytes
1480 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1481 if (VecSize <= 288) // 36 bytes
1482 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1483 if (VecSize <= 320) // 40 bytes
1484 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1485 if (VecSize <= 352) // 44 bytes
1486 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1487 if (VecSize <= 384) // 48 bytes
1488 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1489 if (VecSize <= 512) // 64 bytes
1490 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1491 if (VecSize <= 1024) // 128 bytes
1492 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1493
1494 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1495}
1496
1497static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1498 if (VecSize <= 32) // 4 bytes
1499 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1500 if (VecSize <= 64) // 8 bytes
1501 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1502 if (VecSize <= 96) // 12 bytes
1503 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1504 if (VecSize <= 128) // 16 bytes
1505 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1506 if (VecSize <= 160) // 20 bytes
1507 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1508 if (VecSize <= 256) // 32 bytes
1509 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1510 if (VecSize <= 288) // 36 bytes
1511 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1512 if (VecSize <= 320) // 40 bytes
1513 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1514 if (VecSize <= 352) // 44 bytes
1515 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1516 if (VecSize <= 384) // 48 bytes
1517 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1518 if (VecSize <= 512) // 64 bytes
1519 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1520 if (VecSize <= 1024) // 128 bytes
1521 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1522
1523 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1524}
1525
1526static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1527 if (VecSize <= 64) // 8 bytes
1528 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1529 if (VecSize <= 128) // 16 bytes
1530 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1531 if (VecSize <= 256) // 32 bytes
1532 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1533 if (VecSize <= 512) // 64 bytes
1534 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1535 if (VecSize <= 1024) // 128 bytes
1536 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1537
1538 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1539}
1540
1541const MCInstrDesc &
1542SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1543 bool IsSGPR) const {
1544 if (IsSGPR) {
1545 switch (EltSize) {
1546 case 32:
1547 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1548 case 64:
1549 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1550 default:
1551 llvm_unreachable("invalid reg indexing elt size");
1552 }
1553 }
1554
1555 assert(EltSize == 32 && "invalid reg indexing elt size");
1556 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1557}
1558
1559static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1560 switch (Size) {
1561 case 4:
1562 return AMDGPU::SI_SPILL_S32_SAVE;
1563 case 8:
1564 return AMDGPU::SI_SPILL_S64_SAVE;
1565 case 12:
1566 return AMDGPU::SI_SPILL_S96_SAVE;
1567 case 16:
1568 return AMDGPU::SI_SPILL_S128_SAVE;
1569 case 20:
1570 return AMDGPU::SI_SPILL_S160_SAVE;
1571 case 24:
1572 return AMDGPU::SI_SPILL_S192_SAVE;
1573 case 28:
1574 return AMDGPU::SI_SPILL_S224_SAVE;
1575 case 32:
1576 return AMDGPU::SI_SPILL_S256_SAVE;
1577 case 36:
1578 return AMDGPU::SI_SPILL_S288_SAVE;
1579 case 40:
1580 return AMDGPU::SI_SPILL_S320_SAVE;
1581 case 44:
1582 return AMDGPU::SI_SPILL_S352_SAVE;
1583 case 48:
1584 return AMDGPU::SI_SPILL_S384_SAVE;
1585 case 64:
1586 return AMDGPU::SI_SPILL_S512_SAVE;
1587 case 128:
1588 return AMDGPU::SI_SPILL_S1024_SAVE;
1589 default:
1590 llvm_unreachable("unknown register size");
1591 }
1592}
1593
1594static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1595 switch (Size) {
1596 case 2:
1597 return AMDGPU::SI_SPILL_V16_SAVE;
1598 case 4:
1599 return AMDGPU::SI_SPILL_V32_SAVE;
1600 case 8:
1601 return AMDGPU::SI_SPILL_V64_SAVE;
1602 case 12:
1603 return AMDGPU::SI_SPILL_V96_SAVE;
1604 case 16:
1605 return AMDGPU::SI_SPILL_V128_SAVE;
1606 case 20:
1607 return AMDGPU::SI_SPILL_V160_SAVE;
1608 case 24:
1609 return AMDGPU::SI_SPILL_V192_SAVE;
1610 case 28:
1611 return AMDGPU::SI_SPILL_V224_SAVE;
1612 case 32:
1613 return AMDGPU::SI_SPILL_V256_SAVE;
1614 case 36:
1615 return AMDGPU::SI_SPILL_V288_SAVE;
1616 case 40:
1617 return AMDGPU::SI_SPILL_V320_SAVE;
1618 case 44:
1619 return AMDGPU::SI_SPILL_V352_SAVE;
1620 case 48:
1621 return AMDGPU::SI_SPILL_V384_SAVE;
1622 case 64:
1623 return AMDGPU::SI_SPILL_V512_SAVE;
1624 case 128:
1625 return AMDGPU::SI_SPILL_V1024_SAVE;
1626 default:
1627 llvm_unreachable("unknown register size");
1628 }
1629}
1630
1631static unsigned getAVSpillSaveOpcode(unsigned Size) {
1632 switch (Size) {
1633 case 4:
1634 return AMDGPU::SI_SPILL_AV32_SAVE;
1635 case 8:
1636 return AMDGPU::SI_SPILL_AV64_SAVE;
1637 case 12:
1638 return AMDGPU::SI_SPILL_AV96_SAVE;
1639 case 16:
1640 return AMDGPU::SI_SPILL_AV128_SAVE;
1641 case 20:
1642 return AMDGPU::SI_SPILL_AV160_SAVE;
1643 case 24:
1644 return AMDGPU::SI_SPILL_AV192_SAVE;
1645 case 28:
1646 return AMDGPU::SI_SPILL_AV224_SAVE;
1647 case 32:
1648 return AMDGPU::SI_SPILL_AV256_SAVE;
1649 case 36:
1650 return AMDGPU::SI_SPILL_AV288_SAVE;
1651 case 40:
1652 return AMDGPU::SI_SPILL_AV320_SAVE;
1653 case 44:
1654 return AMDGPU::SI_SPILL_AV352_SAVE;
1655 case 48:
1656 return AMDGPU::SI_SPILL_AV384_SAVE;
1657 case 64:
1658 return AMDGPU::SI_SPILL_AV512_SAVE;
1659 case 128:
1660 return AMDGPU::SI_SPILL_AV1024_SAVE;
1661 default:
1662 llvm_unreachable("unknown register size");
1663 }
1664}
1665
1666static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1667 bool IsVectorSuperClass) {
1668 // Currently, only 32-bit WWM register spills are needed.
1669 if (Size != 4)
1670 llvm_unreachable("unknown wwm register spill size");
1671
1672 if (IsVectorSuperClass)
1673 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1674
1675 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1676}
1677
1678unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1679 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1680 const SIMachineFunctionInfo &MFI) const {
1681 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1682
1683 // Choose the right opcode if spilling a WWM register.
1684 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1685 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1686
1687 // TODO: Check if AGPRs are available
1688 if (ST.hasMAIInsts())
1689 return getAVSpillSaveOpcode(Size);
1690
1691 return getVGPRSpillSaveOpcode(Size);
1692}
1693
1694void SIInstrInfo::storeRegToStackSlot(
1695 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1696 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1697 const TargetRegisterInfo *TRI, Register VReg,
1698 MachineInstr::MIFlag Flags) const {
1699 MachineFunction *MF = MBB.getParent();
1700 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1701 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1702 const DebugLoc &DL = MBB.findDebugLoc(MI);
1703
1704 MachinePointerInfo PtrInfo
1705 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1706 MachineMemOperand *MMO = MF->getMachineMemOperand(
1707 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1708 FrameInfo.getObjectAlign(FrameIndex));
1709 unsigned SpillSize = TRI->getSpillSize(*RC);
1711 MachineRegisterInfo &MRI = MF->getRegInfo();
1712 if (RI.isSGPRClass(RC)) {
1713 MFI->setHasSpilledSGPRs();
1714 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1715 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1716 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1717
1718 // We are only allowed to create one new instruction when spilling
1719 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1720 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1721
1722 // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
1723 // to make sure we are using the correct register class.
1724 if (SrcReg.isVirtual() && SpillSize == 4) {
1725 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1726 }
1727
1728 BuildMI(MBB, MI, DL, OpDesc)
1729 .addReg(SrcReg, getKillRegState(isKill)) // data
1730 .addFrameIndex(FrameIndex) // addr
1731 .addMemOperand(MMO)
1733
1734 if (RI.spillSGPRToVGPR())
1735 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1736 return;
1737 }
1738
1739 unsigned Opcode =
1740 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1741 MFI->setHasSpilledVGPRs();
1742
1743 BuildMI(MBB, MI, DL, get(Opcode))
1744 .addReg(SrcReg, getKillRegState(isKill)) // data
1745 .addFrameIndex(FrameIndex) // addr
1746 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1747 .addImm(0) // offset
1748 .addMemOperand(MMO);
1749}
1750
1751static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1752 switch (Size) {
1753 case 4:
1754 return AMDGPU::SI_SPILL_S32_RESTORE;
1755 case 8:
1756 return AMDGPU::SI_SPILL_S64_RESTORE;
1757 case 12:
1758 return AMDGPU::SI_SPILL_S96_RESTORE;
1759 case 16:
1760 return AMDGPU::SI_SPILL_S128_RESTORE;
1761 case 20:
1762 return AMDGPU::SI_SPILL_S160_RESTORE;
1763 case 24:
1764 return AMDGPU::SI_SPILL_S192_RESTORE;
1765 case 28:
1766 return AMDGPU::SI_SPILL_S224_RESTORE;
1767 case 32:
1768 return AMDGPU::SI_SPILL_S256_RESTORE;
1769 case 36:
1770 return AMDGPU::SI_SPILL_S288_RESTORE;
1771 case 40:
1772 return AMDGPU::SI_SPILL_S320_RESTORE;
1773 case 44:
1774 return AMDGPU::SI_SPILL_S352_RESTORE;
1775 case 48:
1776 return AMDGPU::SI_SPILL_S384_RESTORE;
1777 case 64:
1778 return AMDGPU::SI_SPILL_S512_RESTORE;
1779 case 128:
1780 return AMDGPU::SI_SPILL_S1024_RESTORE;
1781 default:
1782 llvm_unreachable("unknown register size");
1783 }
1784}
1785
1786static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1787 switch (Size) {
1788 case 2:
1789 return AMDGPU::SI_SPILL_V16_RESTORE;
1790 case 4:
1791 return AMDGPU::SI_SPILL_V32_RESTORE;
1792 case 8:
1793 return AMDGPU::SI_SPILL_V64_RESTORE;
1794 case 12:
1795 return AMDGPU::SI_SPILL_V96_RESTORE;
1796 case 16:
1797 return AMDGPU::SI_SPILL_V128_RESTORE;
1798 case 20:
1799 return AMDGPU::SI_SPILL_V160_RESTORE;
1800 case 24:
1801 return AMDGPU::SI_SPILL_V192_RESTORE;
1802 case 28:
1803 return AMDGPU::SI_SPILL_V224_RESTORE;
1804 case 32:
1805 return AMDGPU::SI_SPILL_V256_RESTORE;
1806 case 36:
1807 return AMDGPU::SI_SPILL_V288_RESTORE;
1808 case 40:
1809 return AMDGPU::SI_SPILL_V320_RESTORE;
1810 case 44:
1811 return AMDGPU::SI_SPILL_V352_RESTORE;
1812 case 48:
1813 return AMDGPU::SI_SPILL_V384_RESTORE;
1814 case 64:
1815 return AMDGPU::SI_SPILL_V512_RESTORE;
1816 case 128:
1817 return AMDGPU::SI_SPILL_V1024_RESTORE;
1818 default:
1819 llvm_unreachable("unknown register size");
1820 }
1821}
1822
1823static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1824 switch (Size) {
1825 case 4:
1826 return AMDGPU::SI_SPILL_AV32_RESTORE;
1827 case 8:
1828 return AMDGPU::SI_SPILL_AV64_RESTORE;
1829 case 12:
1830 return AMDGPU::SI_SPILL_AV96_RESTORE;
1831 case 16:
1832 return AMDGPU::SI_SPILL_AV128_RESTORE;
1833 case 20:
1834 return AMDGPU::SI_SPILL_AV160_RESTORE;
1835 case 24:
1836 return AMDGPU::SI_SPILL_AV192_RESTORE;
1837 case 28:
1838 return AMDGPU::SI_SPILL_AV224_RESTORE;
1839 case 32:
1840 return AMDGPU::SI_SPILL_AV256_RESTORE;
1841 case 36:
1842 return AMDGPU::SI_SPILL_AV288_RESTORE;
1843 case 40:
1844 return AMDGPU::SI_SPILL_AV320_RESTORE;
1845 case 44:
1846 return AMDGPU::SI_SPILL_AV352_RESTORE;
1847 case 48:
1848 return AMDGPU::SI_SPILL_AV384_RESTORE;
1849 case 64:
1850 return AMDGPU::SI_SPILL_AV512_RESTORE;
1851 case 128:
1852 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1853 default:
1854 llvm_unreachable("unknown register size");
1855 }
1856}
1857
1858static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1859 bool IsVectorSuperClass) {
1860 // Currently, only 32-bit WWM register spills are needed.
1861 if (Size != 4)
1862 llvm_unreachable("unknown wwm register spill size");
1863
1864 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1865 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1866
1867 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1868}
1869
1871 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1872 const SIMachineFunctionInfo &MFI) const {
1873 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1874
1875 // Choose the right opcode if restoring a WWM register.
1877 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1878
1879 // TODO: Check if AGPRs are available
1880 if (ST.hasMAIInsts())
1881 return getAVSpillRestoreOpcode(Size);
1882
1883 assert(!RI.isAGPRClass(RC));
1884 return getVGPRSpillRestoreOpcode(Size);
1885}
1886
1889 Register DestReg, int FrameIndex,
1890 const TargetRegisterClass *RC,
1891 const TargetRegisterInfo *TRI,
1892 Register VReg,
1893 MachineInstr::MIFlag Flags) const {
1894 MachineFunction *MF = MBB.getParent();
1896 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1897 const DebugLoc &DL = MBB.findDebugLoc(MI);
1898 unsigned SpillSize = TRI->getSpillSize(*RC);
1899
1900 MachinePointerInfo PtrInfo
1901 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1902
1904 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1905 FrameInfo.getObjectAlign(FrameIndex));
1906
1907 if (RI.isSGPRClass(RC)) {
1908 MFI->setHasSpilledSGPRs();
1909 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1910 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1911 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1912
1913 // FIXME: Maybe this should not include a memoperand because it will be
1914 // lowered to non-memory instructions.
1915 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1916 if (DestReg.isVirtual() && SpillSize == 4) {
1918 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1919 }
1920
1921 if (RI.spillSGPRToVGPR())
1922 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1923 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1924 .addFrameIndex(FrameIndex) // addr
1925 .addMemOperand(MMO)
1927
1928 return;
1929 }
1930
1931 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1932 SpillSize, *MFI);
1933 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1934 .addFrameIndex(FrameIndex) // vaddr
1935 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1936 .addImm(0) // offset
1937 .addMemOperand(MMO);
1938}
1939
1944
1947 unsigned Quantity) const {
1948 DebugLoc DL = MBB.findDebugLoc(MI);
1949 while (Quantity > 0) {
1950 unsigned Arg = std::min(Quantity, 8u);
1951 Quantity -= Arg;
1952 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1953 }
1954}
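// E.g. a request for 10 wait states is emitted as s_nop 7 (8 wait states)
// followed by s_nop 1 (2 wait states), since each s_nop covers at most 8.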
1955
1957 auto *MF = MBB.getParent();
1958 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1959
1960 assert(Info->isEntryFunction());
1961
1962 if (MBB.succ_empty()) {
1963 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1964 if (HasNoTerminator) {
1965 if (Info->returnsVoid()) {
1966 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1967 } else {
1968 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1969 }
1970 }
1971 }
1972}
1973
1977 const DebugLoc &DL) const {
1978 MachineFunction *MF = MBB.getParent();
1979 constexpr unsigned DoorbellIDMask = 0x3ff;
1980 constexpr unsigned ECQueueWaveAbort = 0x400;
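// Roughly: the S_SENDMSG_RTN_B32 result below is masked to its low 10
// doorbell-ID bits (DoorbellIDMask), bit 10 (ECQueueWaveAbort) is set on top,
// and the value is written to M0 for the S_SENDMSG; TTMP2 saves and restores
// the original M0 around that sequence.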
1981
1982 MachineBasicBlock *TrapBB = &MBB;
1983 MachineBasicBlock *ContBB = &MBB;
1984 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1985
1986 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1987 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1988 TrapBB = MF->CreateMachineBasicBlock();
1989 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1990 MF->push_back(TrapBB);
1991 MBB.addSuccessor(TrapBB);
1992 }
1993
1994 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
1995 // this will be a nop.
1996 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
1997 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
1998 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1999 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2000 DoorbellReg)
2002 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2003 .addUse(AMDGPU::M0);
2004 Register DoorbellRegMasked =
2005 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2006 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2007 .addUse(DoorbellReg)
2008 .addImm(DoorbellIDMask);
2009 Register SetWaveAbortBit =
2010 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2011 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2012 .addUse(DoorbellRegMasked)
2013 .addImm(ECQueueWaveAbort);
2014 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2015 .addUse(SetWaveAbortBit);
2016 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2018 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2019 .addUse(AMDGPU::TTMP2);
2020 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2021 TrapBB->addSuccessor(HaltLoopBB);
2022
2023 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2024 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2025 .addMBB(HaltLoopBB);
2026 MF->push_back(HaltLoopBB);
2027 HaltLoopBB->addSuccessor(HaltLoopBB);
2028
2029 return ContBB;
2030}
2031
2033 switch (MI.getOpcode()) {
2034 default:
2035 if (MI.isMetaInstruction())
2036 return 0;
2037 return 1; // FIXME: Do wait states equal cycles?
2038
2039 case AMDGPU::S_NOP:
2040 return MI.getOperand(0).getImm() + 1;
2041 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2042 // hazard, even if one exists, won't really be visible. Should we handle it?
2043 }
2044}
2045
2046bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2047 MachineBasicBlock &MBB = *MI.getParent();
2048 DebugLoc DL = MBB.findDebugLoc(MI);
2049 switch (MI.getOpcode()) {
2050 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2051 case AMDGPU::S_MOV_B64_term:
2052 // This is only a terminator to get the correct spill code placement during
2053 // register allocation.
2054 MI.setDesc(get(AMDGPU::S_MOV_B64));
2055 break;
2056
2057 case AMDGPU::S_MOV_B32_term:
2058 // This is only a terminator to get the correct spill code placement during
2059 // register allocation.
2060 MI.setDesc(get(AMDGPU::S_MOV_B32));
2061 break;
2062
2063 case AMDGPU::S_XOR_B64_term:
2064 // This is only a terminator to get the correct spill code placement during
2065 // register allocation.
2066 MI.setDesc(get(AMDGPU::S_XOR_B64));
2067 break;
2068
2069 case AMDGPU::S_XOR_B32_term:
2070 // This is only a terminator to get the correct spill code placement during
2071 // register allocation.
2072 MI.setDesc(get(AMDGPU::S_XOR_B32));
2073 break;
2074 case AMDGPU::S_OR_B64_term:
2075 // This is only a terminator to get the correct spill code placement during
2076 // register allocation.
2077 MI.setDesc(get(AMDGPU::S_OR_B64));
2078 break;
2079 case AMDGPU::S_OR_B32_term:
2080 // This is only a terminator to get the correct spill code placement during
2081 // register allocation.
2082 MI.setDesc(get(AMDGPU::S_OR_B32));
2083 break;
2084
2085 case AMDGPU::S_ANDN2_B64_term:
2086 // This is only a terminator to get the correct spill code placement during
2087 // register allocation.
2088 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2089 break;
2090
2091 case AMDGPU::S_ANDN2_B32_term:
2092 // This is only a terminator to get the correct spill code placement during
2093 // register allocation.
2094 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2095 break;
2096
2097 case AMDGPU::S_AND_B64_term:
2098 // This is only a terminator to get the correct spill code placement during
2099 // register allocation.
2100 MI.setDesc(get(AMDGPU::S_AND_B64));
2101 break;
2102
2103 case AMDGPU::S_AND_B32_term:
2104 // This is only a terminator to get the correct spill code placement during
2105 // register allocation.
2106 MI.setDesc(get(AMDGPU::S_AND_B32));
2107 break;
2108
2109 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2110 // This is only a terminator to get the correct spill code placement during
2111 // register allocation.
2112 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2113 break;
2114
2115 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2116 // This is only a terminator to get the correct spill code placement during
2117 // register allocation.
2118 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2119 break;
2120
2121 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2122 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2123 break;
2124
2125 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2126 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2127 MI.getMF()->getRegInfo().constrainRegClass(MI.getOperand(0).getReg(),
2128 &AMDGPU::SReg_32_XM0RegClass);
2129 break;
2130 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2131 Register Dst = MI.getOperand(0).getReg();
2132 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2133 MI.setDesc(
2134 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2135 break;
2136 }
2137 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2138 Register Dst = MI.getOperand(0).getReg();
2139 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2140 int64_t Imm = MI.getOperand(1).getImm();
2141
2142 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2143 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2144 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2147 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2148 .addImm(SignExtend64<32>(Imm >> 32))
2150 MI.eraseFromParent();
2151 break;
2152 }
2153
2154 [[fallthrough]];
2155 }
2156 case AMDGPU::V_MOV_B64_PSEUDO: {
2157 Register Dst = MI.getOperand(0).getReg();
2158 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2159 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2160
2161 const MachineOperand &SrcOp = MI.getOperand(1);
2162 // FIXME: Will this work for 64-bit floating point immediates?
2163 assert(!SrcOp.isFPImm());
2164 if (ST.hasMovB64()) {
2165 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2166 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2167 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2168 break;
2169 }
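// A sketch of the split below: without a usable 64-bit move, e.g. the
// immediate 0x3ff0000000000000 (double 1.0) becomes
//   v_mov_b32 dst.sub0, 0x00000000
//   v_mov_b32 dst.sub1, 0x3ff00000
// unless both halves are equal and inline-encodable, in which case a single
// v_pk_mov_b32 is used on subtargets that have it.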
2170 if (SrcOp.isImm()) {
2171 APInt Imm(64, SrcOp.getImm());
2172 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2173 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2174 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2175 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2177 .addImm(Lo.getSExtValue())
2179 .addImm(Lo.getSExtValue())
2180 .addImm(0) // op_sel_lo
2181 .addImm(0) // op_sel_hi
2182 .addImm(0) // neg_lo
2183 .addImm(0) // neg_hi
2184 .addImm(0); // clamp
2185 } else {
2186 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2187 .addImm(Lo.getSExtValue())
2189 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2190 .addImm(Hi.getSExtValue())
2192 }
2193 } else {
2194 assert(SrcOp.isReg());
2195 if (ST.hasPkMovB32() &&
2196 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2197 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2198 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2199 .addReg(SrcOp.getReg())
2201 .addReg(SrcOp.getReg())
2202 .addImm(0) // op_sel_lo
2203 .addImm(0) // op_sel_hi
2204 .addImm(0) // neg_lo
2205 .addImm(0) // neg_hi
2206 .addImm(0); // clamp
2207 } else {
2208 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2209 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2211 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2212 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2214 }
2215 }
2216 MI.eraseFromParent();
2217 break;
2218 }
2219 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2221 break;
2222 }
2223 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2224 const MachineOperand &SrcOp = MI.getOperand(1);
2225 assert(!SrcOp.isFPImm());
2226
2227 if (ST.has64BitLiterals()) {
2228 MI.setDesc(get(AMDGPU::S_MOV_B64));
2229 break;
2230 }
2231
2232 APInt Imm(64, SrcOp.getImm());
2233 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2234 MI.setDesc(get(AMDGPU::S_MOV_B64));
2235 break;
2236 }
2237
2238 Register Dst = MI.getOperand(0).getReg();
2239 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2240 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2241
2242 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2243 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2244 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2245 .addImm(Lo.getSExtValue())
2247 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2248 .addImm(Hi.getSExtValue())
2250 MI.eraseFromParent();
2251 break;
2252 }
2253 case AMDGPU::V_SET_INACTIVE_B32: {
2254 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2255 Register DstReg = MI.getOperand(0).getReg();
2256 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2257 .add(MI.getOperand(3))
2258 .add(MI.getOperand(4))
2259 .add(MI.getOperand(1))
2260 .add(MI.getOperand(2))
2261 .add(MI.getOperand(5));
2262 MI.eraseFromParent();
2263 break;
2264 }
2265 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2266 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2267 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2268 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2269 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2270 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2271 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2272 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2273 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2274 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2275 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2276 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2277 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2278 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2279 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2280 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2281 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2282 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2283 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2284 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2285 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2286 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2287 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2288 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2289 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2290 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2291 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2292 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2293 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2294 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2295
2296 unsigned Opc;
2297 if (RI.hasVGPRs(EltRC)) {
2298 Opc = AMDGPU::V_MOVRELD_B32_e32;
2299 } else {
2300 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2301 : AMDGPU::S_MOVRELD_B32;
2302 }
2303
2304 const MCInstrDesc &OpDesc = get(Opc);
2305 Register VecReg = MI.getOperand(0).getReg();
2306 bool IsUndef = MI.getOperand(1).isUndef();
2307 unsigned SubReg = MI.getOperand(3).getImm();
2308 assert(VecReg == MI.getOperand(1).getReg());
2309
2311 BuildMI(MBB, MI, DL, OpDesc)
2312 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2313 .add(MI.getOperand(2))
2315 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2316
2317 const int ImpDefIdx =
2318 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2319 const int ImpUseIdx = ImpDefIdx + 1;
2320 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2321 MI.eraseFromParent();
2322 break;
2323 }
2324 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2325 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2326 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2327 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2328 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2329 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2330 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2331 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2332 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2333 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2334 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2335 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2336 assert(ST.useVGPRIndexMode());
2337 Register VecReg = MI.getOperand(0).getReg();
2338 bool IsUndef = MI.getOperand(1).isUndef();
2339 MachineOperand &Idx = MI.getOperand(3);
2340 Register SubReg = MI.getOperand(4).getImm();
2341
2342 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2343 .add(Idx)
2345 SetOn->getOperand(3).setIsUndef();
2346
2347 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2349 BuildMI(MBB, MI, DL, OpDesc)
2350 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2351 .add(MI.getOperand(2))
2353 .addReg(VecReg,
2354 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2355
2356 const int ImpDefIdx =
2357 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2358 const int ImpUseIdx = ImpDefIdx + 1;
2359 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2360
2361 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2362
2363 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2364
2365 MI.eraseFromParent();
2366 break;
2367 }
2368 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2369 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2370 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2371 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2372 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2373 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2374 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2375 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2376 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2377 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2378 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2379 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2380 assert(ST.useVGPRIndexMode());
2381 Register Dst = MI.getOperand(0).getReg();
2382 Register VecReg = MI.getOperand(1).getReg();
2383 bool IsUndef = MI.getOperand(1).isUndef();
2384 Register Idx = MI.getOperand(2).getReg();
2385 Register SubReg = MI.getOperand(3).getImm();
2386
2387 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2388 .addReg(Idx)
2390 SetOn->getOperand(3).setIsUndef();
2391
2392 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2393 .addDef(Dst)
2394 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2395 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2396
2397 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2398
2399 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2400
2401 MI.eraseFromParent();
2402 break;
2403 }
2404 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2405 MachineFunction &MF = *MBB.getParent();
2406 Register Reg = MI.getOperand(0).getReg();
2407 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2408 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2409 MachineOperand OpLo = MI.getOperand(1);
2410 MachineOperand OpHi = MI.getOperand(2);
2411
2412 // Create a bundle so these instructions won't be re-ordered by the
2413 // post-RA scheduler.
2414 MIBundleBuilder Bundler(MBB, MI);
2415 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2416
2417 // What we want here is an offset from the value returned by s_getpc (which
2418 // is the address of the s_add_u32 instruction) to the global variable, but
2419 // since the encoding of $symbol starts 4 bytes after the start of the
2420 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2421 // small. This requires us to add 4 to the global variable offset in order
2422 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2423 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2424 // instruction.
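// Concretely (a sketch, addresses relative to the s_add_u32 that s_getpc_b64
// returns):
//   +0:  s_add_u32  reglo, reglo, $symbol   ; 32-bit literal encoded at +4
//   +8:  s_addc_u32 reghi, reghi, $symbol   ; 32-bit literal encoded at +12
// which is where the +4 and +12 (plus Adjust) corrections below come from.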
2425
2426 int64_t Adjust = 0;
2427 if (ST.hasGetPCZeroExtension()) {
2428 // Fix up hardware that does not sign-extend the 48-bit PC value by
2429 // inserting: s_sext_i32_i16 reghi, reghi
2430 Bundler.append(
2431 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2432 Adjust += 4;
2433 }
2434
2435 if (OpLo.isGlobal())
2436 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2437 Bundler.append(
2438 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2439
2440 if (OpHi.isGlobal())
2441 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2442 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2443 .addReg(RegHi)
2444 .add(OpHi));
2445
2446 finalizeBundle(MBB, Bundler.begin());
2447
2448 MI.eraseFromParent();
2449 break;
2450 }
2451 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2452 MachineFunction &MF = *MBB.getParent();
2453 Register Reg = MI.getOperand(0).getReg();
2454 MachineOperand Op = MI.getOperand(1);
2455
2456 // Create a bundle so these instructions won't be re-ordered by the
2457 // post-RA scheduler.
2458 MIBundleBuilder Bundler(MBB, MI);
2459 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2460 if (Op.isGlobal())
2461 Op.setOffset(Op.getOffset() + 4);
2462 Bundler.append(
2463 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2464
2465 finalizeBundle(MBB, Bundler.begin());
2466
2467 MI.eraseFromParent();
2468 break;
2469 }
2470 case AMDGPU::ENTER_STRICT_WWM: {
2471 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2472 // Whole Wave Mode is entered.
2473 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2474 : AMDGPU::S_OR_SAVEEXEC_B64));
2475 break;
2476 }
2477 case AMDGPU::ENTER_STRICT_WQM: {
2478 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2479 // STRICT_WQM is entered.
2480 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2481 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2482 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2483 BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2484 BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2485
2486 MI.eraseFromParent();
2487 break;
2488 }
2489 case AMDGPU::EXIT_STRICT_WWM:
2490 case AMDGPU::EXIT_STRICT_WQM: {
2491 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2492 // WWM/STRICT_WQM is exited.
2493 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2494 break;
2495 }
2496 case AMDGPU::SI_RETURN: {
2497 const MachineFunction *MF = MBB.getParent();
2498 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2499 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2500 // Hiding the return address use with SI_RETURN may lead to extra kills in
2501 // the function and missing live-ins. We are fine in practice because callee
2502 // saved register handling ensures the register value is restored before
2503 // RET, but we need the undef flag here to appease the MachineVerifier
2504 // liveness checks.
2506 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2507 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2508
2509 MIB.copyImplicitOps(MI);
2510 MI.eraseFromParent();
2511 break;
2512 }
2513
2514 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2515 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2516 MI.setDesc(get(AMDGPU::S_MUL_U64));
2517 break;
2518
2519 case AMDGPU::S_GETPC_B64_pseudo:
2520 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2521 if (ST.hasGetPCZeroExtension()) {
2522 Register Dst = MI.getOperand(0).getReg();
2523 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2524 // Fix up hardware that does not sign-extend the 48-bit PC value by
2525 // inserting: s_sext_i32_i16 dsthi, dsthi
2526 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2527 DstHi)
2528 .addReg(DstHi);
2529 }
2530 break;
2531
2532 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2533 assert(ST.hasBF16PackedInsts());
2534 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2535 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2536 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2537 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2538 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2539 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2540 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2541 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2542 break;
2543 }
2544
2545 return true;
2546}
2547
2550 unsigned SubIdx, const MachineInstr &Orig,
2551 const TargetRegisterInfo &RI) const {
2552
2553 // Try shrinking the instruction to remat only the part needed for the
2554 // current context.
2555 // TODO: Handle more cases.
2556 unsigned Opcode = Orig.getOpcode();
2557 switch (Opcode) {
2558 case AMDGPU::S_LOAD_DWORDX16_IMM:
2559 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2560 if (SubIdx != 0)
2561 break;
2562
2563 if (I == MBB.end())
2564 break;
2565
2566 if (I->isBundled())
2567 break;
2568
2569 // Look for a single use of the register that is also a subreg.
2570 Register RegToFind = Orig.getOperand(0).getReg();
2571 MachineOperand *UseMO = nullptr;
2572 for (auto &CandMO : I->operands()) {
2573 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2574 continue;
2575 if (UseMO) {
2576 UseMO = nullptr;
2577 break;
2578 }
2579 UseMO = &CandMO;
2580 }
2581 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2582 break;
2583
2584 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2585 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2586
2587 MachineFunction *MF = MBB.getParent();
2589 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2590
2591 unsigned NewOpcode = -1;
2592 if (SubregSize == 256)
2593 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2594 else if (SubregSize == 128)
2595 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2596 else
2597 break;
2598
2599 const MCInstrDesc &TID = get(NewOpcode);
2600 const TargetRegisterClass *NewRC =
2601 RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
2602 MRI.setRegClass(DestReg, NewRC);
2603
2604 UseMO->setReg(DestReg);
2605 UseMO->setSubReg(AMDGPU::NoSubRegister);
2606
2607 // Use a smaller load with the desired size, possibly with updated offset.
2608 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2609 MI->setDesc(TID);
2610 MI->getOperand(0).setReg(DestReg);
2611 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2612 if (Offset) {
2613 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2614 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2615 OffsetMO->setImm(FinalOffset);
2616 }
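// For example, rematerializing just the upper 128-bit half of an
// S_LOAD_DWORDX8_IMM result as an S_LOAD_DWORDX4_IMM adds Offset/8 =
// 128/8 = 16 bytes to the immediate offset, since Offset is in bits.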
2618 for (const MachineMemOperand *MemOp : Orig.memoperands())
2619 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2620 SubregSize / 8));
2621 MI->setMemRefs(*MF, NewMMOs);
2622
2623 MBB.insert(I, MI);
2624 return;
2625 }
2626
2627 default:
2628 break;
2629 }
2630
2631 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2632}
2633
2634std::pair<MachineInstr*, MachineInstr*>
2636 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2637
2638 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2640 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2641 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2642 return std::pair(&MI, nullptr);
2643 }
2644
2645 MachineBasicBlock &MBB = *MI.getParent();
2646 DebugLoc DL = MBB.findDebugLoc(MI);
2647 MachineFunction *MF = MBB.getParent();
2649 Register Dst = MI.getOperand(0).getReg();
2650 unsigned Part = 0;
2651 MachineInstr *Split[2];
2652
2653 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2654 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2655 if (Dst.isPhysical()) {
2656 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2657 } else {
2658 assert(MRI.isSSA());
2659 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2660 MovDPP.addDef(Tmp);
2661 }
2662
2663 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2664 const MachineOperand &SrcOp = MI.getOperand(I);
2665 assert(!SrcOp.isFPImm());
2666 if (SrcOp.isImm()) {
2667 APInt Imm(64, SrcOp.getImm());
2668 Imm.ashrInPlace(Part * 32);
2669 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2670 } else {
2671 assert(SrcOp.isReg());
2672 Register Src = SrcOp.getReg();
2673 if (Src.isPhysical())
2674 MovDPP.addReg(RI.getSubReg(Src, Sub));
2675 else
2676 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2677 }
2678 }
2679
2680 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2681 MovDPP.addImm(MO.getImm());
2682
2683 Split[Part] = MovDPP;
2684 ++Part;
2685 }
2686
2687 if (Dst.isVirtual())
2688 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2689 .addReg(Split[0]->getOperand(0).getReg())
2690 .addImm(AMDGPU::sub0)
2691 .addReg(Split[1]->getOperand(0).getReg())
2692 .addImm(AMDGPU::sub1);
2693
2694 MI.eraseFromParent();
2695 return std::pair(Split[0], Split[1]);
2696}
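// In short: when the pseudo cannot be kept as a single 64-bit DPP mov, it is
// split into two v_mov_b32_dpp on sub0/sub1, and for a virtual destination
// the halves are recombined with a REG_SEQUENCE.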
2697
2698std::optional<DestSourcePair>
2700 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2701 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2702
2703 return std::nullopt;
2704}
2705
2707 AMDGPU::OpName Src0OpName,
2708 MachineOperand &Src1,
2709 AMDGPU::OpName Src1OpName) const {
2710 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2711 if (!Src0Mods)
2712 return false;
2713
2714 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2715 assert(Src1Mods &&
2716 "All commutable instructions have both src0 and src1 modifiers");
2717
2718 int Src0ModsVal = Src0Mods->getImm();
2719 int Src1ModsVal = Src1Mods->getImm();
2720
2721 Src1Mods->setImm(Src0ModsVal);
2722 Src0Mods->setImm(Src1ModsVal);
2723 return true;
2724}
2725
2727 MachineOperand &RegOp,
2728 MachineOperand &NonRegOp) {
2729 Register Reg = RegOp.getReg();
2730 unsigned SubReg = RegOp.getSubReg();
2731 bool IsKill = RegOp.isKill();
2732 bool IsDead = RegOp.isDead();
2733 bool IsUndef = RegOp.isUndef();
2734 bool IsDebug = RegOp.isDebug();
2735
2736 if (NonRegOp.isImm())
2737 RegOp.ChangeToImmediate(NonRegOp.getImm());
2738 else if (NonRegOp.isFI())
2739 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2740 else if (NonRegOp.isGlobal()) {
2741 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2742 NonRegOp.getTargetFlags());
2743 } else
2744 return nullptr;
2745
2746 // Make sure we don't reinterpret a subreg index in the target flags.
2747 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2748
2749 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2750 NonRegOp.setSubReg(SubReg);
2751
2752 return &MI;
2753}
2754
2756 MachineOperand &NonRegOp1,
2757 MachineOperand &NonRegOp2) {
2758 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2759 int64_t NonRegVal = NonRegOp1.getImm();
2760
2761 NonRegOp1.setImm(NonRegOp2.getImm());
2762 NonRegOp2.setImm(NonRegVal);
2763 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2764 NonRegOp2.setTargetFlags(TargetFlags);
2765 return &MI;
2766}
2767
2768bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2769 unsigned OpIdx1) const {
2770 const MCInstrDesc &InstDesc = MI.getDesc();
2771 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2772 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2773
2774 unsigned Opc = MI.getOpcode();
2775 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2776
2777 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2778 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2779
2780 // The swap must not breach constant bus or literal limits. It may move a
2781 // literal to a position other than src0, which is not allowed pre-gfx10;
2782 // however, most test cases need literals in src0 for VOP.
2783 // FIXME: After gfx9, a literal can be placed somewhere other than src0.
2784 if (isVALU(MI)) {
2785 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2786 !isInlineConstant(MO0, OpInfo1))
2787 return false;
2788 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2789 !isInlineConstant(MO1, OpInfo0))
2790 return false;
2791 }
2792
2793 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2794 if (OpInfo1.RegClass == -1)
2795 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2796 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2797 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2798 }
2799 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2800 if (OpInfo0.RegClass == -1)
2801 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2802 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2803 isLegalRegOperand(MI, OpIdx0, MO1);
2804 }
2805
2806 // No need to check 64-bit literals, since swapping does not bring new
2807 // 64-bit literals into the current instruction to fold to 32-bit.
2808
2809 return isImmOperandLegal(MI, OpIdx1, MO0);
2810}
2811
2813 unsigned Src0Idx,
2814 unsigned Src1Idx) const {
2815 assert(!NewMI && "this should never be used");
2816
2817 unsigned Opc = MI.getOpcode();
2818 int CommutedOpcode = commuteOpcode(Opc);
2819 if (CommutedOpcode == -1)
2820 return nullptr;
2821
2822 if (Src0Idx > Src1Idx)
2823 std::swap(Src0Idx, Src1Idx);
2824
2825 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2826 static_cast<int>(Src0Idx) &&
2827 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2828 static_cast<int>(Src1Idx) &&
2829 "inconsistency with findCommutedOpIndices");
2830
2831 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2832 return nullptr;
2833
2834 MachineInstr *CommutedMI = nullptr;
2835 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2836 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2837 if (Src0.isReg() && Src1.isReg()) {
2838 // Be sure to copy the source modifiers to the right place.
2839 CommutedMI =
2840 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2841 } else if (Src0.isReg() && !Src1.isReg()) {
2842 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2843 } else if (!Src0.isReg() && Src1.isReg()) {
2844 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2845 } else if (Src0.isImm() && Src1.isImm()) {
2846 CommutedMI = swapImmOperands(MI, Src0, Src1);
2847 } else {
2848 // FIXME: Found two non-register operands to commute. This does happen.
2849 return nullptr;
2850 }
2851
2852 if (CommutedMI) {
2853 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2854 Src1, AMDGPU::OpName::src1_modifiers);
2855
2856 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2857 AMDGPU::OpName::src1_sel);
2858
2859 CommutedMI->setDesc(get(CommutedOpcode));
2860 }
2861
2862 return CommutedMI;
2863}
2864
2865// This needs to be implemented because the source modifiers may be inserted
2866// between the true commutable operands, and the base
2867// TargetInstrInfo::commuteInstruction uses it.
2869 unsigned &SrcOpIdx0,
2870 unsigned &SrcOpIdx1) const {
2871 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2872}
2873
2875 unsigned &SrcOpIdx0,
2876 unsigned &SrcOpIdx1) const {
2877 if (!Desc.isCommutable())
2878 return false;
2879
2880 unsigned Opc = Desc.getOpcode();
2881 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2882 if (Src0Idx == -1)
2883 return false;
2884
2885 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2886 if (Src1Idx == -1)
2887 return false;
2888
2889 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2890}
2891
2893 int64_t BrOffset) const {
2894 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2895 // because its dest block is unanalyzable.
2896 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2897
2898 // Convert to dwords.
2899 BrOffset /= 4;
2900
2901 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2902 // from the next instruction.
2903 BrOffset -= 1;
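// E.g. a branch whose target is the very next instruction (4 bytes ahead)
// encodes SIMM16 = 0, and an 8-byte-forward branch encodes SIMM16 = 1.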
2904
2905 return isIntN(BranchOffsetBits, BrOffset);
2906}
2907
2910 return MI.getOperand(0).getMBB();
2911}
2912
2914 for (const MachineInstr &MI : MBB->terminators()) {
2915 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2916 MI.getOpcode() == AMDGPU::SI_LOOP)
2917 return true;
2918 }
2919 return false;
2920}
2921
2923 MachineBasicBlock &DestBB,
2924 MachineBasicBlock &RestoreBB,
2925 const DebugLoc &DL, int64_t BrOffset,
2926 RegScavenger *RS) const {
2927 assert(MBB.empty() &&
2928 "new block should be inserted for expanding unconditional branch");
2929 assert(MBB.pred_size() == 1);
2930 assert(RestoreBB.empty() &&
2931 "restore block should be inserted for restoring clobbered registers");
2932
2933 MachineFunction *MF = MBB.getParent();
2936 auto I = MBB.end();
2937 auto &MCCtx = MF->getContext();
2938
2939 if (ST.hasAddPC64Inst()) {
2940 MCSymbol *Offset =
2941 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
2942 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
2944 MCSymbol *PostAddPCLabel =
2945 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
2946 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
2947 auto *OffsetExpr = MCBinaryExpr::createSub(
2948 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2949 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
2950 Offset->setVariableValue(OffsetExpr);
2951 return;
2952 }
2953
2954 assert(RS && "RegScavenger required for long branching");
2955
2956 // FIXME: Virtual register workaround for RegScavenger not working with empty
2957 // blocks.
2958 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2959
2960 // Note: as this is used after the hazard recognizer, we need to apply some
2961 // hazard workarounds directly.
2962 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2963 ST.hasVALUReadSGPRHazard();
2964 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2965 if (FlushSGPRWrites)
2966 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2968 };
2969
2970 // We need to compute the offset relative to the instruction immediately after
2971 // s_getpc_b64. Insert the PC arithmetic code before the last terminator.
2972 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2973 ApplyHazardWorkarounds();
2974
2975 MCSymbol *PostGetPCLabel =
2976 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2977 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2978
2979 MCSymbol *OffsetLo =
2980 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2981 MCSymbol *OffsetHi =
2982 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2983 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2984 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2985 .addReg(PCReg, 0, AMDGPU::sub0)
2986 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2987 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2988 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2989 .addReg(PCReg, 0, AMDGPU::sub1)
2990 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2991 ApplyHazardWorkarounds();
2992
2993 // Insert the indirect branch after the other terminator.
2994 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2995 .addReg(PCReg);
2996
2997 // If a spill is needed for the pc register pair, we need to insert a spill
2998 // restore block right before the destination block, and insert a short branch
2999 // into the old destination block's fallthrough predecessor.
3000 // e.g.:
3001 //
3002 // s_cbranch_scc0 skip_long_branch:
3003 //
3004 // long_branch_bb:
3005 // spill s[8:9]
3006 // s_getpc_b64 s[8:9]
3007 // s_add_u32 s8, s8, restore_bb
3008 // s_addc_u32 s9, s9, 0
3009 // s_setpc_b64 s[8:9]
3010 //
3011 // skip_long_branch:
3012 // foo;
3013 //
3014 // .....
3015 //
3016 // dest_bb_fallthrough_predecessor:
3017 // bar;
3018 // s_branch dest_bb
3019 //
3020 // restore_bb:
3021 // restore s[8:9]
3022 // fallthrough dest_bb
3023 //
3024 // dest_bb:
3025 // buzz;
3026
3027 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3028 Register Scav;
3029
3030 // If we've previously reserved a register for long branches, avoid running
3031 // the scavenger and just use that register.
3032 if (LongBranchReservedReg) {
3033 RS->enterBasicBlock(MBB);
3034 Scav = LongBranchReservedReg;
3035 } else {
3037 Scav = RS->scavengeRegisterBackwards(
3038 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3039 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3040 }
3041 if (Scav) {
3042 RS->setRegUsed(Scav);
3043 MRI.replaceRegWith(PCReg, Scav);
3044 MRI.clearVirtRegs();
3045 } else {
3046 // As spilling an SGPR needs a VGPR, we reuse the slot of the temporary VGPR
3047 // for the SGPR spill.
3048 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3049 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3050 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3051 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3052 MRI.clearVirtRegs();
3053 }
3054
3055 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3056 // Now the distance can be defined.
3058 MCSymbolRefExpr::create(DestLabel, MCCtx),
3059 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3060 // Add offset assignments.
3061 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3062 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3063 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3064 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3065}
3066
3067unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3068 switch (Cond) {
3069 case SIInstrInfo::SCC_TRUE:
3070 return AMDGPU::S_CBRANCH_SCC1;
3071 case SIInstrInfo::SCC_FALSE:
3072 return AMDGPU::S_CBRANCH_SCC0;
3073 case SIInstrInfo::VCCNZ:
3074 return AMDGPU::S_CBRANCH_VCCNZ;
3075 case SIInstrInfo::VCCZ:
3076 return AMDGPU::S_CBRANCH_VCCZ;
3077 case SIInstrInfo::EXECNZ:
3078 return AMDGPU::S_CBRANCH_EXECNZ;
3079 case SIInstrInfo::EXECZ:
3080 return AMDGPU::S_CBRANCH_EXECZ;
3081 default:
3082 llvm_unreachable("invalid branch predicate");
3083 }
3084}
3085
3086SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3087 switch (Opcode) {
3088 case AMDGPU::S_CBRANCH_SCC0:
3089 return SCC_FALSE;
3090 case AMDGPU::S_CBRANCH_SCC1:
3091 return SCC_TRUE;
3092 case AMDGPU::S_CBRANCH_VCCNZ:
3093 return VCCNZ;
3094 case AMDGPU::S_CBRANCH_VCCZ:
3095 return VCCZ;
3096 case AMDGPU::S_CBRANCH_EXECNZ:
3097 return EXECNZ;
3098 case AMDGPU::S_CBRANCH_EXECZ:
3099 return EXECZ;
3100 default:
3101 return INVALID_BR;
3102 }
3103}
3104
3108 MachineBasicBlock *&FBB,
3110 bool AllowModify) const {
3111 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3112 // Unconditional Branch
3113 TBB = I->getOperand(0).getMBB();
3114 return false;
3115 }
3116
3117 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3118 if (Pred == INVALID_BR)
3119 return true;
3120
3121 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3122 Cond.push_back(MachineOperand::CreateImm(Pred));
3123 Cond.push_back(I->getOperand(1)); // Save the branch register.
3124
3125 ++I;
3126
3127 if (I == MBB.end()) {
3128 // Conditional branch followed by fall-through.
3129 TBB = CondBB;
3130 return false;
3131 }
3132
3133 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3134 TBB = CondBB;
3135 FBB = I->getOperand(0).getMBB();
3136 return false;
3137 }
3138
3139 return true;
3140}
3141
3143 MachineBasicBlock *&FBB,
3145 bool AllowModify) const {
3146 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3147 auto E = MBB.end();
3148 if (I == E)
3149 return false;
3150
3151 // Skip over the instructions that are artificial terminators for special
3152 // exec management.
3153 while (I != E && !I->isBranch() && !I->isReturn()) {
3154 switch (I->getOpcode()) {
3155 case AMDGPU::S_MOV_B64_term:
3156 case AMDGPU::S_XOR_B64_term:
3157 case AMDGPU::S_OR_B64_term:
3158 case AMDGPU::S_ANDN2_B64_term:
3159 case AMDGPU::S_AND_B64_term:
3160 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3161 case AMDGPU::S_MOV_B32_term:
3162 case AMDGPU::S_XOR_B32_term:
3163 case AMDGPU::S_OR_B32_term:
3164 case AMDGPU::S_ANDN2_B32_term:
3165 case AMDGPU::S_AND_B32_term:
3166 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3167 break;
3168 case AMDGPU::SI_IF:
3169 case AMDGPU::SI_ELSE:
3170 case AMDGPU::SI_KILL_I1_TERMINATOR:
3171 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3172 // FIXME: It's messy that these need to be considered here at all.
3173 return true;
3174 default:
3175 llvm_unreachable("unexpected non-branch terminator inst");
3176 }
3177
3178 ++I;
3179 }
3180
3181 if (I == E)
3182 return false;
3183
3184 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3185}
3186
3188 int *BytesRemoved) const {
3189 unsigned Count = 0;
3190 unsigned RemovedSize = 0;
3191 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3192 // Skip over artificial terminators when removing instructions.
3193 if (MI.isBranch() || MI.isReturn()) {
3194 RemovedSize += getInstSizeInBytes(MI);
3195 MI.eraseFromParent();
3196 ++Count;
3197 }
3198 }
3199
3200 if (BytesRemoved)
3201 *BytesRemoved = RemovedSize;
3202
3203 return Count;
3204}
3205
3206// Copy the flags onto the implicit condition register operand.
3208 const MachineOperand &OrigCond) {
3209 CondReg.setIsUndef(OrigCond.isUndef());
3210 CondReg.setIsKill(OrigCond.isKill());
3211}
3212
3215 MachineBasicBlock *FBB,
3217 const DebugLoc &DL,
3218 int *BytesAdded) const {
3219 if (!FBB && Cond.empty()) {
3220 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3221 .addMBB(TBB);
3222 if (BytesAdded)
3223 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3224 return 1;
3225 }
3226
3227 assert(TBB && Cond[0].isImm());
3228
3229 unsigned Opcode
3230 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3231
3232 if (!FBB) {
3233 MachineInstr *CondBr =
3234 BuildMI(&MBB, DL, get(Opcode))
3235 .addMBB(TBB);
3236
3237 // Copy the flags onto the implicit condition register operand.
3238 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3239 fixImplicitOperands(*CondBr);
3240
3241 if (BytesAdded)
3242 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3243 return 1;
3244 }
3245
3246 assert(TBB && FBB);
3247
3248 MachineInstr *CondBr =
3249 BuildMI(&MBB, DL, get(Opcode))
3250 .addMBB(TBB);
3251 fixImplicitOperands(*CondBr);
3252 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3253 .addMBB(FBB);
3254
3255 MachineOperand &CondReg = CondBr->getOperand(1);
3256 CondReg.setIsUndef(Cond[1].isUndef());
3257 CondReg.setIsKill(Cond[1].isKill());
3258
3259 if (BytesAdded)
3260 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3261
3262 return 2;
3263}
3264
3267 if (Cond.size() != 2) {
3268 return true;
3269 }
3270
3271 if (Cond[0].isImm()) {
3272 Cond[0].setImm(-Cond[0].getImm());
3273 return false;
3274 }
3275
3276 return true;
3277}
3278
3281 Register DstReg, Register TrueReg,
3282 Register FalseReg, int &CondCycles,
3283 int &TrueCycles, int &FalseCycles) const {
3284 switch (Cond[0].getImm()) {
3285 case VCCNZ:
3286 case VCCZ: {
3287 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3288 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3289 if (MRI.getRegClass(FalseReg) != RC)
3290 return false;
3291
3292 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3293 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3294
3295 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3296 return RI.hasVGPRs(RC) && NumInsts <= 6;
3297 }
3298 case SCC_TRUE:
3299 case SCC_FALSE: {
3300 // FIXME: We could insert for VGPRs if we could replace the original compare
3301 // with a vector one.
3302 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3303 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3304 if (MRI.getRegClass(FalseReg) != RC)
3305 return false;
3306
3307 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3308
3309 // Multiples of 8 bytes can use s_cselect_b64.
3310 if (NumInsts % 2 == 0)
3311 NumInsts /= 2;
3312
3313 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3314 return RI.isSGPRClass(RC);
3315 }
3316 default:
3317 return false;
3318 }
3319}
3320
3324 Register TrueReg, Register FalseReg) const {
3325 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3326 if (Pred == VCCZ || Pred == SCC_FALSE) {
3327 Pred = static_cast<BranchPredicate>(-Pred);
3328 std::swap(TrueReg, FalseReg);
3329 }
3330
3331 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3332 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3333 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3334
3335 if (DstSize == 32) {
3337 if (Pred == SCC_TRUE) {
3338 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3339 .addReg(TrueReg)
3340 .addReg(FalseReg);
3341 } else {
3342 // Instruction's operands are backwards from what is expected.
3343 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3344 .addReg(FalseReg)
3345 .addReg(TrueReg);
3346 }
3347
3348 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3349 return;
3350 }
3351
3352 if (DstSize == 64 && Pred == SCC_TRUE) {
3354 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3355 .addReg(TrueReg)
3356 .addReg(FalseReg);
3357
3358 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3359 return;
3360 }
3361
3362 static const int16_t Sub0_15[] = {
3363 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3364 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3365 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3366 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3367 };
3368
3369 static const int16_t Sub0_15_64[] = {
3370 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3371 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3372 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3373 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3374 };
3375
3376 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3377 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3378 const int16_t *SubIndices = Sub0_15;
3379 int NElts = DstSize / 32;
3380
3381 // 64-bit select is only available for SALU.
3382 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3383 if (Pred == SCC_TRUE) {
3384 if (NElts % 2) {
3385 SelOp = AMDGPU::S_CSELECT_B32;
3386 EltRC = &AMDGPU::SGPR_32RegClass;
3387 } else {
3388 SelOp = AMDGPU::S_CSELECT_B64;
3389 EltRC = &AMDGPU::SGPR_64RegClass;
3390 SubIndices = Sub0_15_64;
3391 NElts /= 2;
3392 }
3393 }
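// For example, a 128-bit VGPR select on VCC expands to four v_cndmask_b32 on
// sub0..sub3 feeding the REG_SEQUENCE below, while a 128-bit SGPR select on
// SCC uses two s_cselect_b64 on sub0_sub1 and sub2_sub3.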
3394
3396 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3397
3398 I = MIB->getIterator();
3399
3401 for (int Idx = 0; Idx != NElts; ++Idx) {
3402 Register DstElt = MRI.createVirtualRegister(EltRC);
3403 Regs.push_back(DstElt);
3404
3405 unsigned SubIdx = SubIndices[Idx];
3406
3408 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3409 Select =
3410 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3411 .addReg(FalseReg, 0, SubIdx)
3412 .addReg(TrueReg, 0, SubIdx);
3413 } else {
3414 Select =
3415 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3416 .addReg(TrueReg, 0, SubIdx)
3417 .addReg(FalseReg, 0, SubIdx);
3418 }
3419
3420 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3422
3423 MIB.addReg(DstElt)
3424 .addImm(SubIdx);
3425 }
3426}
3427
3429 switch (MI.getOpcode()) {
3430 case AMDGPU::V_MOV_B16_t16_e32:
3431 case AMDGPU::V_MOV_B16_t16_e64:
3432 case AMDGPU::V_MOV_B32_e32:
3433 case AMDGPU::V_MOV_B32_e64:
3434 case AMDGPU::V_MOV_B64_PSEUDO:
3435 case AMDGPU::V_MOV_B64_e32:
3436 case AMDGPU::V_MOV_B64_e64:
3437 case AMDGPU::S_MOV_B32:
3438 case AMDGPU::S_MOV_B64:
3439 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3440 case AMDGPU::COPY:
3441 case AMDGPU::WWM_COPY:
3442 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3443 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3444 case AMDGPU::V_ACCVGPR_MOV_B32:
3445 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3446 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3447 return true;
3448 default:
3449 return false;
3450 }
3451}
3452
3453static constexpr AMDGPU::OpName ModifierOpNames[] = {
3454 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3455 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3456 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3457
3459 unsigned Opc = MI.getOpcode();
3460 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3461 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3462 if (Idx >= 0)
3463 MI.removeOperand(Idx);
3464 }
3465}
3466
3467std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3468 unsigned SubRegIndex) {
3469 switch (SubRegIndex) {
3470 case AMDGPU::NoSubRegister:
3471 return Imm;
3472 case AMDGPU::sub0:
3473 return SignExtend64<32>(Imm);
3474 case AMDGPU::sub1:
3475 return SignExtend64<32>(Imm >> 32);
3476 case AMDGPU::lo16:
3477 return SignExtend64<16>(Imm);
3478 case AMDGPU::hi16:
3479 return SignExtend64<16>(Imm >> 16);
3480 case AMDGPU::sub1_lo16:
3481 return SignExtend64<16>(Imm >> 32);
3482 case AMDGPU::sub1_hi16:
3483 return SignExtend64<16>(Imm >> 48);
3484 default:
3485 return std::nullopt;
3486 }
3487
3488 llvm_unreachable("covered subregister switch");
3489}
3490
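// Map a MAC/MAD/FMA/FMAC opcode to the *AK form, where the literal K is the
// addend (d = s0 * s1 + K). getNewFMAMKInst below gives the *MK form, where
// the literal is the multiplicand (d = s0 * K + s1).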
3491static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3492 switch (Opc) {
3493 case AMDGPU::V_MAC_F16_e32:
3494 case AMDGPU::V_MAC_F16_e64:
3495 case AMDGPU::V_MAD_F16_e64:
3496 return AMDGPU::V_MADAK_F16;
3497 case AMDGPU::V_MAC_F32_e32:
3498 case AMDGPU::V_MAC_F32_e64:
3499 case AMDGPU::V_MAD_F32_e64:
3500 return AMDGPU::V_MADAK_F32;
3501 case AMDGPU::V_FMAC_F32_e32:
3502 case AMDGPU::V_FMAC_F32_e64:
3503 case AMDGPU::V_FMA_F32_e64:
3504 return AMDGPU::V_FMAAK_F32;
3505 case AMDGPU::V_FMAC_F16_e32:
3506 case AMDGPU::V_FMAC_F16_e64:
3507 case AMDGPU::V_FMAC_F16_t16_e64:
3508 case AMDGPU::V_FMAC_F16_fake16_e64:
3509 case AMDGPU::V_FMA_F16_e64:
3510 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3511 ? AMDGPU::V_FMAAK_F16_t16
3512 : AMDGPU::V_FMAAK_F16_fake16
3513 : AMDGPU::V_FMAAK_F16;
3514 case AMDGPU::V_FMAC_F64_e32:
3515 case AMDGPU::V_FMAC_F64_e64:
3516 case AMDGPU::V_FMA_F64_e64:
3517 return AMDGPU::V_FMAAK_F64;
3518 default:
3519 llvm_unreachable("invalid instruction");
3520 }
3521}
3522
3523static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3524 switch (Opc) {
3525 case AMDGPU::V_MAC_F16_e32:
3526 case AMDGPU::V_MAC_F16_e64:
3527 case AMDGPU::V_MAD_F16_e64:
3528 return AMDGPU::V_MADMK_F16;
3529 case AMDGPU::V_MAC_F32_e32:
3530 case AMDGPU::V_MAC_F32_e64:
3531 case AMDGPU::V_MAD_F32_e64:
3532 return AMDGPU::V_MADMK_F32;
3533 case AMDGPU::V_FMAC_F32_e32:
3534 case AMDGPU::V_FMAC_F32_e64:
3535 case AMDGPU::V_FMA_F32_e64:
3536 return AMDGPU::V_FMAMK_F32;
3537 case AMDGPU::V_FMAC_F16_e32:
3538 case AMDGPU::V_FMAC_F16_e64:
3539 case AMDGPU::V_FMAC_F16_t16_e64:
3540 case AMDGPU::V_FMAC_F16_fake16_e64:
3541 case AMDGPU::V_FMA_F16_e64:
3542 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3543 ? AMDGPU::V_FMAMK_F16_t16
3544 : AMDGPU::V_FMAMK_F16_fake16
3545 : AMDGPU::V_FMAMK_F16;
3546 case AMDGPU::V_FMAC_F64_e32:
3547 case AMDGPU::V_FMAC_F64_e64:
3548 case AMDGPU::V_FMA_F64_e64:
3549 return AMDGPU::V_FMAMK_F64;
3550 default:
3551 llvm_unreachable("invalid instruction");
3552 }
3553}
3554
3555 bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3556                                 Register Reg, MachineRegisterInfo *MRI) const {
3557 int64_t Imm;
3558 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3559 return false;
3560
3561 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3562
3563 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3564
3565 unsigned Opc = UseMI.getOpcode();
3566 if (Opc == AMDGPU::COPY) {
3567 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3568
3569 Register DstReg = UseMI.getOperand(0).getReg();
3570 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3571
3572 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3573
3574 if (HasMultipleUses) {
3575 // TODO: This should fold in more cases with multiple use, but we need to
3576 // more carefully consider what those uses are.
3577 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3578
3579 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3580 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3581 return false;
3582
3583 // Most of the time folding a 32-bit inline constant is free (though this
3584 // might not be true if we can't later fold it into a real user).
3585 //
3586 // FIXME: This isInlineConstant check is imprecise if
3587 // getConstValDefinedInReg handled the tricky non-mov cases.
3588 if (ImmDefSize == 32 &&
3589         isInlineConstant(Imm, AMDGPU::OPERAND_REG_IMM_INT32))
3590       return false;
3591 }
3592
3593 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3594 RI.getSubRegIdxSize(UseSubReg) == 16;
3595
3596 if (Is16Bit) {
3597 if (RI.hasVGPRs(DstRC))
3598 return false; // Do not clobber vgpr_hi16
3599
3600 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3601 return false;
3602 }
3603
3604 MachineFunction *MF = UseMI.getMF();
3605
3606 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3607 MCRegister MovDstPhysReg =
3608 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3609
3610 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3611
3612 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
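    // Try the candidate mov opcodes in order: skip those whose destination
    // register class cannot be made compatible with DstReg, and give up if the
    // chosen mov cannot encode the (possibly subreg-extracted) immediate.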
3613 for (unsigned MovOp :
3614 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3615 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3616 const MCInstrDesc &MovDesc = get(MovOp);
3617
3618 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI, *MF);
3619 if (Is16Bit) {
3620 // We just need to find a correctly sized register class, so the
3621 // subregister index compatibility doesn't matter since we're statically
3622 // extracting the immediate value.
3623 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3624 if (!MovDstRC)
3625 continue;
3626
3627 if (MovDstPhysReg) {
3628 // FIXME: We probably should not do this. If there is a live value in
3629 // the high half of the register, it will be corrupted.
3630 MovDstPhysReg =
3631 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3632 if (!MovDstPhysReg)
3633 continue;
3634 }
3635 }
3636
3637 // Result class isn't the right size, try the next instruction.
3638 if (MovDstPhysReg) {
3639 if (!MovDstRC->contains(MovDstPhysReg))
3640 return false;
3641 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3642 // TODO: This will be overly conservative in the case of 16-bit virtual
3643 // SGPRs. We could hack up the virtual register uses to use a compatible
3644 // 32-bit class.
3645 continue;
3646 }
3647
3648 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3649
3650 // Ensure the interpreted immediate value is a valid operand in the new
3651 // mov.
3652 //
3653 // FIXME: isImmOperandLegal should have form that doesn't require existing
3654 // MachineInstr or MachineOperand
3655 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3656 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3657 break;
3658
3659 NewOpc = MovOp;
3660 break;
3661 }
3662
3663 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3664 return false;
3665
3666 if (Is16Bit) {
3667 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3668 if (MovDstPhysReg)
3669 UseMI.getOperand(0).setReg(MovDstPhysReg);
3670 assert(UseMI.getOperand(1).getReg().isVirtual());
3671 }
3672
3673 const MCInstrDesc &NewMCID = get(NewOpc);
3674 UseMI.setDesc(NewMCID);
3675 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3676 UseMI.addImplicitDefUseOperands(*MF);
3677 return true;
3678 }
3679
3680 if (HasMultipleUses)
3681 return false;
3682
3683 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3684 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3685 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3686 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3687 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3688 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3689 Opc == AMDGPU::V_FMAC_F64_e64) {
3690 // Don't fold if we are using source or output modifiers. The new VOP2
3691 // instructions don't have them.
3692     if (hasAnyModifiersSet(UseMI))
3693       return false;
3694
3695 // If this is a free constant, there's no reason to do this.
3696 // TODO: We could fold this here instead of letting SIFoldOperands do it
3697 // later.
3698 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3699
3700 // Any src operand can be used for the legality check.
3701 if (isInlineConstant(UseMI, Src0Idx, Imm))
3702 return false;
3703
3704 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3705
3706 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3707 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3708
3709 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3710 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3711 (Src1->isReg() && Src1->getReg() == Reg)) {
3712 MachineOperand *RegSrc =
3713 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3714 if (!RegSrc->isReg())
3715 return false;
3716 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3717 ST.getConstantBusLimit(Opc) < 2)
3718 return false;
3719
3720 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3721 return false;
3722
3723 // If src2 is also a literal constant then we have to choose which one to
3724 // fold. In general it is better to choose madak so that the other literal
3725 // can be materialized in an sgpr instead of a vgpr:
3726 // s_mov_b32 s0, literal
3727 // v_madak_f32 v0, s0, v0, literal
3728 // Instead of:
3729 // v_mov_b32 v1, literal
3730 // v_madmk_f32 v0, v0, literal, v1
3731 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3732 if (Def && Def->isMoveImmediate() &&
3733 !isInlineConstant(Def->getOperand(1)))
3734 return false;
3735
3736 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3737 if (pseudoToMCOpcode(NewOpc) == -1)
3738 return false;
3739
3740 // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3741 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3742 // restricting their register classes. For now just bail out.
3743 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3744 NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3745 return false;
3746
3747 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3748 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3749
3750 // FIXME: This would be a lot easier if we could return a new instruction
3751 // instead of having to modify in place.
3752
3753 Register SrcReg = RegSrc->getReg();
3754 unsigned SrcSubReg = RegSrc->getSubReg();
3755 Src0->setReg(SrcReg);
3756 Src0->setSubReg(SrcSubReg);
3757 Src0->setIsKill(RegSrc->isKill());
3758
3759 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3760 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3761 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3762 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3763 UseMI.untieRegOperand(
3764 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3765
3766 Src1->ChangeToImmediate(*SubRegImm);
3767
3768       removeModOperands(UseMI);
3769       UseMI.setDesc(get(NewOpc));
3770
3771 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3772 if (DeleteDef)
3773 DefMI.eraseFromParent();
3774
3775 return true;
3776 }
3777
3778 // Added part is the constant: Use v_madak_{f16, f32}.
3779 if (Src2->isReg() && Src2->getReg() == Reg) {
3780 if (ST.getConstantBusLimit(Opc) < 2) {
3781 // Not allowed to use constant bus for another operand.
3782 // We can however allow an inline immediate as src0.
3783 bool Src0Inlined = false;
3784 if (Src0->isReg()) {
3785 // Try to inline constant if possible.
3786         // If the def is a move of an immediate and this is its only use,
3787         // we save a VGPR here.
3788 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3789 if (Def && Def->isMoveImmediate() &&
3790 isInlineConstant(Def->getOperand(1)) &&
3791 MRI->hasOneUse(Src0->getReg())) {
3792 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3793 Src0Inlined = true;
3794 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3795 RI.isSGPRReg(*MRI, Src0->getReg())) {
3796 return false;
3797 }
3798 // VGPR is okay as Src0 - fallthrough
3799 }
3800
3801 if (Src1->isReg() && !Src0Inlined) {
3802 // We have one slot for inlinable constant so far - try to fill it
3803 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3804 if (Def && Def->isMoveImmediate() &&
3805 isInlineConstant(Def->getOperand(1)) &&
3806 MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3807 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3808 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3809 return false;
3810 // VGPR is okay as Src1 - fallthrough
3811 }
3812 }
3813
3814 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3815 if (pseudoToMCOpcode(NewOpc) == -1)
3816 return false;
3817
3818 // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3819 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3820 // restricting their register classes. For now just bail out.
3821 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3822 NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3823 return false;
3824
3825 // FIXME: This would be a lot easier if we could return a new instruction
3826 // instead of having to modify in place.
3827
3828 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3829 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3830 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3831 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3832 UseMI.untieRegOperand(
3833 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3834
3835 const std::optional<int64_t> SubRegImm =
3836 extractSubregFromImm(Imm, Src2->getSubReg());
3837
3838 // ChangingToImmediate adds Src2 back to the instruction.
3839 Src2->ChangeToImmediate(*SubRegImm);
3840
3841 // These come before src2.
3842       removeModOperands(UseMI);
3843       UseMI.setDesc(get(NewOpc));
3844       // It might happen that UseMI was commuted and we now have an SGPR as
3845       // src1. If so, the inline constant together with the SGPR would be
3846       // illegal, so legalize the operands.
3847       legalizeOperands(UseMI);
3848
3849 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3850 if (DeleteDef)
3851 DefMI.eraseFromParent();
3852
3853 return true;
3854 }
3855 }
3856
3857 return false;
3858}
3859
3860static bool
3861 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3862                            ArrayRef<const MachineOperand *> BaseOps2) {
3863   if (BaseOps1.size() != BaseOps2.size())
3864 return false;
3865 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3866 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3867 return false;
3868 }
3869 return true;
3870}
3871
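// Two accesses are disjoint when the lower one ends at or before the higher
// one begins, e.g. a 4-byte access at offset 0 and an 8-byte access at offset
// 4 do not overlap (0 + 4 <= 4).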
3872static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3873 LocationSize WidthB, int OffsetB) {
3874 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3875 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3876 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3877 return LowWidth.hasValue() &&
3878 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3879}
3880
3881bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3882 const MachineInstr &MIb) const {
3883 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3884 int64_t Offset0, Offset1;
3885 LocationSize Dummy0 = LocationSize::precise(0);
3886 LocationSize Dummy1 = LocationSize::precise(0);
3887 bool Offset0IsScalable, Offset1IsScalable;
3888 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3889 Dummy0, &RI) ||
3890 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3891 Dummy1, &RI))
3892 return false;
3893
3894 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3895 return false;
3896
3897 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3898 // FIXME: Handle ds_read2 / ds_write2.
3899 return false;
3900 }
3901 LocationSize Width0 = MIa.memoperands().front()->getSize();
3902 LocationSize Width1 = MIb.memoperands().front()->getSize();
3903 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3904}
3905
3906 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3907                                                   const MachineInstr &MIb) const {
3908 assert(MIa.mayLoadOrStore() &&
3909 "MIa must load from or modify a memory location");
3910 assert(MIb.mayLoadOrStore() &&
3911 "MIb must load from or modify a memory location");
3912
3913   if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3914     return false;
3915
3916 // XXX - Can we relax this between address spaces?
3917 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3918 return false;
3919
3920 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3921 return false;
3922
3923 // TODO: Should we check the address space from the MachineMemOperand? That
3924 // would allow us to distinguish objects we know don't alias based on the
3925 // underlying address space, even if it was lowered to a different one,
3926 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3927 // buffer.
3928 if (isDS(MIa)) {
3929 if (isDS(MIb))
3930 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3931
3932 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3933 }
3934
3935 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3936 if (isMUBUF(MIb) || isMTBUF(MIb))
3937 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3938
3939 if (isFLAT(MIb))
3940 return isFLATScratch(MIb);
3941
3942 return !isSMRD(MIb);
3943 }
3944
3945 if (isSMRD(MIa)) {
3946 if (isSMRD(MIb))
3947 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3948
3949 if (isFLAT(MIb))
3950 return isFLATScratch(MIb);
3951
3952 return !isMUBUF(MIb) && !isMTBUF(MIb);
3953 }
3954
3955 if (isFLAT(MIa)) {
3956 if (isFLAT(MIb)) {
3957 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3958 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3959 return true;
3960
3961 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3962 }
3963
3964 return false;
3965 }
3966
3967 return false;
3968}
3969
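// If the unique def of \p Reg is a foldable copy with an immediate source,
// return its value through \p Imm and optionally the defining instruction
// through \p DefMI.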
3970 static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3971                            int64_t &Imm, MachineInstr **DefMI = nullptr) {
3972 if (Reg.isPhysical())
3973 return false;
3974 auto *Def = MRI.getUniqueVRegDef(Reg);
3975 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3976 Imm = Def->getOperand(1).getImm();
3977 if (DefMI)
3978 *DefMI = Def;
3979 return true;
3980 }
3981 return false;
3982}
3983
3984static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3985 MachineInstr **DefMI = nullptr) {
3986 if (!MO->isReg())
3987 return false;
3988 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3989 const MachineRegisterInfo &MRI = MF->getRegInfo();
3990 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3991}
3992
3993 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3994                                 MachineInstr &NewMI) {
3995 if (LV) {
3996 unsigned NumOps = MI.getNumOperands();
3997 for (unsigned I = 1; I < NumOps; ++I) {
3998 MachineOperand &Op = MI.getOperand(I);
3999 if (Op.isReg() && Op.isKill())
4000 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4001 }
4002 }
4003}
4004
4005static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4006 switch (Opc) {
4007 case AMDGPU::V_MAC_F16_e32:
4008 case AMDGPU::V_MAC_F16_e64:
4009 return AMDGPU::V_MAD_F16_e64;
4010 case AMDGPU::V_MAC_F32_e32:
4011 case AMDGPU::V_MAC_F32_e64:
4012 return AMDGPU::V_MAD_F32_e64;
4013 case AMDGPU::V_MAC_LEGACY_F32_e32:
4014 case AMDGPU::V_MAC_LEGACY_F32_e64:
4015 return AMDGPU::V_MAD_LEGACY_F32_e64;
4016 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4017 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4018 return AMDGPU::V_FMA_LEGACY_F32_e64;
4019 case AMDGPU::V_FMAC_F16_e32:
4020 case AMDGPU::V_FMAC_F16_e64:
4021 case AMDGPU::V_FMAC_F16_t16_e64:
4022 case AMDGPU::V_FMAC_F16_fake16_e64:
4023 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4024 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4025 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4026 : AMDGPU::V_FMA_F16_gfx9_e64;
4027 case AMDGPU::V_FMAC_F32_e32:
4028 case AMDGPU::V_FMAC_F32_e64:
4029 return AMDGPU::V_FMA_F32_e64;
4030 case AMDGPU::V_FMAC_F64_e32:
4031 case AMDGPU::V_FMAC_F64_e64:
4032 return AMDGPU::V_FMA_F64_e64;
4033 default:
4034 llvm_unreachable("invalid instruction");
4035 }
4036}
4037
4038 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4039                                                  LiveVariables *LV,
4040 LiveIntervals *LIS) const {
4041 MachineBasicBlock &MBB = *MI.getParent();
4042 unsigned Opc = MI.getOpcode();
4043
4044 // Handle MFMA.
4045 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4046 if (NewMFMAOpc != -1) {
4047     MachineInstrBuilder MIB =
4048         BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4049 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4050 MIB.add(MI.getOperand(I));
4051 updateLiveVariables(LV, MI, *MIB);
4052 if (LIS) {
4053 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4054 // SlotIndex of defs needs to be updated when converting to early-clobber
4055 MachineOperand &Def = MIB->getOperand(0);
4056 if (Def.isEarlyClobber() && Def.isReg() &&
4057 LIS->hasInterval(Def.getReg())) {
4058 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
4059 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
4060 auto &LI = LIS->getInterval(Def.getReg());
4061 auto UpdateDefIndex = [&](LiveRange &LR) {
4062 auto *S = LR.find(OldIndex);
4063 if (S != LR.end() && S->start == OldIndex) {
4064 assert(S->valno && S->valno->def == OldIndex);
4065 S->start = NewIndex;
4066 S->valno->def = NewIndex;
4067 }
4068 };
4069 UpdateDefIndex(LI);
4070 for (auto &SR : LI.subranges())
4071 UpdateDefIndex(SR);
4072 }
4073 }
4074 return MIB;
4075 }
4076
4077 if (SIInstrInfo::isWMMA(MI)) {
4078 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4079 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4080 .setMIFlags(MI.getFlags());
4081 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4082 MIB->addOperand(MI.getOperand(I));
4083
4084 updateLiveVariables(LV, MI, *MIB);
4085 if (LIS)
4086 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4087
4088 return MIB;
4089 }
4090
4091 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4092 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4093 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4094 "present pre-RA");
4095
4096 // Handle MAC/FMAC.
4097 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4098 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4099 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4100 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4101 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4102 bool Src0Literal = false;
4103
4104 switch (Opc) {
4105 default:
4106 return nullptr;
4107 case AMDGPU::V_MAC_F16_e64:
4108 case AMDGPU::V_FMAC_F16_e64:
4109 case AMDGPU::V_FMAC_F16_t16_e64:
4110 case AMDGPU::V_FMAC_F16_fake16_e64:
4111 case AMDGPU::V_MAC_F32_e64:
4112 case AMDGPU::V_MAC_LEGACY_F32_e64:
4113 case AMDGPU::V_FMAC_F32_e64:
4114 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4115 case AMDGPU::V_FMAC_F64_e64:
4116 break;
4117 case AMDGPU::V_MAC_F16_e32:
4118 case AMDGPU::V_FMAC_F16_e32:
4119 case AMDGPU::V_MAC_F32_e32:
4120 case AMDGPU::V_MAC_LEGACY_F32_e32:
4121 case AMDGPU::V_FMAC_F32_e32:
4122 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4123 case AMDGPU::V_FMAC_F64_e32: {
4124 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4125 AMDGPU::OpName::src0);
4126 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4127 if (!Src0->isReg() && !Src0->isImm())
4128 return nullptr;
4129
4130 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4131 Src0Literal = true;
4132
4133 break;
4134 }
4135 }
4136
4138 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4139 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4140 const MachineOperand *Src0Mods =
4141 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4142 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4143 const MachineOperand *Src1Mods =
4144 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4145 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4146 const MachineOperand *Src2Mods =
4147 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4148 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4149 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4150 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4151
4152 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4153 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4154 // If we have an SGPR input, we will violate the constant bus restriction.
4155 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4156 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4157     MachineInstr *DefMI;
4158     const auto killDef = [&]() -> void {
4159 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4160 // The only user is the instruction which will be killed.
4161 Register DefReg = DefMI->getOperand(0).getReg();
4162
4163 if (MRI.hasOneNonDBGUse(DefReg)) {
4164       // We cannot just remove DefMI here; the calling pass would crash.
4165 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
4166 DefMI->getOperand(0).setIsDead(true);
4167 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
4168 DefMI->removeOperand(I);
4169 if (LV)
4170 LV->getVarInfo(DefReg).AliveBlocks.clear();
4171 }
4172
4173 if (LIS) {
4174 LiveInterval &DefLI = LIS->getInterval(DefReg);
4175
4176 // We cannot delete the original instruction here, so hack out the use
4177 // in the original instruction with a dummy register so we can use
4178 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4179 // not have the complexity of deleting a use to consider here.
4180 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4181 for (MachineOperand &MIOp : MI.uses()) {
4182 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4183 MIOp.setIsUndef(true);
4184 MIOp.setReg(DummyReg);
4185 }
4186 }
4187
4188 LIS->shrinkToUses(&DefLI);
4189 }
4190 };
4191
4192 int64_t Imm;
4193 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4194 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4195 if (pseudoToMCOpcode(NewOpc) != -1) {
4196 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4197 .add(*Dst)
4198 .add(*Src0)
4199 .add(*Src1)
4200 .addImm(Imm)
4201 .setMIFlags(MI.getFlags());
4202 updateLiveVariables(LV, MI, *MIB);
4203 if (LIS)
4204 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4205 killDef();
4206 return MIB;
4207 }
4208 }
4209 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4210 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4211 if (pseudoToMCOpcode(NewOpc) != -1) {
4212 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4213 .add(*Dst)
4214 .add(*Src0)
4215 .addImm(Imm)
4216 .add(*Src2)
4217 .setMIFlags(MI.getFlags());
4218 updateLiveVariables(LV, MI, *MIB);
4219
4220 if (LIS)
4221 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4222 killDef();
4223 return MIB;
4224 }
4225 }
4226 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4227 if (Src0Literal) {
4228 Imm = Src0->getImm();
4229 DefMI = nullptr;
4230 }
4231 if (pseudoToMCOpcode(NewOpc) != -1 &&
4232         isOperandLegal(
4233             MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4234 Src1)) {
4235 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4236 .add(*Dst)
4237 .add(*Src1)
4238 .addImm(Imm)
4239 .add(*Src2)
4240 .setMIFlags(MI.getFlags());
4241 updateLiveVariables(LV, MI, *MIB);
4242
4243 if (LIS)
4244 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4245 if (DefMI)
4246 killDef();
4247 return MIB;
4248 }
4249 }
4250 }
4251
4252 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4253 // if VOP3 does not allow a literal operand.
4254 if (Src0Literal && !ST.hasVOP3Literal())
4255 return nullptr;
4256
4257 unsigned NewOpc = getNewFMAInst(ST, Opc);
4258
4259 if (pseudoToMCOpcode(NewOpc) == -1)
4260 return nullptr;
4261
4262 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4263 .add(*Dst)
4264 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4265 .add(*Src0)
4266 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4267 .add(*Src1)
4268 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4269 .add(*Src2)
4270 .addImm(Clamp ? Clamp->getImm() : 0)
4271 .addImm(Omod ? Omod->getImm() : 0)
4272 .setMIFlags(MI.getFlags());
4273 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4274 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4275 updateLiveVariables(LV, MI, *MIB);
4276 if (LIS)
4277 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4278 return MIB;
4279}
4280
4281// It's not generally safe to move VALU instructions across these since it will
4282// start using the register as a base index rather than directly.
4283// XXX - Why isn't hasSideEffects sufficient for these?
4284 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4285   switch (MI.getOpcode()) {
4286 case AMDGPU::S_SET_GPR_IDX_ON:
4287 case AMDGPU::S_SET_GPR_IDX_MODE:
4288 case AMDGPU::S_SET_GPR_IDX_OFF:
4289 return true;
4290 default:
4291 return false;
4292 }
4293}
4294
4295 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4296                                        const MachineBasicBlock *MBB,
4297 const MachineFunction &MF) const {
4298 // Skipping the check for SP writes in the base implementation. The reason it
4299 // was added was apparently due to compile time concerns.
4300 //
4301 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4302 // but is probably avoidable.
4303
4304 // Copied from base implementation.
4305 // Terminators and labels can't be scheduled around.
4306 if (MI.isTerminator() || MI.isPosition())
4307 return true;
4308
4309 // INLINEASM_BR can jump to another block
4310 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4311 return true;
4312
4313 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4314 return true;
4315
4316 // Target-independent instructions do not have an implicit-use of EXEC, even
4317 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4318 // boundaries prevents incorrect movements of such instructions.
4319 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4320 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4321 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4322 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4323 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4324          changesVGPRIndexingMode(MI);
4325}
4326
4327 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4328   return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4329 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4330 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4331}
4332
4333 bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
4334   if (!isFLAT(MI) || isFLATGlobal(MI))
4335 return false;
4336
4337 // If scratch is not initialized, we can never access it.
4338 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4339 return false;
4340
4341 // SCRATCH instructions always access scratch.
4342 if (isFLATScratch(MI))
4343 return true;
4344
4345 // If there are no memory operands then conservatively assume the flat
4346 // operation may access scratch.
4347 if (MI.memoperands_empty())
4348 return true;
4349
4350 // See if any memory operand specifies an address space that involves scratch.
4351 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4352 unsigned AS = Memop->getAddrSpace();
4353 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4354 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4355 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4356 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4357 }
4358 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4359 });
4360}
4361
4362 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4363   // Skip the full operand and register alias search modifiesRegister
4364 // does. There's only a handful of instructions that touch this, it's only an
4365 // implicit def, and doesn't alias any other registers.
4366 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4367}
4368
4369 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4370   unsigned Opcode = MI.getOpcode();
4371
4372 if (MI.mayStore() && isSMRD(MI))
4373 return true; // scalar store or atomic
4374
4375 // This will terminate the function when other lanes may need to continue.
4376 if (MI.isReturn())
4377 return true;
4378
4379 // These instructions cause shader I/O that may cause hardware lockups
4380 // when executed with an empty EXEC mask.
4381 //
4382 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4383 // EXEC = 0, but checking for that case here seems not worth it
4384 // given the typical code patterns.
4385 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4386 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4387 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4388 return true;
4389
4390 if (MI.isCall() || MI.isInlineAsm())
4391 return true; // conservative assumption
4392
4393 // Assume that barrier interactions are only intended with active lanes.
4394 if (isBarrier(Opcode))
4395 return true;
4396
4397 // A mode change is a scalar operation that influences vector instructions.
4398   if (modifiesModeRegister(MI))
4399     return true;
4400
4401 // These are like SALU instructions in terms of effects, so it's questionable
4402 // whether we should return true for those.
4403 //
4404 // However, executing them with EXEC = 0 causes them to operate on undefined
4405 // data, which we avoid by returning true here.
4406 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4407 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4408 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4409 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4410 return true;
4411
4412 return false;
4413}
4414
4415 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4416                               const MachineInstr &MI) const {
4417 if (MI.isMetaInstruction())
4418 return false;
4419
4420 // This won't read exec if this is an SGPR->SGPR copy.
4421 if (MI.isCopyLike()) {
4422 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4423 return true;
4424
4425 // Make sure this isn't copying exec as a normal operand
4426 return MI.readsRegister(AMDGPU::EXEC, &RI);
4427 }
4428
4429 // Make a conservative assumption about the callee.
4430 if (MI.isCall())
4431 return true;
4432
4433 // Be conservative with any unhandled generic opcodes.
4434 if (!isTargetSpecificOpcode(MI.getOpcode()))
4435 return true;
4436
4437 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4438}
4439
4440bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4441 switch (Imm.getBitWidth()) {
4442 case 1: // This likely will be a condition code mask.
4443 return true;
4444
4445 case 32:
4446 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4447 ST.hasInv2PiInlineImm());
4448 case 64:
4449 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4450 ST.hasInv2PiInlineImm());
4451 case 16:
4452 return ST.has16BitInsts() &&
4453 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4454 ST.hasInv2PiInlineImm());
4455 default:
4456 llvm_unreachable("invalid bitwidth");
4457 }
4458}
4459
4460 bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4461   APInt IntImm = Imm.bitcastToAPInt();
4462 int64_t IntImmVal = IntImm.getSExtValue();
4463 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4464 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4465 default:
4466 llvm_unreachable("invalid fltSemantics");
4467   case APFloat::S_IEEEsingle:
4468   case APFloat::S_IEEEdouble:
4469     return isInlineConstant(IntImm);
4470   case APFloat::S_BFloat:
4471     return ST.has16BitInsts() &&
4472 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4473   case APFloat::S_IEEEhalf:
4474     return ST.has16BitInsts() &&
4475 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4476 }
4477}
4478
4479bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4480 // MachineOperand provides no way to tell the true operand size, since it only
4481 // records a 64-bit value. We need to know the size to determine if a 32-bit
4482 // floating point immediate bit pattern is legal for an integer immediate. It
4483 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4484 switch (OperandType) {
4494 int32_t Trunc = static_cast<int32_t>(Imm);
4495 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4496 }
4502 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4505 // We would expect inline immediates to not be concerned with an integer/fp
4506 // distinction. However, in the case of 16-bit integer operations, the
4507   // "floating point" values appear to not work. It seems to read the low
4508   // 16 bits of 32-bit immediates, which happens to always work for the
4509   // integer values.
4510 //
4511 // See llvm bugzilla 46302.
4512 //
4513 // TODO: Theoretically we could use op-sel to use the high bits of the
4514 // 32-bit FP values.
4526 return false;
4529 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4530 // A few special case instructions have 16-bit operands on subtargets
4531 // where 16-bit instructions are not legal.
4532 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4533 // constants in these cases
4534 int16_t Trunc = static_cast<int16_t>(Imm);
4535 return ST.has16BitInsts() &&
4536 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4537 }
4538
4539 return false;
4540 }
4543 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4544 int16_t Trunc = static_cast<int16_t>(Imm);
4545 return ST.has16BitInsts() &&
4546 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4547 }
4548 return false;
4549 }
4553 return false;
4555 return isLegalAV64PseudoImm(Imm);
4558 // Always embedded in the instruction for free.
4559 return true;
4569 // Just ignore anything else.
4570 return true;
4571 default:
4572 llvm_unreachable("invalid operand type");
4573 }
4574}
4575
4576static bool compareMachineOp(const MachineOperand &Op0,
4577 const MachineOperand &Op1) {
4578 if (Op0.getType() != Op1.getType())
4579 return false;
4580
4581 switch (Op0.getType()) {
4583 return Op0.getReg() == Op1.getReg();
4585 return Op0.getImm() == Op1.getImm();
4586 default:
4587 llvm_unreachable("Didn't expect to be comparing these operand types");
4588 }
4589}
4590
4591 bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4592                                         const MCOperandInfo &OpInfo) const {
4593 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4594 return true;
4595
4596 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4597 return false;
4598
4599 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4600 return true;
4601
4602 return ST.hasVOP3Literal();
4603}
4604
4605bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4606 int64_t ImmVal) const {
4607 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4608 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4609 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4610 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4611 AMDGPU::OpName::src2))
4612 return false;
4613 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4614 }
4615
4616 return isLiteralOperandLegal(InstDesc, OpInfo);
4617}
4618
4619bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4620 const MachineOperand &MO) const {
4621 if (MO.isImm())
4622 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4623
4624 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4625 "unexpected imm-like operand kind");
4626 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4627 return isLiteralOperandLegal(InstDesc, OpInfo);
4628}
4629
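// For example, 0x3F80000000000000 (1.0f in the high half, 0 in the low half)
// is legal because both 32-bit halves are inline constants, while
// 0x3FF0000000000000 (the f64 bit pattern of 1.0) is not, since 0x3FF00000 is
// not an inlinable 32-bit value.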
4630 bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
4631   // 2 32-bit inline constants packed into one.
4632 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4633 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4634}
4635
4636bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4637 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4638 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4639 return false;
4640
4641 int Op32 = AMDGPU::getVOPe32(Opcode);
4642 if (Op32 == -1)
4643 return false;
4644
4645 return pseudoToMCOpcode(Op32) != -1;
4646}
4647
4648bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4649 // The src0_modifier operand is present on all instructions
4650 // that have modifiers.
4651
4652 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4653}
4654
4655 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4656                                   AMDGPU::OpName OpName) const {
4657 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4658 return Mods && Mods->getImm();
4659}
4660
4661 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4662   return any_of(ModifierOpNames,
4663 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4664}
4665
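// A VOP3-encoded instruction can only be shrunk to its 32-bit encoding if it
// uses no source or output modifiers and, for the opcodes handled below, its
// src1/src2 operands are VGPRs.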
4666 bool SIInstrInfo::canShrink(const MachineInstr &MI,
4667                             const MachineRegisterInfo &MRI) const {
4668 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4669 // Can't shrink instruction with three operands.
4670 if (Src2) {
4671 switch (MI.getOpcode()) {
4672 default: return false;
4673
4674 case AMDGPU::V_ADDC_U32_e64:
4675 case AMDGPU::V_SUBB_U32_e64:
4676 case AMDGPU::V_SUBBREV_U32_e64: {
4677 const MachineOperand *Src1
4678 = getNamedOperand(MI, AMDGPU::OpName::src1);
4679 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4680 return false;
4681 // Additional verification is needed for sdst/src2.
4682 return true;
4683 }
4684 case AMDGPU::V_MAC_F16_e64:
4685 case AMDGPU::V_MAC_F32_e64:
4686 case AMDGPU::V_MAC_LEGACY_F32_e64:
4687 case AMDGPU::V_FMAC_F16_e64:
4688 case AMDGPU::V_FMAC_F16_t16_e64:
4689 case AMDGPU::V_FMAC_F16_fake16_e64:
4690 case AMDGPU::V_FMAC_F32_e64:
4691 case AMDGPU::V_FMAC_F64_e64:
4692 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4693 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4694 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4695 return false;
4696 break;
4697
4698 case AMDGPU::V_CNDMASK_B32_e64:
4699 break;
4700 }
4701 }
4702
4703 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4704 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4705 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4706 return false;
4707
4708 // We don't need to check src0, all input types are legal, so just make sure
4709 // src0 isn't using any modifiers.
4710 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4711 return false;
4712
4713 // Can it be shrunk to a valid 32 bit opcode?
4714 if (!hasVALU32BitEncoding(MI.getOpcode()))
4715 return false;
4716
4717 // Check output modifiers
4718 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4719 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4720 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4721 // TODO: Can we avoid checking bound_ctrl/fi here?
4722 // They are only used by permlane*_swap special case.
4723 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4724 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4725}
4726
4727// Set VCC operand with all flags from \p Orig, except for setting it as
4728// implicit.
4729 static void copyFlagsToImplicitVCC(MachineInstr &MI,
4730                                    const MachineOperand &Orig) {
4731
4732 for (MachineOperand &Use : MI.implicit_operands()) {
4733 if (Use.isUse() &&
4734 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4735 Use.setIsUndef(Orig.isUndef());
4736 Use.setIsKill(Orig.isKill());
4737 return;
4738 }
4739 }
4740}
4741
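// Materialize the 32-bit encoded form of MI next to it, copying only the
// operands that exist in the shrunk opcode; in the VOP3->VOPC case the
// explicit SGPR def is dropped in favor of an implicit def of VCC.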
4742 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4743                                            unsigned Op32) const {
4744 MachineBasicBlock *MBB = MI.getParent();
4745
4746 const MCInstrDesc &Op32Desc = get(Op32);
4747 MachineInstrBuilder Inst32 =
4748 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4749 .setMIFlags(MI.getFlags());
4750
4751 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4752 // For VOPC instructions, this is replaced by an implicit def of vcc.
4753
4754 // We assume the defs of the shrunk opcode are in the same order, and the
4755 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4756 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4757 Inst32.add(MI.getOperand(I));
4758
4759 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4760
4761 int Idx = MI.getNumExplicitDefs();
4762 for (const MachineOperand &Use : MI.explicit_uses()) {
4763 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4765 continue;
4766
4767 if (&Use == Src2) {
4768 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4769 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4770 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4771 // of vcc was already added during the initial BuildMI, but we
4772 // 1) may need to change vcc to vcc_lo to preserve the original register
4773 // 2) have to preserve the original flags.
4774 copyFlagsToImplicitVCC(*Inst32, *Src2);
4775 continue;
4776 }
4777 }
4778
4779 Inst32.add(Use);
4780 }
4781
4782 // FIXME: Losing implicit operands
4783 fixImplicitOperands(*Inst32);
4784 return Inst32;
4785}
4786
4787 bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
4788   // Null is free
4789 Register Reg = RegOp.getReg();
4790 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
4791 return false;
4792
4793 // SGPRs use the constant bus
4794
4795 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
4796 // physical register operands should also count, except for exec.
4797 if (RegOp.isImplicit())
4798 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
4799
4800 // SGPRs use the constant bus
4801 return AMDGPU::SReg_32RegClass.contains(Reg) ||
4802 AMDGPU::SReg_64RegClass.contains(Reg);
4803}
4804
4806 const MachineRegisterInfo &MRI) const {
4807 Register Reg = RegOp.getReg();
4808 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4809 : physRegUsesConstantBus(RegOp);
4810}
4811
4812 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4813                                   const MachineOperand &MO,
4814 const MCOperandInfo &OpInfo) const {
4815 // Literal constants use the constant bus.
4816 if (!MO.isReg())
4817 return !isInlineConstant(MO, OpInfo);
4818
4819 Register Reg = MO.getReg();
4820 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4821                          : physRegUsesConstantBus(MO);
4822}
4823
4824 static Register findImplicitSGPRRead(const MachineInstr &MI) {
4825   for (const MachineOperand &MO : MI.implicit_operands()) {
4826 // We only care about reads.
4827 if (MO.isDef())
4828 continue;
4829
4830 switch (MO.getReg()) {
4831 case AMDGPU::VCC:
4832 case AMDGPU::VCC_LO:
4833 case AMDGPU::VCC_HI:
4834 case AMDGPU::M0:
4835 case AMDGPU::FLAT_SCR:
4836 return MO.getReg();
4837
4838 default:
4839 break;
4840 }
4841 }
4842
4843 return Register();
4844}
4845
4846static bool shouldReadExec(const MachineInstr &MI) {
4847 if (SIInstrInfo::isVALU(MI)) {
4848 switch (MI.getOpcode()) {
4849 case AMDGPU::V_READLANE_B32:
4850 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4851 case AMDGPU::V_WRITELANE_B32:
4852 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4853 return false;
4854 }
4855
4856 return true;
4857 }
4858
4859 if (MI.isPreISelOpcode() ||
4860 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4861       SIInstrInfo::isSALU(MI) ||
4862       SIInstrInfo::isSMRD(MI))
4863     return false;
4864
4865 return true;
4866}
4867
4868static bool isRegOrFI(const MachineOperand &MO) {
4869 return MO.isReg() || MO.isFI();
4870}
4871
4872static bool isSubRegOf(const SIRegisterInfo &TRI,
4873 const MachineOperand &SuperVec,
4874 const MachineOperand &SubReg) {
4875 if (SubReg.getReg().isPhysical())
4876 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4877
4878 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4879 SubReg.getReg() == SuperVec.getReg();
4880}
4881
4882// Verify the illegal copy from vector register to SGPR for generic opcode COPY
4883bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
4884 const MachineRegisterInfo &MRI,
4885 StringRef &ErrInfo) const {
4886 Register DstReg = MI.getOperand(0).getReg();
4887 Register SrcReg = MI.getOperand(1).getReg();
4888 // This is a check for copy from vector register to SGPR
4889 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
4890 ErrInfo = "illegal copy from vector register to SGPR";
4891 return false;
4892 }
4893 return true;
4894}
4895
4896 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4897                                     StringRef &ErrInfo) const {
4898 uint16_t Opcode = MI.getOpcode();
4899 const MachineFunction *MF = MI.getParent()->getParent();
4900 const MachineRegisterInfo &MRI = MF->getRegInfo();
4901
4902 // FIXME: At this point the COPY verify is done only for non-ssa forms.
4903 // Find a better property to recognize the point where instruction selection
4904 // is just done.
4905 // We can only enforce this check after SIFixSGPRCopies pass so that the
4906 // illegal copies are legalized and thereafter we don't expect a pass
4907 // inserting similar copies.
4908 if (!MRI.isSSA() && MI.isCopy())
4909 return verifyCopy(MI, MRI, ErrInfo);
4910
4911 if (SIInstrInfo::isGenericOpcode(Opcode))
4912 return true;
4913
4914 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4915 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4916 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4917 int Src3Idx = -1;
4918 if (Src0Idx == -1) {
4919 // VOPD V_DUAL_* instructions use different operand names.
4920 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4921 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4922 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4923 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4924 }
4925
4926 // Make sure the number of operands is correct.
4927 const MCInstrDesc &Desc = get(Opcode);
4928 if (!Desc.isVariadic() &&
4929 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4930 ErrInfo = "Instruction has wrong number of operands.";
4931 return false;
4932 }
4933
4934 if (MI.isInlineAsm()) {
4935 // Verify register classes for inlineasm constraints.
4936 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4937 I != E; ++I) {
4938 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4939 if (!RC)
4940 continue;
4941
4942 const MachineOperand &Op = MI.getOperand(I);
4943 if (!Op.isReg())
4944 continue;
4945
4946 Register Reg = Op.getReg();
4947 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4948 ErrInfo = "inlineasm operand has incorrect register class.";
4949 return false;
4950 }
4951 }
4952
4953 return true;
4954 }
4955
4956 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4957 ErrInfo = "missing memory operand from image instruction.";
4958 return false;
4959 }
4960
4961 // Make sure the register classes are correct.
4962 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4963 const MachineOperand &MO = MI.getOperand(i);
4964 if (MO.isFPImm()) {
4965 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4966 "all fp values to integers.";
4967 return false;
4968 }
4969
4970 int RegClass = Desc.operands()[i].RegClass;
4971
4972 const MCOperandInfo &OpInfo = Desc.operands()[i];
4973 switch (OpInfo.OperandType) {
4975 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4976 ErrInfo = "Illegal immediate value for operand.";
4977 return false;
4978 }
4979 break;
4992 break;
4994 break;
4995 break;
5009 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5010 ErrInfo = "Illegal immediate value for operand.";
5011 return false;
5012 }
5013 break;
5014 }
5016 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5017 ErrInfo = "Expected inline constant for operand.";
5018 return false;
5019 }
5020 break;
5024 break;
5029 // Check if this operand is an immediate.
5030 // FrameIndex operands will be replaced by immediates, so they are
5031 // allowed.
5032 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5033 ErrInfo = "Expected immediate, but got non-immediate";
5034 return false;
5035 }
5036 break;
5040 break;
5041 default:
5042 if (OpInfo.isGenericType())
5043 continue;
5044 break;
5045 }
5046
5047 if (!MO.isReg())
5048 continue;
5049 Register Reg = MO.getReg();
5050 if (!Reg)
5051 continue;
5052
5053 // FIXME: Ideally we would have separate instruction definitions with the
5054 // aligned register constraint.
5055 // FIXME: We do not verify inline asm operands, but custom inline asm
5056 // verification is broken anyway
5057 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
5058 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5059 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5060 if (const TargetRegisterClass *SubRC =
5061 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5062 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5063 if (RC)
5064 RC = SubRC;
5065 }
5066 }
5067
5068 // Check that this is the aligned version of the class.
5069 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5070 ErrInfo = "Subtarget requires even aligned vector registers";
5071 return false;
5072 }
5073 }
5074
5075 if (RegClass != -1) {
5076 if (Reg.isVirtual())
5077 continue;
5078
5079 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5080 if (!RC->contains(Reg)) {
5081 ErrInfo = "Operand has incorrect register class.";
5082 return false;
5083 }
5084 }
5085 }
5086
5087 // Verify SDWA
5088 if (isSDWA(MI)) {
5089 if (!ST.hasSDWA()) {
5090 ErrInfo = "SDWA is not supported on this target";
5091 return false;
5092 }
5093
5094 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5095 AMDGPU::OpName::dst_sel}) {
5096 const MachineOperand *MO = getNamedOperand(MI, Op);
5097 if (!MO)
5098 continue;
5099 int64_t Imm = MO->getImm();
5100 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5101 ErrInfo = "Invalid SDWA selection";
5102 return false;
5103 }
5104 }
5105
5106 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5107
5108 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5109 if (OpIdx == -1)
5110 continue;
5111 const MachineOperand &MO = MI.getOperand(OpIdx);
5112
5113 if (!ST.hasSDWAScalar()) {
5114         // Only VGPRs on VI
5115 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5116 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5117 return false;
5118 }
5119 } else {
5120 // No immediates on GFX9
5121 if (!MO.isReg()) {
5122 ErrInfo =
5123 "Only reg allowed as operands in SDWA instructions on GFX9+";
5124 return false;
5125 }
5126 }
5127 }
5128
5129 if (!ST.hasSDWAOmod()) {
5130 // No omod allowed on VI
5131 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5132 if (OMod != nullptr &&
5133 (!OMod->isImm() || OMod->getImm() != 0)) {
5134 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5135 return false;
5136 }
5137 }
5138
5139 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5140 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5141 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5142 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5143 const MachineOperand *Src0ModsMO =
5144 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5145 unsigned Mods = Src0ModsMO->getImm();
5146 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5147 Mods & SISrcMods::SEXT) {
5148 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5149 return false;
5150 }
5151 }
5152
5153 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5154 if (isVOPC(BasicOpcode)) {
5155 if (!ST.hasSDWASdst() && DstIdx != -1) {
5156 // Only vcc allowed as dst on VI for VOPC
5157 const MachineOperand &Dst = MI.getOperand(DstIdx);
5158 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5159 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5160 return false;
5161 }
5162 } else if (!ST.hasSDWAOutModsVOPC()) {
5163 // No clamp allowed on GFX9 for VOPC
5164 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5165 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5166 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5167 return false;
5168 }
5169
5170 // No omod allowed on GFX9 for VOPC
5171 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5172 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5173 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5174 return false;
5175 }
5176 }
5177 }
5178
5179 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5180 if (DstUnused && DstUnused->isImm() &&
5181 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5182 const MachineOperand &Dst = MI.getOperand(DstIdx);
5183 if (!Dst.isReg() || !Dst.isTied()) {
5184 ErrInfo = "Dst register should have tied register";
5185 return false;
5186 }
5187
5188 const MachineOperand &TiedMO =
5189 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5190 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5191 ErrInfo =
5192 "Dst register should be tied to implicit use of preserved register";
5193 return false;
5194 }
5195 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5196 ErrInfo = "Dst register should use same physical register as preserved";
5197 return false;
5198 }
5199 }
5200 }
5201
5202 // Verify MIMG / VIMAGE / VSAMPLE
5203 if (isImage(Opcode) && !MI.mayStore()) {
5204 // Ensure that the return type used is large enough for all the options
5205     // being used. TFE/LWE require an extra result register.
5206 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5207 if (DMask) {
5208 uint64_t DMaskImm = DMask->getImm();
5209 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5210 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5211 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5212 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5213
5214 // Adjust for packed 16 bit values
5215 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5216 RegCount = divideCeil(RegCount, 2);
5217
5218 // Adjust if using LWE or TFE
5219 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5220 RegCount += 1;
5221
5222 const uint32_t DstIdx =
5223 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5224 const MachineOperand &Dst = MI.getOperand(DstIdx);
5225 if (Dst.isReg()) {
5226 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5227 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5228 if (RegCount > DstSize) {
5229 ErrInfo = "Image instruction returns too many registers for dst "
5230 "register class";
5231 return false;
5232 }
5233 }
5234 }
5235 }
5236
5237 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5238 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5239 unsigned ConstantBusCount = 0;
5240 bool UsesLiteral = false;
5241 const MachineOperand *LiteralVal = nullptr;
5242
5243 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5244 if (ImmIdx != -1) {
5245 ++ConstantBusCount;
5246 UsesLiteral = true;
5247 LiteralVal = &MI.getOperand(ImmIdx);
5248 }
5249
5250 SmallVector<Register, 2> SGPRsUsed;
5251 Register SGPRUsed;
5252
5253 // Only look at the true operands. Only a real operand can use the constant
5254 // bus, and we don't want to check pseudo-operands like the source modifier
5255 // flags.
5256 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5257 if (OpIdx == -1)
5258 continue;
5259 const MachineOperand &MO = MI.getOperand(OpIdx);
5260 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5261 if (MO.isReg()) {
5262 SGPRUsed = MO.getReg();
5263 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5264 ++ConstantBusCount;
5265 SGPRsUsed.push_back(SGPRUsed);
5266 }
5267 } else if (!MO.isFI()) { // Treat FI like a register.
5268 if (!UsesLiteral) {
5269 ++ConstantBusCount;
5270 UsesLiteral = true;
5271 LiteralVal = &MO;
5272 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5273 assert(isVOP2(MI) || isVOP3(MI));
5274 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5275 return false;
5276 }
5277 }
5278 }
5279 }
5280
5281 SGPRUsed = findImplicitSGPRRead(MI);
5282 if (SGPRUsed) {
5283 // Implicit uses may safely overlap true operands
5284 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5285 return !RI.regsOverlap(SGPRUsed, SGPR);
5286 })) {
5287 ++ConstantBusCount;
5288 SGPRsUsed.push_back(SGPRUsed);
5289 }
5290 }
5291
5292 // v_writelane_b32 is an exception from constant bus restriction:
5293 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5294 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5295 Opcode != AMDGPU::V_WRITELANE_B32) {
5296 ErrInfo = "VOP* instruction violates constant bus restriction";
5297 return false;
5298 }
5299
5300 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5301 ErrInfo = "VOP3 instruction uses literal";
5302 return false;
5303 }
5304 }
5305
5306 // Special case for writelane - this can break the multiple constant bus rule,
5307 // but still can't use more than one SGPR register
5308 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5309 unsigned SGPRCount = 0;
5310 Register SGPRUsed;
5311
5312 for (int OpIdx : {Src0Idx, Src1Idx}) {
5313 if (OpIdx == -1)
5314 break;
5315
5316 const MachineOperand &MO = MI.getOperand(OpIdx);
5317
5318 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5319 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5320 if (MO.getReg() != SGPRUsed)
5321 ++SGPRCount;
5322 SGPRUsed = MO.getReg();
5323 }
5324 }
5325 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5326 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5327 return false;
5328 }
5329 }
5330 }
5331
5332 // Verify misc. restrictions on specific instructions.
5333 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5334 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5335 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5336 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5337 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5338 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5339 if (!compareMachineOp(Src0, Src1) &&
5340 !compareMachineOp(Src0, Src2)) {
5341 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5342 return false;
5343 }
5344 }
5345 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5346 SISrcMods::ABS) ||
5347 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5348 SISrcMods::ABS) ||
5349 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5350 SISrcMods::ABS)) {
5351 ErrInfo = "ABS not allowed in VOP3B instructions";
5352 return false;
5353 }
5354 }
5355
5356 if (isSOP2(MI) || isSOPC(MI)) {
5357 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5358 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5359
5360 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5361 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5362 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5363 !Src0.isIdenticalTo(Src1)) {
5364 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5365 return false;
5366 }
5367 }
5368
5369 if (isSOPK(MI)) {
5370 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5371 if (Desc.isBranch()) {
5372 if (!Op->isMBB()) {
5373 ErrInfo = "invalid branch target for SOPK instruction";
5374 return false;
5375 }
5376 } else {
5377 uint64_t Imm = Op->getImm();
5378 if (sopkIsZext(Opcode)) {
5379 if (!isUInt<16>(Imm)) {
5380 ErrInfo = "invalid immediate for SOPK instruction";
5381 return false;
5382 }
5383 } else {
5384 if (!isInt<16>(Imm)) {
5385 ErrInfo = "invalid immediate for SOPK instruction";
5386 return false;
5387 }
5388 }
5389 }
5390 }
5391
5392 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5393 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5394 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5395 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5396 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5397 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5398
5399 const unsigned StaticNumOps =
5400 Desc.getNumOperands() + Desc.implicit_uses().size();
5401 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5402
5403 // Allow additional implicit operands. This allows a fixup done by the post
5404 // RA scheduler where the main implicit operand is killed and implicit-defs
5405 // are added for sub-registers that remain live after this instruction.
5406 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5407 ErrInfo = "missing implicit register operands";
5408 return false;
5409 }
5410
5411 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5412 if (IsDst) {
5413 if (!Dst->isUse()) {
5414 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5415 return false;
5416 }
5417
5418 unsigned UseOpIdx;
5419 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5420 UseOpIdx != StaticNumOps + 1) {
5421 ErrInfo = "movrel implicit operands should be tied";
5422 return false;
5423 }
5424 }
5425
5426 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5427 const MachineOperand &ImpUse
5428 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5429 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5430 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5431 ErrInfo = "src0 should be subreg of implicit vector use";
5432 return false;
5433 }
5434 }
5435
5436 // Make sure we aren't losing exec uses in the td files. This mostly requires
5437 // being careful when using let Uses to try to add other use registers.
5438 if (shouldReadExec(MI)) {
5439 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5440 ErrInfo = "VALU instruction does not implicitly read exec mask";
5441 return false;
5442 }
5443 }
5444
5445 if (isSMRD(MI)) {
5446 if (MI.mayStore() &&
5447 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5448 // The register offset form of scalar stores may only use m0 as the
5449 // soffset register.
5450 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5451 if (Soff && Soff->getReg() != AMDGPU::M0) {
5452 ErrInfo = "scalar stores must use m0 as offset register";
5453 return false;
5454 }
5455 }
5456 }
5457
5458 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5459 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5460 if (Offset->getImm() != 0) {
5461 ErrInfo = "subtarget does not support offsets in flat instructions";
5462 return false;
5463 }
5464 }
5465
5466 if (isDS(MI) && !ST.hasGDS()) {
5467 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5468 if (GDSOp && GDSOp->getImm() != 0) {
5469 ErrInfo = "GDS is not supported on this subtarget";
5470 return false;
5471 }
5472 }
5473
5474 if (isImage(MI)) {
5475 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5476 if (DimOp) {
5477 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5478 AMDGPU::OpName::vaddr0);
5479 AMDGPU::OpName RSrcOpName =
5480 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5481 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5482 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5483 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5484 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5485 const AMDGPU::MIMGDimInfo *Dim =
5486 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5487
5488 if (!Dim) {
5489 ErrInfo = "dim is out of range";
5490 return false;
5491 }
5492
5493 bool IsA16 = false;
5494 if (ST.hasR128A16()) {
5495 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5496 IsA16 = R128A16->getImm() != 0;
5497 } else if (ST.hasA16()) {
5498 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5499 IsA16 = A16->getImm() != 0;
5500 }
5501
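// In the NSA (non-sequential address) encoding each address component gets its
// own vaddr operand, so the distance between vaddr0 and the resource operand
// gives the number of address words directly.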
5502 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5503
5504 unsigned AddrWords =
5505 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5506
5507 unsigned VAddrWords;
5508 if (IsNSA) {
5509 VAddrWords = RsrcIdx - VAddr0Idx;
5510 if (ST.hasPartialNSAEncoding() &&
5511 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5512 unsigned LastVAddrIdx = RsrcIdx - 1;
5513 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5514 }
5515 } else {
5516 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5517 if (AddrWords > 12)
5518 AddrWords = 16;
5519 }
5520
5521 if (VAddrWords != AddrWords) {
5522 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5523 << " but got " << VAddrWords << "\n");
5524 ErrInfo = "bad vaddr size";
5525 return false;
5526 }
5527 }
5528 }
5529
5530 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5531 if (DppCt) {
5532 using namespace AMDGPU::DPP;
5533
5534 unsigned DC = DppCt->getImm();
5535 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5536 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5537 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5538 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5539 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5540 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5541 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5542 ErrInfo = "Invalid dpp_ctrl value";
5543 return false;
5544 }
5545 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5546 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5547 ErrInfo = "Invalid dpp_ctrl value: "
5548 "wavefront shifts are not supported on GFX10+";
5549 return false;
5550 }
5551 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5552 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5553 ErrInfo = "Invalid dpp_ctrl value: "
5554 "broadcasts are not supported on GFX10+";
5555 return false;
5556 }
5557 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5558 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5559 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5560 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5561 !ST.hasGFX90AInsts()) {
5562 ErrInfo = "Invalid dpp_ctrl value: "
5563 "row_newbroadcast/row_share is not supported before "
5564 "GFX90A/GFX10";
5565 return false;
5566 }
5567 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5568 ErrInfo = "Invalid dpp_ctrl value: "
5569 "row_share and row_xmask are not supported before GFX10";
5570 return false;
5571 }
5572 }
5573
5574 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5575 !AMDGPU::isLegalDPALU_DPPControl(DC) &&
5576 AMDGPU::isDPALU_DPP(Desc)) {
5577 ErrInfo = "Invalid dpp_ctrl value: "
5578 "DP ALU dpp only support row_newbcast";
5579 return false;
5580 }
5581 }
5582
5583 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5584 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5585 AMDGPU::OpName DataName =
5586 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5587 const MachineOperand *Data = getNamedOperand(MI, DataName);
5588 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5589 if (Data && !Data->isReg())
5590 Data = nullptr;
5591
5592 if (ST.hasGFX90AInsts()) {
5593 if (Dst && Data &&
5594 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5595 ErrInfo = "Invalid register class: "
5596 "vdata and vdst should be both VGPR or AGPR";
5597 return false;
5598 }
5599 if (Data && Data2 &&
5600 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5601 ErrInfo = "Invalid register class: "
5602 "both data operands should be VGPR or AGPR";
5603 return false;
5604 }
5605 } else {
5606 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5607 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5608 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5609 ErrInfo = "Invalid register class: "
5610 "agpr loads and stores not supported on this GPU";
5611 return false;
5612 }
5613 }
5614 }
5615
5616 if (ST.needsAlignedVGPRs()) {
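// Subtargets such as gfx90a require wide VGPR operands of some instructions to
// start at an even register; the helper below checks that property for one
// named operand, treating a missing operand as trivially aligned.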
5617 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5618 const MachineOperand *Op = getNamedOperand(MI, OpName);
5619 if (!Op)
5620 return true;
5621 Register Reg = Op->getReg();
5622 if (Reg.isPhysical())
5623 return !(RI.getHWRegIndex(Reg) & 1);
5624 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5625 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5626 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5627 };
5628
5629 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5630 Opcode == AMDGPU::DS_GWS_BARRIER) {
5631
5632 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5633 ErrInfo = "Subtarget requires even aligned vector registers "
5634 "for DS_GWS instructions";
5635 return false;
5636 }
5637 }
5638
5639 if (isMIMG(MI)) {
5640 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5641 ErrInfo = "Subtarget requires even aligned vector registers "
5642 "for vaddr operand of image instructions";
5643 return false;
5644 }
5645 }
5646 }
5647
5648 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5649 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5650 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5651 ErrInfo = "Invalid register class: "
5652 "v_accvgpr_write with an SGPR is not supported on this GPU";
5653 return false;
5654 }
5655 }
5656
5657 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5658 const MachineOperand &SrcOp = MI.getOperand(1);
5659 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5660 ErrInfo = "pseudo expects only physical SGPRs";
5661 return false;
5662 }
5663 }
5664
5665 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5666 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5667 if (!ST.hasScaleOffset()) {
5668 ErrInfo = "Subtarget does not support offset scaling";
5669 return false;
5670 }
5671 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5672 ErrInfo = "Instruction does not support offset scaling";
5673 return false;
5674 }
5675 }
5676 }
5677
5678 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5679 // information.
5680 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5681 for (unsigned I = 0; I < 3; ++I) {
5682 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
5683 return false;
5684 }
5685 }
5686
5687 return true;
5688}
5689
5690// It is more readable to list mapped opcodes on the same line.
5691// clang-format off
5692
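// Maps a scalar (SALU) opcode to the VALU opcode used when the instruction has
// to be moved to the vector unit, or INSTRUCTION_LIST_END if there is no
// direct vector equivalent.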
5693 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5694 switch (MI.getOpcode()) {
5695 default: return AMDGPU::INSTRUCTION_LIST_END;
5696 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5697 case AMDGPU::COPY: return AMDGPU::COPY;
5698 case AMDGPU::PHI: return AMDGPU::PHI;
5699 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5700 case AMDGPU::WQM: return AMDGPU::WQM;
5701 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5702 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5703 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5704 case AMDGPU::S_MOV_B32: {
5705 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5706 return MI.getOperand(1).isReg() ||
5707 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5708 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5709 }
5710 case AMDGPU::S_ADD_I32:
5711 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5712 case AMDGPU::S_ADDC_U32:
5713 return AMDGPU::V_ADDC_U32_e32;
5714 case AMDGPU::S_SUB_I32:
5715 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5716 // FIXME: These are not consistently handled, and selected when the carry is
5717 // used.
5718 case AMDGPU::S_ADD_U32:
5719 return AMDGPU::V_ADD_CO_U32_e32;
5720 case AMDGPU::S_SUB_U32:
5721 return AMDGPU::V_SUB_CO_U32_e32;
5722 case AMDGPU::S_ADD_U64_PSEUDO:
5723 return AMDGPU::V_ADD_U64_PSEUDO;
5724 case AMDGPU::S_SUB_U64_PSEUDO:
5725 return AMDGPU::V_SUB_U64_PSEUDO;
5726 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5727 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5728 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5729 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5730 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5731 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5732 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5733 case AMDGPU::S_XNOR_B32:
5734 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5735 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5736 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5737 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5738 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5739 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5740 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5741 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5742 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5743 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5744 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5745 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5746 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5747 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5748 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5749 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5750 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5751 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5752 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5753 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5754 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5755 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5756 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5757 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5758 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5759 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5760 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5761 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5762 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5763 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5764 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5765 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5766 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5767 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5768 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5769 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5770 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5771 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5772 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5773 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5774 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5775 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5776 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5777 case AMDGPU::S_CVT_F32_F16:
5778 case AMDGPU::S_CVT_HI_F32_F16:
5779 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5780 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5781 case AMDGPU::S_CVT_F16_F32:
5782 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5783 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5784 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5785 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5786 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5787 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5788 case AMDGPU::S_CEIL_F16:
5789 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5790 : AMDGPU::V_CEIL_F16_fake16_e64;
5791 case AMDGPU::S_FLOOR_F16:
5792 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5793 : AMDGPU::V_FLOOR_F16_fake16_e64;
5794 case AMDGPU::S_TRUNC_F16:
5795 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
5796 : AMDGPU::V_TRUNC_F16_fake16_e64;
5797 case AMDGPU::S_RNDNE_F16:
5798 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
5799 : AMDGPU::V_RNDNE_F16_fake16_e64;
5800 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5801 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5802 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5803 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5804 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5805 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5806 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5807 case AMDGPU::S_ADD_F16:
5808 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
5809 : AMDGPU::V_ADD_F16_fake16_e64;
5810 case AMDGPU::S_SUB_F16:
5811 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
5812 : AMDGPU::V_SUB_F16_fake16_e64;
5813 case AMDGPU::S_MIN_F16:
5814 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
5815 : AMDGPU::V_MIN_F16_fake16_e64;
5816 case AMDGPU::S_MAX_F16:
5817 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
5818 : AMDGPU::V_MAX_F16_fake16_e64;
5819 case AMDGPU::S_MINIMUM_F16:
5820 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
5821 : AMDGPU::V_MINIMUM_F16_fake16_e64;
5822 case AMDGPU::S_MAXIMUM_F16:
5823 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
5824 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
5825 case AMDGPU::S_MUL_F16:
5826 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
5827 : AMDGPU::V_MUL_F16_fake16_e64;
5828 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5829 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5830 case AMDGPU::S_FMAC_F16:
5831 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
5832 : AMDGPU::V_FMAC_F16_fake16_e64;
5833 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5834 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5835 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5836 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5837 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5838 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5839 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5840 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5841 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5842 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5843 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5844 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5845 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5846 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5847 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5848 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5849 case AMDGPU::S_CMP_LT_F16:
5850 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
5851 : AMDGPU::V_CMP_LT_F16_fake16_e64;
5852 case AMDGPU::S_CMP_EQ_F16:
5853 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
5854 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
5855 case AMDGPU::S_CMP_LE_F16:
5856 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
5857 : AMDGPU::V_CMP_LE_F16_fake16_e64;
5858 case AMDGPU::S_CMP_GT_F16:
5859 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
5860 : AMDGPU::V_CMP_GT_F16_fake16_e64;
5861 case AMDGPU::S_CMP_LG_F16:
5862 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
5863 : AMDGPU::V_CMP_LG_F16_fake16_e64;
5864 case AMDGPU::S_CMP_GE_F16:
5865 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
5866 : AMDGPU::V_CMP_GE_F16_fake16_e64;
5867 case AMDGPU::S_CMP_O_F16:
5868 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
5869 : AMDGPU::V_CMP_O_F16_fake16_e64;
5870 case AMDGPU::S_CMP_U_F16:
5871 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
5872 : AMDGPU::V_CMP_U_F16_fake16_e64;
5873 case AMDGPU::S_CMP_NGE_F16:
5874 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
5875 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
5876 case AMDGPU::S_CMP_NLG_F16:
5877 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
5878 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
5879 case AMDGPU::S_CMP_NGT_F16:
5880 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
5881 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
5882 case AMDGPU::S_CMP_NLE_F16:
5883 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
5884 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
5885 case AMDGPU::S_CMP_NEQ_F16:
5886 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
5887 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
5888 case AMDGPU::S_CMP_NLT_F16:
5889 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
5890 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
5891 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5892 case AMDGPU::V_S_EXP_F16_e64:
5893 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
5894 : AMDGPU::V_EXP_F16_fake16_e64;
5895 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5896 case AMDGPU::V_S_LOG_F16_e64:
5897 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
5898 : AMDGPU::V_LOG_F16_fake16_e64;
5899 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5900 case AMDGPU::V_S_RCP_F16_e64:
5901 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
5902 : AMDGPU::V_RCP_F16_fake16_e64;
5903 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5904 case AMDGPU::V_S_RSQ_F16_e64:
5905 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
5906 : AMDGPU::V_RSQ_F16_fake16_e64;
5907 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5908 case AMDGPU::V_S_SQRT_F16_e64:
5909 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
5910 : AMDGPU::V_SQRT_F16_fake16_e64;
5911 }
5912 llvm_unreachable(
5913 "Unexpected scalar opcode without corresponding vector one!");
5914}
5915
5916// clang-format on
5917
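// Saves the current exec mask into Reg and then enables all lanes. When SCC is
// live, two S_MOV instructions are used instead of S_OR_SAVEEXEC so that SCC is
// not clobbered.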
5918 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5919 MachineBasicBlock &MBB,
5920 MachineBasicBlock::iterator MBBI,
5921 const DebugLoc &DL, Register Reg,
5922 bool IsSCCLive,
5923 SlotIndexes *Indexes) const {
5924 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5925 const SIInstrInfo *TII = ST.getInstrInfo();
5926 bool IsWave32 = ST.isWave32();
5927 if (IsSCCLive) {
5928 // Insert two move instructions, one to save the original value of EXEC and
5929 // the other to turn on all bits in EXEC. This is required as we can't use
5930 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
5931 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5932 MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5933 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
5934 .addReg(Exec, RegState::Kill);
5935 auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
5936 if (Indexes) {
5937 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5938 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5939 }
5940 } else {
5941 const unsigned OrSaveExec =
5942 IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
5943 auto SaveExec =
5944 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
5945 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5946 if (Indexes)
5947 Indexes->insertMachineInstrInMaps(*SaveExec);
5948 }
5949}
5950
5951 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
5952 MachineBasicBlock::iterator MBBI,
5953 const DebugLoc &DL, Register Reg,
5954 SlotIndexes *Indexes) const {
5955 unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5956 MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5957 auto ExecRestoreMI =
5958 BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
5959 if (Indexes)
5960 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
5961}
5962
5963 MachineInstr *
5964 SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const {
5965 assert(MF.getInfo<SIMachineFunctionInfo>()->isWholeWaveFunction() &&
5966 "Not a whole wave func");
5967 MachineBasicBlock &MBB = *MF.begin();
5968 for (MachineInstr &MI : MBB)
5969 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
5970 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
5971 return &MI;
5972
5973 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
5974}
5975
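// For memory and image instructions, operands declared with a combined AV_*
// (VGPR-or-AGPR) register class are narrowed to the plain VGPR class, either
// because an allocatable class is required or because the subtarget cannot use
// AGPRs for such operands.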
5976static const TargetRegisterClass *
5977 adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
5978 const MCInstrDesc &TID, unsigned RCID,
5979 bool IsAllocatable) {
5980 if ((IsAllocatable || !ST.hasGFX90AInsts()) &&
5981 (((TID.mayLoad() || TID.mayStore()) &&
5982 !(TID.TSFlags & SIInstrFlags::Spill)) ||
5983 (TID.TSFlags & SIInstrFlags::MIMG))) {
5984 switch (RCID) {
5985 case AMDGPU::AV_32RegClassID:
5986 RCID = AMDGPU::VGPR_32RegClassID;
5987 break;
5988 case AMDGPU::AV_64RegClassID:
5989 RCID = AMDGPU::VReg_64RegClassID;
5990 break;
5991 case AMDGPU::AV_96RegClassID:
5992 RCID = AMDGPU::VReg_96RegClassID;
5993 break;
5994 case AMDGPU::AV_128RegClassID:
5995 RCID = AMDGPU::VReg_128RegClassID;
5996 break;
5997 case AMDGPU::AV_160RegClassID:
5998 RCID = AMDGPU::VReg_160RegClassID;
5999 break;
6000 case AMDGPU::AV_512RegClassID:
6001 RCID = AMDGPU::VReg_512RegClassID;
6002 break;
6003 default:
6004 break;
6005 }
6006 }
6007
6008 return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
6009}
6010
6011 const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
6012 unsigned OpNum, const TargetRegisterInfo *TRI,
6013 const MachineFunction &MF)
6014 const {
6015 if (OpNum >= TID.getNumOperands())
6016 return nullptr;
6017 auto RegClass = TID.operands()[OpNum].RegClass;
6018 if (TID.getOpcode() == AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
6019 // Special pseudos have no alignment requirement
6020 return RI.getRegClass(RegClass);
6021 }
6022
6023 return adjustAllocatableRegClass(ST, RI, TID, RegClass, false);
6024}
6025
6026 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6027 unsigned OpNo) const {
6028 const MCInstrDesc &Desc = get(MI.getOpcode());
6029 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6030 Desc.operands()[OpNo].RegClass == -1) {
6031 Register Reg = MI.getOperand(OpNo).getReg();
6032
6033 if (Reg.isVirtual()) {
6034 const MachineRegisterInfo &MRI =
6035 MI.getParent()->getParent()->getRegInfo();
6036 return MRI.getRegClass(Reg);
6037 }
6038 return RI.getPhysRegBaseClass(Reg);
6039 }
6040
6041 unsigned RCID = Desc.operands()[OpNo].RegClass;
6042 return adjustAllocatableRegClass(ST, RI, Desc, RCID, true);
6043}
6044
6045 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
6046 MachineBasicBlock::iterator I = MI;
6047 MachineBasicBlock *MBB = MI.getParent();
6048 MachineOperand &MO = MI.getOperand(OpIdx);
6049 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6050 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
6051 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6052 unsigned Size = RI.getRegSizeInBits(*RC);
6053 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6054 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6055 : AMDGPU::V_MOV_B32_e32;
6056 if (MO.isReg())
6057 Opcode = AMDGPU::COPY;
6058 else if (RI.isSGPRClass(RC))
6059 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6060
6061 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6062 Register Reg = MRI.createVirtualRegister(VRC);
6063 DebugLoc DL = MBB->findDebugLoc(I);
6064 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6065 MO.ChangeToRegister(Reg, false);
6066}
6067
6068 Register SIInstrInfo::buildExtractSubReg(
6069 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
6070 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6071 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6072 if (!SuperReg.getReg().isVirtual())
6073 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6074
6075 MachineBasicBlock *MBB = MI->getParent();
6076 const DebugLoc &DL = MI->getDebugLoc();
6077 Register SubReg = MRI.createVirtualRegister(SubRC);
6078
6079 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6080 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6081 .addReg(SuperReg.getReg(), 0, NewSubIdx);
6082 return SubReg;
6083}
6084
6085 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
6086 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
6087 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6088 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6089 if (Op.isImm()) {
6090 if (SubIdx == AMDGPU::sub0)
6091 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6092 if (SubIdx == AMDGPU::sub1)
6093 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6094
6095 llvm_unreachable("Unhandled register index for immediate");
6096 }
6097
6098 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6099 SubIdx, SubRC);
6100 return MachineOperand::CreateReg(SubReg, false);
6101}
6102
6103// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6104void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6105 assert(Inst.getNumExplicitOperands() == 3);
6106 MachineOperand Op1 = Inst.getOperand(1);
6107 Inst.removeOperand(1);
6108 Inst.addOperand(Op1);
6109}
6110
6111 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6112 const MCOperandInfo &OpInfo,
6113 const MachineOperand &MO) const {
6114 if (!MO.isReg())
6115 return false;
6116
6117 Register Reg = MO.getReg();
6118
6119 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
6120 if (Reg.isPhysical())
6121 return DRC->contains(Reg);
6122
6123 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6124
6125 if (MO.getSubReg()) {
6126 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
6127 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6128 if (!SuperRC)
6129 return false;
6130
6131 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
6132 if (!DRC)
6133 return false;
6134 }
6135 return RC->hasSuperClassEq(DRC);
6136}
6137
6138 bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6139 const MachineOperand &MO) const {
6140 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6141 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6142 unsigned Opc = MI.getOpcode();
6143
6144 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6145 // information.
6146 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6147 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6148 constexpr const AMDGPU::OpName OpNames[] = {
6149 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6150
6151 for (auto [I, OpName] : enumerate(OpNames)) {
6152 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6153 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6154 !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
6155 return false;
6156 }
6157 }
6158
6159 if (!isLegalRegOperand(MRI, OpInfo, MO))
6160 return false;
6161
6162 // check Accumulate GPR operand
6163 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6164 if (IsAGPR && !ST.hasMAIInsts())
6165 return false;
6166 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6167 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6168 return false;
6169 // Atomics should have both vdst and vdata either vgpr or agpr.
6170 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6171 const int DataIdx = AMDGPU::getNamedOperandIdx(
6172 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6173 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6174 MI.getOperand(DataIdx).isReg() &&
6175 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6176 return false;
6177 if ((int)OpIdx == DataIdx) {
6178 if (VDstIdx != -1 &&
6179 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6180 return false;
6181 // DS instructions with 2 src operands also must have tied RC.
6182 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6183 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6184 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6185 return false;
6186 }
6187
6188 // Check V_ACCVGPR_WRITE_B32_e64
6189 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6190 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6191 RI.isSGPRReg(MRI, MO.getReg()))
6192 return false;
6193 return true;
6194}
6195
6196 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6197 const MCOperandInfo &OpInfo,
6198 const MachineOperand &MO) const {
6199 if (MO.isReg())
6200 return isLegalRegOperand(MRI, OpInfo, MO);
6201
6202 // Handle non-register types that are treated like immediates.
6203 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6204 return true;
6205}
6206
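// On gfx12+, a packed FP32 math instruction may read an SGPR for a source only
// if neither op_sel nor op_sel_hi is set for that source; otherwise the value
// has to be copied into a VGPR first.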
6207 bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6208 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6209 const MachineOperand *MO) const {
6210 constexpr const unsigned NumOps = 3;
6211 constexpr const AMDGPU::OpName OpNames[NumOps * 2] = {
6212 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6213 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6214 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6215
6216 assert(SrcN < NumOps);
6217
6218 if (!MO) {
6219 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6220 if (SrcIdx == -1)
6221 return true;
6222 MO = &MI.getOperand(SrcIdx);
6223 }
6224
6225 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6226 return true;
6227
6228 int ModsIdx =
6229 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6230 if (ModsIdx == -1)
6231 return true;
6232
6233 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6234 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6235 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6236
6237 return !OpSel && !OpSelHi;
6238}
6239
6240 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6241 const MachineOperand *MO) const {
6242 const MachineFunction &MF = *MI.getParent()->getParent();
6243 const MachineRegisterInfo &MRI = MF.getRegInfo();
6244 const MCInstrDesc &InstDesc = MI.getDesc();
6245 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6246 const TargetRegisterClass *DefinedRC =
6247 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
6248 if (!MO)
6249 MO = &MI.getOperand(OpIdx);
6250
6251 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6252
6253 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6254 const MachineOperand *UsedLiteral = nullptr;
6255
6256 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6257 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6258
6259 // TODO: Be more permissive with frame indexes.
6260 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6261 if (!LiteralLimit--)
6262 return false;
6263
6264 UsedLiteral = MO;
6265 }
6266
6267 SmallDenseSet<RegSubRegPair> SGPRsUsed;
6268 if (MO->isReg())
6269 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6270
6271 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6272 if (i == OpIdx)
6273 continue;
6274 const MachineOperand &Op = MI.getOperand(i);
6275 if (Op.isReg()) {
6276 if (Op.isUse()) {
6277 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6278 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6279 if (--ConstantBusLimit <= 0)
6280 return false;
6281 }
6282 }
6283 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6284 !isInlineConstant(Op, InstDesc.operands()[i])) {
6285 // The same literal may be used multiple times.
6286 if (!UsedLiteral)
6287 UsedLiteral = &Op;
6288 else if (UsedLiteral->isIdenticalTo(Op))
6289 continue;
6290
6291 if (!LiteralLimit--)
6292 return false;
6293 if (--ConstantBusLimit <= 0)
6294 return false;
6295 }
6296 }
6297 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6298 // There can be at most one literal operand, but it can be repeated.
6299 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6300 if (i == OpIdx)
6301 continue;
6302 const MachineOperand &Op = MI.getOperand(i);
6303 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6304 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6305 !Op.isIdenticalTo(*MO))
6306 return false;
6307
6308 // Do not fold a non-inlineable and non-register operand into an
6309 // instruction that already has a frame index. The frame index handling
6310 // code could not handle well when a frame index co-exists with another
6311 // non-register operand, unless that operand is an inlineable immediate.
6312 if (Op.isFI())
6313 return false;
6314 }
6315 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6316 isF16PseudoScalarTrans(MI.getOpcode())) {
6317 return false;
6318 }
6319
6320 if (MO->isReg()) {
6321 if (!DefinedRC)
6322 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6323 return isLegalRegOperand(MI, OpIdx, *MO);
6324 }
6325
6326 if (MO->isImm()) {
6327 uint64_t Imm = MO->getImm();
6328 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6329 bool Is64BitOp = Is64BitFPOp ||
6330 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6331 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6332 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6333 if (Is64BitOp &&
6334 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6335 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6336 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6337 return false;
6338
6339 // FIXME: We can use sign extended 64-bit literals, but only for signed
6340 // operands. At the moment we do not know if an operand is signed.
6341 // Such operand will be encoded as its low 32 bits and then either
6342 // correctly sign extended or incorrectly zero extended by HW.
6343 // If 64-bit literals are supported and the literal will be encoded
6344 // as full 64 bit we still can use it.
6345 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6346 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6347 return false;
6348 }
6349 }
6350
6351 // Handle non-register types that are treated like immediates.
6352 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6353
6354 if (!DefinedRC) {
6355 // This operand expects an immediate.
6356 return true;
6357 }
6358
6359 return isImmOperandLegal(MI, OpIdx, *MO);
6360}
6361
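// Legalizes the operands of a VOP2 instruction: enforces the constant-bus
// limit, rejects AGPR sources, gives the lane instructions SGPR inputs via
// V_READFIRSTLANE_B32, and otherwise fixes an illegal src1 by commuting with
// src0 or inserting a move.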
6362 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6363 MachineInstr &MI) const {
6364 unsigned Opc = MI.getOpcode();
6365 const MCInstrDesc &InstrDesc = get(Opc);
6366
6367 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6368 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6369
6370 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6371 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6372
6373 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6374 // we need to only have one constant bus use before GFX10.
6375 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6376 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6377 RI.isSGPRReg(MRI, Src0.getReg()))
6378 legalizeOpWithMove(MI, Src0Idx);
6379
6380 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6381 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6382 // src0/src1 with V_READFIRSTLANE.
6383 if (Opc == AMDGPU::V_WRITELANE_B32) {
6384 const DebugLoc &DL = MI.getDebugLoc();
6385 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6386 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6387 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6388 .add(Src0);
6389 Src0.ChangeToRegister(Reg, false);
6390 }
6391 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6392 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6393 const DebugLoc &DL = MI.getDebugLoc();
6394 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6395 .add(Src1);
6396 Src1.ChangeToRegister(Reg, false);
6397 }
6398 return;
6399 }
6400
6401 // No VOP2 instructions support AGPRs.
6402 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
6403 legalizeOpWithMove(MI, Src0Idx);
6404
6405 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
6406 legalizeOpWithMove(MI, Src1Idx);
6407
6408 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6409 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6410 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6411 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6412 legalizeOpWithMove(MI, Src2Idx);
6413 }
6414
6415 // VOP2 src0 instructions support all operand types, so we don't need to check
6416 // their legality. If src1 is already legal, we don't need to do anything.
6417 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6418 return;
6419
6420 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6421 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6422 // select is uniform.
6423 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6424 RI.isVGPR(MRI, Src1.getReg())) {
6425 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6426 const DebugLoc &DL = MI.getDebugLoc();
6427 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6428 .add(Src1);
6429 Src1.ChangeToRegister(Reg, false);
6430 return;
6431 }
6432
6433 // We do not use commuteInstruction here because it is too aggressive and will
6434 // commute if it is possible. We only want to commute here if it improves
6435 // legality. This can be called a fairly large number of times so don't waste
6436 // compile time pointlessly swapping and checking legality again.
6437 if (HasImplicitSGPR || !MI.isCommutable()) {
6438 legalizeOpWithMove(MI, Src1Idx);
6439 return;
6440 }
6441
6442 // If src0 can be used as src1, commuting will make the operands legal.
6443 // Otherwise we have to give up and insert a move.
6444 //
6445 // TODO: Other immediate-like operand kinds could be commuted if there was a
6446 // MachineOperand::ChangeTo* for them.
6447 if ((!Src1.isImm() && !Src1.isReg()) ||
6448 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6449 legalizeOpWithMove(MI, Src1Idx);
6450 return;
6451 }
6452
6453 int CommutedOpc = commuteOpcode(MI);
6454 if (CommutedOpc == -1) {
6455 legalizeOpWithMove(MI, Src1Idx);
6456 return;
6457 }
6458
6459 MI.setDesc(get(CommutedOpc));
6460
6461 Register Src0Reg = Src0.getReg();
6462 unsigned Src0SubReg = Src0.getSubReg();
6463 bool Src0Kill = Src0.isKill();
6464
6465 if (Src1.isImm())
6466 Src0.ChangeToImmediate(Src1.getImm());
6467 else if (Src1.isReg()) {
6468 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6469 Src0.setSubReg(Src1.getSubReg());
6470 } else
6471 llvm_unreachable("Should only have register or immediate operands");
6472
6473 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6474 Src1.setSubReg(Src0SubReg);
6476}
6477
6478// Legalize VOP3 operands. All operand types are supported for any operand
6479// but only one literal constant and only starting from GFX10.
6480 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6481 MachineInstr &MI) const {
6482 unsigned Opc = MI.getOpcode();
6483
6484 int VOP3Idx[3] = {
6485 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6486 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6487 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6488 };
6489
6490 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6491 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6492 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6493 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6494 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6495 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6496 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6497 // src1 and src2 must be scalar
6498 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6499 const DebugLoc &DL = MI.getDebugLoc();
6500 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6501 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6502 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6503 .add(Src1);
6504 Src1.ChangeToRegister(Reg, false);
6505 }
6506 if (VOP3Idx[2] != -1) {
6507 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6508 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6509 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6510 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6511 .add(Src2);
6512 Src2.ChangeToRegister(Reg, false);
6513 }
6514 }
6515 }
6516
6517 // Find the one SGPR operand we are allowed to use.
6518 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6519 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6520 SmallDenseSet<unsigned> SGPRsUsed;
6521 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6522 if (SGPRReg) {
6523 SGPRsUsed.insert(SGPRReg);
6524 --ConstantBusLimit;
6525 }
6526
6527 for (int Idx : VOP3Idx) {
6528 if (Idx == -1)
6529 break;
6530 MachineOperand &MO = MI.getOperand(Idx);
6531
6532 if (!MO.isReg()) {
6533 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6534 continue;
6535
6536 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6537 --LiteralLimit;
6538 --ConstantBusLimit;
6539 continue;
6540 }
6541
6542 --LiteralLimit;
6543 --ConstantBusLimit;
6544 legalizeOpWithMove(MI, Idx);
6545 continue;
6546 }
6547
6548 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
6549 !isOperandLegal(MI, Idx, &MO)) {
6550 legalizeOpWithMove(MI, Idx);
6551 continue;
6552 }
6553
6554 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6555 continue; // VGPRs are legal
6556
6557 // We can use one SGPR in each VOP3 instruction prior to GFX10
6558 // and two starting from GFX10.
6559 if (SGPRsUsed.count(MO.getReg()))
6560 continue;
6561 if (ConstantBusLimit > 0) {
6562 SGPRsUsed.insert(MO.getReg());
6563 --ConstantBusLimit;
6564 continue;
6565 }
6566
6567 // If we make it this far, then the operand is not legal and we must
6568 // legalize it.
6569 legalizeOpWithMove(MI, Idx);
6570 }
6571
6572 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6573 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6574 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6575 legalizeOpWithMove(MI, VOP3Idx[2]);
6576
6577 if (isWMMA(MI)) {
6578 // scale_src has a register class restricted to low 256 VGPRs, we may need
6579 // to insert a copy to the restricted VGPR class.
6580 int ScaleSrc0Idx =
6581 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::scale_src0);
6582 if (ScaleSrc0Idx != -1) {
6583 int ScaleSrc1Idx =
6584 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::scale_src1);
6585 if (!isOperandLegal(MI, ScaleSrc0Idx))
6586 legalizeOpWithMove(MI, ScaleSrc0Idx);
6587 if (!isOperandLegal(MI, ScaleSrc1Idx))
6588 legalizeOpWithMove(MI, ScaleSrc1Idx);
6589 }
6590 }
6591
6592 // Fix the register class of packed FP32 instructions on gfx12+. See
6593 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6594 if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
6595 for (unsigned I = 0; I < 3; ++I) {
6596 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
6597 legalizeOpWithMove(MI, VOP3Idx[I]);
6598 }
6599 }
6600}
6601
6602 Register SIInstrInfo::readlaneVGPRToSGPR(
6603 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6604 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6605 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6606 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6607 if (DstRC)
6608 SRC = RI.getCommonSubClass(SRC, DstRC);
6609
6610 Register DstReg = MRI.createVirtualRegister(SRC);
6611 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6612
6613 if (RI.hasAGPRs(VRC)) {
6614 VRC = RI.getEquivalentVGPRClass(VRC);
6615 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6616 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6617 get(TargetOpcode::COPY), NewSrcReg)
6618 .addReg(SrcReg);
6619 SrcReg = NewSrcReg;
6620 }
6621
6622 if (SubRegs == 1) {
6623 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6624 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6625 .addReg(SrcReg);
6626 return DstReg;
6627 }
6628
6629 SmallVector<Register, 8> SRegs;
6630 for (unsigned i = 0; i < SubRegs; ++i) {
6631 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6632 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6633 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6634 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6635 SRegs.push_back(SGPR);
6636 }
6637
6638 MachineInstrBuilder MIB =
6639 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6640 get(AMDGPU::REG_SEQUENCE), DstReg);
6641 for (unsigned i = 0; i < SubRegs; ++i) {
6642 MIB.addReg(SRegs[i]);
6643 MIB.addImm(RI.getSubRegFromChannel(i));
6644 }
6645 return DstReg;
6646}
6647
6648 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6649 MachineInstr &MI) const {
6650
6651 // If the pointer is stored in VGPRs, then we need to move it to
6652 // SGPRs using v_readfirstlane. This is safe because we only select
6653 // loads with uniform pointers to SMRD instructions, so we know the
6654 // pointer value is uniform.
6655 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6656 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6657 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6658 SBase->setReg(SGPR);
6659 }
6660 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6661 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6662 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6663 SOff->setReg(SGPR);
6664 }
6665}
6666
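// Tries to rewrite a flat or global instruction whose saddr operand was
// assigned a VGPR into the vaddr form of the same instruction, avoiding a
// readfirstlane; returns true if the instruction was rewritten.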
6667 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6668 unsigned Opc = Inst.getOpcode();
6669 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6670 if (OldSAddrIdx < 0)
6671 return false;
6672
6673 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6674
6675 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6676 if (NewOpc < 0)
6677 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6678 if (NewOpc < 0)
6679 return false;
6680
6681 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6682 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6683 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6684 return false;
6685
6686 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6687 if (NewVAddrIdx < 0)
6688 return false;
6689
6690 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6691
6692 // Check vaddr, it shall be zero or absent.
6693 MachineInstr *VAddrDef = nullptr;
6694 if (OldVAddrIdx >= 0) {
6695 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6696 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6697 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6698 !VAddrDef->getOperand(1).isImm() ||
6699 VAddrDef->getOperand(1).getImm() != 0)
6700 return false;
6701 }
6702
6703 const MCInstrDesc &NewDesc = get(NewOpc);
6704 Inst.setDesc(NewDesc);
6705
6706 // Callers expect iterator to be valid after this call, so modify the
6707 // instruction in place.
6708 if (OldVAddrIdx == NewVAddrIdx) {
6709 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6710 // Clear use list from the old vaddr holding a zero register.
6711 MRI.removeRegOperandFromUseList(&NewVAddr);
6712 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6713 Inst.removeOperand(OldSAddrIdx);
6714 // Update the use list with the pointer we have just moved from vaddr to
6715 // saddr position. Otherwise new vaddr will be missing from the use list.
6716 MRI.removeRegOperandFromUseList(&NewVAddr);
6717 MRI.addRegOperandToUseList(&NewVAddr);
6718 } else {
6719 assert(OldSAddrIdx == NewVAddrIdx);
6720
6721 if (OldVAddrIdx >= 0) {
6722 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6723 AMDGPU::OpName::vdst_in);
6724
6725 // removeOperand doesn't try to fix up tied operand indexes as it goes,
6726 // so it asserts. Untie the operands for now and retie them afterwards.
6727 if (NewVDstIn != -1) {
6728 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6729 Inst.untieRegOperand(OldVDstIn);
6730 }
6731
6732 Inst.removeOperand(OldVAddrIdx);
6733
6734 if (NewVDstIn != -1) {
6735 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6736 Inst.tieOperands(NewVDst, NewVDstIn);
6737 }
6738 }
6739 }
6740
6741 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6742 VAddrDef->eraseFromParent();
6743
6744 return true;
6745}
6746
6747// FIXME: Remove this when SelectionDAG is obsoleted.
6748 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6749 MachineInstr &MI) const {
6750 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
6751 return;
6752
6753 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
6754 // thinks they are uniform, so a readfirstlane should be valid.
6755 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6756 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6757 return;
6758
6759 if (moveFlatAddrToVGPR(MI))
6760 return;
6761
6762 const TargetRegisterClass *DeclaredRC = getRegClass(
6763 MI.getDesc(), SAddr->getOperandNo(), &RI, *MI.getParent()->getParent());
6764
6765 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6766 SAddr->setReg(ToSGPR);
6767}
6768
6769 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6770 MachineBasicBlock::iterator I,
6771 const TargetRegisterClass *DstRC,
6772 MachineOperand &Op,
6773 MachineRegisterInfo &MRI,
6774 const DebugLoc &DL) const {
6775 Register OpReg = Op.getReg();
6776 unsigned OpSubReg = Op.getSubReg();
6777
6778 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6779 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6780
6781 // Check if operand is already the correct register class.
6782 if (DstRC == OpRC)
6783 return;
6784
6785 Register DstReg = MRI.createVirtualRegister(DstRC);
6786 auto Copy =
6787 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6788 Op.setReg(DstReg);
6789
6790 MachineInstr *Def = MRI.getVRegDef(OpReg);
6791 if (!Def)
6792 return;
6793
6794 // Try to eliminate the copy if it is copying an immediate value.
6795 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6796 foldImmediate(*Copy, *Def, OpReg, &MRI);
6797
6798 bool ImpDef = Def->isImplicitDef();
6799 while (!ImpDef && Def && Def->isCopy()) {
6800 if (Def->getOperand(1).getReg().isPhysical())
6801 break;
6802 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6803 ImpDef = Def && Def->isImplicitDef();
6804 }
6805 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6806 !ImpDef)
6807 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6808}
6809
6810// Emit the actual waterfall loop, executing the wrapped instruction for each
6811// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6812// iteration, in the worst case we execute 64 (once per lane).
6813static void
6816 MachineBasicBlock &LoopBB,
6817 MachineBasicBlock &BodyBB,
6818 const DebugLoc &DL,
6819 ArrayRef<MachineOperand *> ScalarOps) {
6820 MachineFunction &MF = *LoopBB.getParent();
6821 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6822 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6823 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6824 unsigned SaveExecOpc =
6825 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
6826 unsigned XorTermOpc =
6827 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
6828 unsigned AndOpc =
6829 ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6830 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6831
6832  MachineBasicBlock::iterator I = LoopBB.begin();
6833  Register CondReg;
6834
6835 for (MachineOperand *ScalarOp : ScalarOps) {
6836 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6837 unsigned NumSubRegs = RegSize / 32;
6838 Register VScalarOp = ScalarOp->getReg();
6839
6840 if (NumSubRegs == 1) {
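      // 32-bit scalar operand: read the value from the first active lane and
      // compare it against the VGPR to find every lane holding the same value.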
6841 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6842
6843 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6844 .addReg(VScalarOp);
6845
6846 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6847
6848 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6849 .addReg(CurReg)
6850 .addReg(VScalarOp);
6851
6852 // Combine the comparison results with AND.
6853 if (!CondReg) // First.
6854 CondReg = NewCondReg;
6855 else { // If not the first, we create an AND.
6856 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6857 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6858 .addReg(CondReg)
6859 .addReg(NewCondReg);
6860 CondReg = AndReg;
6861 }
6862
6863 // Update ScalarOp operand to use the SGPR ScalarOp.
6864 ScalarOp->setReg(CurReg);
6865 ScalarOp->setIsKill();
6866 } else {
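      // Wider scalar operands (64 bits or more): readfirstlane each 32-bit
      // piece, compare 64 bits at a time, and AND the per-piece results into a
      // single condition mask.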
6867 SmallVector<Register, 8> ReadlanePieces;
6868 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6869 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6870 "Unhandled register size");
6871
6872 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6873 Register CurRegLo =
6874 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6875 Register CurRegHi =
6876 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6877
6878 // Read the next variant <- also loop target.
6879 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6880 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6881
6882 // Read the next variant <- also loop target.
6883 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6884 .addReg(VScalarOp, VScalarOpUndef,
6885 TRI->getSubRegFromChannel(Idx + 1));
6886
6887 ReadlanePieces.push_back(CurRegLo);
6888 ReadlanePieces.push_back(CurRegHi);
6889
6890 // Comparison is to be done as 64-bit.
6891 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6892 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6893 .addReg(CurRegLo)
6894 .addImm(AMDGPU::sub0)
6895 .addReg(CurRegHi)
6896 .addImm(AMDGPU::sub1);
6897
6898 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6899 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6900 NewCondReg)
6901 .addReg(CurReg);
6902 if (NumSubRegs <= 2)
6903 Cmp.addReg(VScalarOp);
6904 else
6905 Cmp.addReg(VScalarOp, VScalarOpUndef,
6906 TRI->getSubRegFromChannel(Idx, 2));
6907
6908 // Combine the comparison results with AND.
6909 if (!CondReg) // First.
6910 CondReg = NewCondReg;
6911 else { // If not the first, we create an AND.
6912 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6913 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6914 .addReg(CondReg)
6915 .addReg(NewCondReg);
6916 CondReg = AndReg;
6917 }
6918 } // End for loop.
6919
6920 const auto *SScalarOpRC =
6921 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6922 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6923
6924 // Build scalar ScalarOp.
6925 auto Merge =
6926 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6927 unsigned Channel = 0;
6928 for (Register Piece : ReadlanePieces) {
6929 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6930 }
6931
6932 // Update ScalarOp operand to use the SGPR ScalarOp.
6933 ScalarOp->setReg(SScalarOp);
6934 ScalarOp->setIsKill();
6935 }
6936 }
6937
6938 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6939 MRI.setSimpleHint(SaveExec, CondReg);
6940
6941 // Update EXEC to matching lanes, saving original to SaveExec.
6942 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
6943 .addReg(CondReg, RegState::Kill);
6944
6945 // The original instruction is here; we insert the terminators after it.
6946 I = BodyBB.end();
6947
6948 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6949 BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
6950 .addReg(Exec)
6951 .addReg(SaveExec);
6952
6953 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6954}
6955
6956// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6957// with SGPRs by iterating over all unique values across all lanes.
6958// Returns the loop basic block that now contains \p MI.
6959static MachineBasicBlock *
6960loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
6961                               ArrayRef<MachineOperand *> ScalarOps,
6962                               MachineDominatorTree *MDT,
6963                               MachineBasicBlock::iterator Begin = nullptr,
6964 MachineBasicBlock::iterator End = nullptr) {
6965 MachineBasicBlock &MBB = *MI.getParent();
6966 MachineFunction &MF = *MBB.getParent();
6967 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6968 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6969  MachineRegisterInfo &MRI = MF.getRegInfo();
6970  if (!Begin.isValid())
6971 Begin = &MI;
6972 if (!End.isValid()) {
6973 End = &MI;
6974 ++End;
6975 }
6976 const DebugLoc &DL = MI.getDebugLoc();
6977 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6978 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6979 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6980
6981  // Save SCC. The waterfall loop may overwrite SCC.
6982 Register SaveSCCReg;
6983
6984 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
6985  // rather than doing an unlimited scan everywhere.
6986 bool SCCNotDead =
6987 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
6988 std::numeric_limits<unsigned>::max()) !=
6990 if (SCCNotDead) {
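    // S_CSELECT_B32 1, 0 materializes the current SCC value (1 if set, 0 if
    // clear) into an SGPR so it can be restored after the loop.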
6991 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6992 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
6993 .addImm(1)
6994 .addImm(0);
6995 }
6996
6997 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6998
6999 // Save the EXEC mask
7000 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
7001
7002 // Killed uses in the instruction we are waterfalling around will be
7003 // incorrect due to the added control-flow.
7004  MachineBasicBlock::iterator AfterMI = MI;
7005  ++AfterMI;
7006 for (auto I = Begin; I != AfterMI; I++) {
7007 for (auto &MO : I->all_uses())
7008 MRI.clearKillFlags(MO.getReg());
7009 }
7010
7011 // To insert the loop we need to split the block. Move everything after this
7012 // point to a new block, and insert a new empty block between the two.
7013  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
7014  MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
7015  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7016  MachineFunction::iterator MBBI(MBB);
7017  ++MBBI;
7018
7019 MF.insert(MBBI, LoopBB);
7020 MF.insert(MBBI, BodyBB);
7021 MF.insert(MBBI, RemainderBB);
7022
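  // Wire up the loop: LoopBB falls through into BodyBB, which either branches
  // back to LoopBB (more unique values left) or exits to RemainderBB.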
7023 LoopBB->addSuccessor(BodyBB);
7024 BodyBB->addSuccessor(LoopBB);
7025 BodyBB->addSuccessor(RemainderBB);
7026
7027  // Move the instructions from Begin to MI into BodyBB, and the remainder of
7028  // the block to RemainderBB.
7029 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7030 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7031 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7032
7033 MBB.addSuccessor(LoopBB);
7034
7035 // Update dominators. We know that MBB immediately dominates LoopBB, that
7036 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7037 // RemainderBB. RemainderBB immediately dominates all of the successors
7038 // transferred to it from MBB that MBB used to properly dominate.
7039 if (MDT) {
7040 MDT->addNewBlock(LoopBB, &MBB);
7041 MDT->addNewBlock(BodyBB, LoopBB);
7042 MDT->addNewBlock(RemainderBB, BodyBB);
7043 for (auto &Succ : RemainderBB->successors()) {
7044 if (MDT->properlyDominates(&MBB, Succ)) {
7045 MDT->changeImmediateDominator(Succ, RemainderBB);
7046 }
7047 }
7048 }
7049
7050 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
7051
7052 MachineBasicBlock::iterator First = RemainderBB->begin();
7053 // Restore SCC
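  // S_CMP_LG_U32 SaveSCCReg, 0 sets SCC back to the value saved above.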
7054 if (SCCNotDead) {
7055 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7056 .addReg(SaveSCCReg, RegState::Kill)
7057 .addImm(0);
7058 }
7059
7060 // Restore the EXEC mask
7061 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
7062 return BodyBB;
7063}
7064
7065// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
7066static std::tuple<unsigned, unsigned>
7068 MachineBasicBlock &MBB = *MI.getParent();
7069 MachineFunction &MF = *MBB.getParent();
7070  MachineRegisterInfo &MRI = MF.getRegInfo();
7071
7072 // Extract the ptr from the resource descriptor.
7073 unsigned RsrcPtr =
7074 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7075 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7076
7077 // Create an empty resource descriptor
7078 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7079 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7080 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7081 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7082 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7083
7084 // Zero64 = 0
7085 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7086 .addImm(0);
7087
7088 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7089 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7090 .addImm(Lo_32(RsrcDataFormat));
7091
7092 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7093 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7094 .addImm(Hi_32(RsrcDataFormat));
7095
7096 // NewSRsrc = {Zero64, SRsrcFormat}
7097 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7098 .addReg(Zero64)
7099 .addImm(AMDGPU::sub0_sub1)
7100 .addReg(SRsrcFormatLo)
7101 .addImm(AMDGPU::sub2)
7102 .addReg(SRsrcFormatHi)
7103 .addImm(AMDGPU::sub3);
7104
7105 return std::tuple(RsrcPtr, NewSRsrc);
7106}
7107
7108MachineBasicBlock *
7109SIInstrInfo::legalizeOperands(MachineInstr &MI,
7110                              MachineDominatorTree *MDT) const {
7111 MachineFunction &MF = *MI.getParent()->getParent();
7112  MachineRegisterInfo &MRI = MF.getRegInfo();
7113  MachineBasicBlock *CreatedBB = nullptr;
7114
7115 // Legalize VOP2
7116 if (isVOP2(MI) || isVOPC(MI)) {
7117    legalizeOperandsVOP2(MRI, MI);
7118    return CreatedBB;
7119 }
7120
7121 // Legalize VOP3
7122 if (isVOP3(MI)) {
7123    legalizeOperandsVOP3(MRI, MI);
7124    return CreatedBB;
7125 }
7126
7127 // Legalize SMRD
7128 if (isSMRD(MI)) {
7129    legalizeOperandsSMRD(MRI, MI);
7130    return CreatedBB;
7131 }
7132
7133 // Legalize FLAT
7134 if (isFLAT(MI)) {
7135    legalizeOperandsFLAT(MRI, MI);
7136    return CreatedBB;
7137 }
7138
7139 // Legalize REG_SEQUENCE and PHI
7140  // The register class of the operands must be the same type as the register
7141 // class of the output.
7142 if (MI.getOpcode() == AMDGPU::PHI) {
7143 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
7144 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
7145 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
7146 continue;
7147 const TargetRegisterClass *OpRC =
7148 MRI.getRegClass(MI.getOperand(i).getReg());
7149 if (RI.hasVectorRegisters(OpRC)) {
7150 VRC = OpRC;
7151 } else {
7152 SRC = OpRC;
7153 }
7154 }
7155
7156    // If any of the operands are VGPR registers, then they all must be;
7157 // otherwise we will create illegal VGPR->SGPR copies when legalizing
7158 // them.
7159 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
7160 if (!VRC) {
7161 assert(SRC);
7162 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
7163 VRC = &AMDGPU::VReg_1RegClass;
7164 } else
7165 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7166 ? RI.getEquivalentAGPRClass(SRC)
7167 : RI.getEquivalentVGPRClass(SRC);
7168 } else {
7169 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7170 ? RI.getEquivalentAGPRClass(VRC)
7171 : RI.getEquivalentVGPRClass(VRC);
7172 }
7173 RC = VRC;
7174 } else {
7175 RC = SRC;
7176 }
7177
7178 // Update all the operands so they have the same type.
7179 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7180 MachineOperand &Op = MI.getOperand(I);
7181 if (!Op.isReg() || !Op.getReg().isVirtual())
7182 continue;
7183
7184 // MI is a PHI instruction.
7185 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7186      MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
7187
7188 // Avoid creating no-op copies with the same src and dst reg class. These
7189 // confuse some of the machine passes.
7190 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
7191 }
7192 }
7193
7194 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7195 // VGPR dest type and SGPR sources, insert copies so all operands are
7196 // VGPRs. This seems to help operand folding / the register coalescer.
7197 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7198 MachineBasicBlock *MBB = MI.getParent();
7199 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7200 if (RI.hasVGPRs(DstRC)) {
7201 // Update all the operands so they are VGPR register classes. These may
7202 // not be the same register class because REG_SEQUENCE supports mixing
7203 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7204 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7205 MachineOperand &Op = MI.getOperand(I);
7206 if (!Op.isReg() || !Op.getReg().isVirtual())
7207 continue;
7208
7209 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7210 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7211 if (VRC == OpRC)
7212 continue;
7213
7214 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7215 Op.setIsKill();
7216 }
7217 }
7218
7219 return CreatedBB;
7220 }
7221
7222 // Legalize INSERT_SUBREG
7223 // src0 must have the same register class as dst
7224 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7225 Register Dst = MI.getOperand(0).getReg();
7226 Register Src0 = MI.getOperand(1).getReg();
7227 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7228 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7229 if (DstRC != Src0RC) {
7230 MachineBasicBlock *MBB = MI.getParent();
7231 MachineOperand &Op = MI.getOperand(1);
7232 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7233 }
7234 return CreatedBB;
7235 }
7236
7237 // Legalize SI_INIT_M0
7238 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7239 MachineOperand &Src = MI.getOperand(0);
7240 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7241 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7242 return CreatedBB;
7243 }
7244
7245 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7246 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7247 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7248 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7249 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7250 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7251 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7252 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7253 MachineOperand &Src = MI.getOperand(1);
7254 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7255 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7256 return CreatedBB;
7257 }
7258
7259 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7260 //
7261 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7262 // scratch memory access. In both cases, the legalization never involves
7263 // conversion to the addr64 form.
7264  if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
7265                      (isMUBUF(MI) || isMTBUF(MI)))) {
7266 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7267 ? AMDGPU::OpName::rsrc
7268 : AMDGPU::OpName::srsrc;
7269 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7270 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7271 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7272
7273 AMDGPU::OpName SampOpName =
7274 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7275 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7276 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7277 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7278
7279 return CreatedBB;
7280 }
7281
7282 // Legalize SI_CALL
7283 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7284 MachineOperand *Dest = &MI.getOperand(0);
7285 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7286      // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, along with
7287      // the following copies; we also need to move copies from and to physical
7288      // registers into the loop block.
7289 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7290 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7291
7292 // Also move the copies to physical registers into the loop block
7293 MachineBasicBlock &MBB = *MI.getParent();
7294      MachineBasicBlock::iterator Start(&MI);
7295      while (Start->getOpcode() != FrameSetupOpcode)
7296 --Start;
7297      MachineBasicBlock::iterator End(&MI);
7298      while (End->getOpcode() != FrameDestroyOpcode)
7299 ++End;
7300 // Also include following copies of the return value
7301 ++End;
7302 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7303 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7304 ++End;
7305 CreatedBB =
7306 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7307 }
7308 }
7309
7310 // Legalize s_sleep_var.
7311 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7312 const DebugLoc &DL = MI.getDebugLoc();
7313 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7314 int Src0Idx =
7315 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7316 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7317 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7318 .add(Src0);
7319 Src0.ChangeToRegister(Reg, false);
7320 return nullptr;
7321 }
7322
7323 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7324 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7325 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7326 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7327 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7328 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7329 for (MachineOperand &Src : MI.explicit_operands()) {
7330 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7331 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7332 }
7333 return CreatedBB;
7334 }
7335
7336 // Legalize MUBUF instructions.
7337 bool isSoffsetLegal = true;
7338 int SoffsetIdx =
7339 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7340 if (SoffsetIdx != -1) {
7341 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7342 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7343 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7344 isSoffsetLegal = false;
7345 }
7346 }
7347
7348 bool isRsrcLegal = true;
7349 int RsrcIdx =
7350 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7351 if (RsrcIdx != -1) {
7352 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7353 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7354 isRsrcLegal = false;
7355 }
7356
7357 // The operands are legal.
7358 if (isRsrcLegal && isSoffsetLegal)
7359 return CreatedBB;
7360
7361 if (!isRsrcLegal) {
7362 // Legalize a VGPR Rsrc
7363 //
7364 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7365 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7366 // a zero-value SRsrc.
7367 //
7368 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7369 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7370 // above.
7371 //
7372 // Otherwise we are on non-ADDR64 hardware, and/or we have
7373 // idxen/offen/bothen and we fall back to a waterfall loop.
7374
7375 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7376 MachineBasicBlock &MBB = *MI.getParent();
7377
7378 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7379 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7380 // This is already an ADDR64 instruction so we need to add the pointer
7381 // extracted from the resource descriptor to the current value of VAddr.
7382 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7383 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7384 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7385
7386 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7387 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7388 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7389
7390 unsigned RsrcPtr, NewSRsrc;
7391 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7392
7393 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7394 const DebugLoc &DL = MI.getDebugLoc();
7395 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7396 .addDef(CondReg0)
7397 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7398 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
7399 .addImm(0);
7400
7401 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7402 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7403 .addDef(CondReg1, RegState::Dead)
7404 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7405 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
7406 .addReg(CondReg0, RegState::Kill)
7407 .addImm(0);
7408
7409 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7410 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7411 .addReg(NewVAddrLo)
7412 .addImm(AMDGPU::sub0)
7413 .addReg(NewVAddrHi)
7414 .addImm(AMDGPU::sub1);
7415
7416 VAddr->setReg(NewVAddr);
7417 Rsrc->setReg(NewSRsrc);
7418 } else if (!VAddr && ST.hasAddr64()) {
7419      // This instruction is the _OFFSET variant, so we need to convert it to
7420 // ADDR64.
7421 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7422 "FIXME: Need to emit flat atomics here");
7423
7424 unsigned RsrcPtr, NewSRsrc;
7425 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7426
7427 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7428 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7429 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7430 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7431 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7432
7433 // Atomics with return have an additional tied operand and are
7434 // missing some of the special bits.
7435 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7436 MachineInstr *Addr64;
7437
7438 if (!VDataIn) {
7439 // Regular buffer load / store.
7440        MachineInstrBuilder MIB =
7441            BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7442 .add(*VData)
7443 .addReg(NewVAddr)
7444 .addReg(NewSRsrc)
7445 .add(*SOffset)
7446 .add(*Offset);
7447
7448 if (const MachineOperand *CPol =
7449 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7450 MIB.addImm(CPol->getImm());
7451 }
7452
7453 if (const MachineOperand *TFE =
7454 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7455 MIB.addImm(TFE->getImm());
7456 }
7457
7458 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7459
7460 MIB.cloneMemRefs(MI);
7461 Addr64 = MIB;
7462 } else {
7463 // Atomics with return.
7464 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7465 .add(*VData)
7466 .add(*VDataIn)
7467 .addReg(NewVAddr)
7468 .addReg(NewSRsrc)
7469 .add(*SOffset)
7470 .add(*Offset)
7471 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7472 .cloneMemRefs(MI);
7473 }
7474
7475 MI.removeFromParent();
7476
7477 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7478 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7479 NewVAddr)
7480 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7481 .addImm(AMDGPU::sub0)
7482 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7483 .addImm(AMDGPU::sub1);
7484 } else {
7485 // Legalize a VGPR Rsrc and soffset together.
7486 if (!isSoffsetLegal) {
7487 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7488 CreatedBB =
7489 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7490 return CreatedBB;
7491 }
7492 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7493 return CreatedBB;
7494 }
7495 }
7496
7497 // Legalize a VGPR soffset.
7498 if (!isSoffsetLegal) {
7499 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7500 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7501 return CreatedBB;
7502 }
7503 return CreatedBB;
7504}
7505
7506void SIInstrWorklist::insert(MachineInstr *MI) {
7507  InstrList.insert(MI);
7508  // Add MBUF instructions to the deferred list.
7509 int RsrcIdx =
7510 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7511 if (RsrcIdx != -1) {
7512 DeferredList.insert(MI);
7513 }
7514}
7515
7516bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7517  return DeferredList.contains(MI);
7518}
7519
7520// Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7521// lowering (change sgpr to vgpr).
7522// This is mainly caused by 16-bit SALU and 16-bit VALU using registers of
7523// different sizes. We need to legalize the operand sizes during the vgpr
7524// lowering chain. This can be removed once sgpr16 is in place.
7525void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
7526                                          MachineRegisterInfo &MRI) const {
7527 if (!ST.useRealTrue16Insts())
7528 return;
7529
7530 unsigned Opcode = MI.getOpcode();
7531 MachineBasicBlock *MBB = MI.getParent();
7532 // Legalize operands and check for size mismatch
7533 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7534 OpIdx >= get(Opcode).getNumOperands() ||
7535 get(Opcode).operands()[OpIdx].RegClass == -1)
7536 return;
7537
7538 MachineOperand &Op = MI.getOperand(OpIdx);
7539 if (!Op.isReg() || !Op.getReg().isVirtual())
7540 return;
7541
7542 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7543 if (!RI.isVGPRClass(CurrRC))
7544 return;
7545
7546 unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
7547 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
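  // Two mismatch cases: a 32-bit VGPR where a 16-bit class is expected can be
  // accessed through its lo16 subregister; a 16-bit VGPR where a 32-bit class
  // is expected is widened into a REG_SEQUENCE with an undef hi16 half.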
7548 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7549 Op.setSubReg(AMDGPU::lo16);
7550 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7551 const DebugLoc &DL = MI.getDebugLoc();
7552 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7553 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7554 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7555 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7556 .addReg(Op.getReg())
7557 .addImm(AMDGPU::lo16)
7558 .addReg(Undef)
7559 .addImm(AMDGPU::hi16);
7560 Op.setReg(NewDstReg);
7561 }
7562}
7563void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7564                                          MachineRegisterInfo &MRI) const {
7565 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7566    legalizeOperandsVALUt16(MI, OpIdx, MRI);
7567}
7568
7569void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7570                             MachineDominatorTree *MDT) const {
7571
7572 while (!Worklist.empty()) {
7573 MachineInstr &Inst = *Worklist.top();
7574 Worklist.erase_top();
7575 // Skip MachineInstr in the deferred list.
7576 if (Worklist.isDeferred(&Inst))
7577 continue;
7578 moveToVALUImpl(Worklist, MDT, Inst);
7579 }
7580
7581 // Deferred list of instructions will be processed once
7582 // all the MachineInstr in the worklist are done.
7583 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7584 moveToVALUImpl(Worklist, MDT, *Inst);
7585 assert(Worklist.empty() &&
7586 "Deferred MachineInstr are not supposed to re-populate worklist");
7587 }
7588}
7589
7590void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7591                                 MachineDominatorTree *MDT,
7592                                 MachineInstr &Inst) const {
7593
7594  MachineBasicBlock *MBB = Inst.getParent();
7595  if (!MBB)
7596 return;
7597 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7598 unsigned Opcode = Inst.getOpcode();
7599 unsigned NewOpcode = getVALUOp(Inst);
7600 // Handle some special cases
7601 switch (Opcode) {
7602 default:
7603 break;
7604 case AMDGPU::S_ADD_I32:
7605 case AMDGPU::S_SUB_I32: {
7606 // FIXME: The u32 versions currently selected use the carry.
7607 bool Changed;
7608 MachineBasicBlock *CreatedBBTmp = nullptr;
7609 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7610 if (Changed)
7611 return;
7612
7613 // Default handling
7614 break;
7615 }
7616
7617 case AMDGPU::S_MUL_U64:
7618 if (ST.hasVectorMulU64()) {
7619 NewOpcode = AMDGPU::V_MUL_U64_e64;
7620 break;
7621 }
7622    // Split s_mul_u64 into 32-bit vector multiplications.
7623 splitScalarSMulU64(Worklist, Inst, MDT);
7624 Inst.eraseFromParent();
7625 return;
7626
7627 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7628 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7629 // This is a special case of s_mul_u64 where all the operands are either
7630 // zero extended or sign extended.
7631 splitScalarSMulPseudo(Worklist, Inst, MDT);
7632 Inst.eraseFromParent();
7633 return;
7634
7635 case AMDGPU::S_AND_B64:
7636 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7637 Inst.eraseFromParent();
7638 return;
7639
7640 case AMDGPU::S_OR_B64:
7641 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7642 Inst.eraseFromParent();
7643 return;
7644
7645 case AMDGPU::S_XOR_B64:
7646 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7647 Inst.eraseFromParent();
7648 return;
7649
7650 case AMDGPU::S_NAND_B64:
7651 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7652 Inst.eraseFromParent();
7653 return;
7654
7655 case AMDGPU::S_NOR_B64:
7656 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7657 Inst.eraseFromParent();
7658 return;
7659
7660 case AMDGPU::S_XNOR_B64:
7661 if (ST.hasDLInsts())
7662 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7663 else
7664 splitScalar64BitXnor(Worklist, Inst, MDT);
7665 Inst.eraseFromParent();
7666 return;
7667
7668 case AMDGPU::S_ANDN2_B64:
7669 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7670 Inst.eraseFromParent();
7671 return;
7672
7673 case AMDGPU::S_ORN2_B64:
7674 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7675 Inst.eraseFromParent();
7676 return;
7677
7678 case AMDGPU::S_BREV_B64:
7679 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7680 Inst.eraseFromParent();
7681 return;
7682
7683 case AMDGPU::S_NOT_B64:
7684 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7685 Inst.eraseFromParent();
7686 return;
7687
7688 case AMDGPU::S_BCNT1_I32_B64:
7689 splitScalar64BitBCNT(Worklist, Inst);
7690 Inst.eraseFromParent();
7691 return;
7692
7693 case AMDGPU::S_BFE_I64:
7694 splitScalar64BitBFE(Worklist, Inst);
7695 Inst.eraseFromParent();
7696 return;
7697
7698 case AMDGPU::S_FLBIT_I32_B64:
7699 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7700 Inst.eraseFromParent();
7701 return;
7702 case AMDGPU::S_FF1_I32_B64:
7703 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7704 Inst.eraseFromParent();
7705 return;
7706
7707 case AMDGPU::S_LSHL_B32:
7708 if (ST.hasOnlyRevVALUShifts()) {
7709 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7710 swapOperands(Inst);
7711 }
7712 break;
7713 case AMDGPU::S_ASHR_I32:
7714 if (ST.hasOnlyRevVALUShifts()) {
7715 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7716 swapOperands(Inst);
7717 }
7718 break;
7719 case AMDGPU::S_LSHR_B32:
7720 if (ST.hasOnlyRevVALUShifts()) {
7721 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7722 swapOperands(Inst);
7723 }
7724 break;
7725 case AMDGPU::S_LSHL_B64:
7726 if (ST.hasOnlyRevVALUShifts()) {
7727 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7728 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7729 : AMDGPU::V_LSHLREV_B64_e64;
7730 swapOperands(Inst);
7731 }
7732 break;
7733 case AMDGPU::S_ASHR_I64:
7734 if (ST.hasOnlyRevVALUShifts()) {
7735 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7736 swapOperands(Inst);
7737 }
7738 break;
7739 case AMDGPU::S_LSHR_B64:
7740 if (ST.hasOnlyRevVALUShifts()) {
7741 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7742 swapOperands(Inst);
7743 }
7744 break;
7745
7746 case AMDGPU::S_ABS_I32:
7747 lowerScalarAbs(Worklist, Inst);
7748 Inst.eraseFromParent();
7749 return;
7750
7751 case AMDGPU::S_CBRANCH_SCC0:
7752 case AMDGPU::S_CBRANCH_SCC1: {
7753 // Clear unused bits of vcc
7754 Register CondReg = Inst.getOperand(1).getReg();
7755 bool IsSCC = CondReg == AMDGPU::SCC;
7756 Register VCC = RI.getVCC();
7757 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
7758 unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
7759 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
7760 .addReg(EXEC)
7761 .addReg(IsSCC ? VCC : CondReg);
7762 Inst.removeOperand(1);
7763 } break;
7764
7765 case AMDGPU::S_BFE_U64:
7766 case AMDGPU::S_BFM_B64:
7767 llvm_unreachable("Moving this op to VALU not implemented");
7768
7769 case AMDGPU::S_PACK_LL_B32_B16:
7770 case AMDGPU::S_PACK_LH_B32_B16:
7771 case AMDGPU::S_PACK_HL_B32_B16:
7772 case AMDGPU::S_PACK_HH_B32_B16:
7773 movePackToVALU(Worklist, MRI, Inst);
7774 Inst.eraseFromParent();
7775 return;
7776
7777 case AMDGPU::S_XNOR_B32:
7778 lowerScalarXnor(Worklist, Inst);
7779 Inst.eraseFromParent();
7780 return;
7781
7782 case AMDGPU::S_NAND_B32:
7783 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7784 Inst.eraseFromParent();
7785 return;
7786
7787 case AMDGPU::S_NOR_B32:
7788 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7789 Inst.eraseFromParent();
7790 return;
7791
7792 case AMDGPU::S_ANDN2_B32:
7793 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7794 Inst.eraseFromParent();
7795 return;
7796
7797 case AMDGPU::S_ORN2_B32:
7798 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7799 Inst.eraseFromParent();
7800 return;
7801
7802 // TODO: remove as soon as everything is ready
7803 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7804 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7805 // can only be selected from the uniform SDNode.
7806 case AMDGPU::S_ADD_CO_PSEUDO:
7807 case AMDGPU::S_SUB_CO_PSEUDO: {
7808 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7809 ? AMDGPU::V_ADDC_U32_e64
7810 : AMDGPU::V_SUBB_U32_e64;
7811 const auto *CarryRC = RI.getWaveMaskRegClass();
7812
7813 Register CarryInReg = Inst.getOperand(4).getReg();
7814 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7815 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7816 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7817 .addReg(CarryInReg);
7818 }
7819
7820 Register CarryOutReg = Inst.getOperand(1).getReg();
7821
7822 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7823 MRI.getRegClass(Inst.getOperand(0).getReg())));
7824 MachineInstr *CarryOp =
7825 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7826 .addReg(CarryOutReg, RegState::Define)
7827 .add(Inst.getOperand(2))
7828 .add(Inst.getOperand(3))
7829 .addReg(CarryInReg)
7830 .addImm(0);
7831 legalizeOperands(*CarryOp);
7832 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7833 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7834 Inst.eraseFromParent();
7835 }
7836 return;
7837 case AMDGPU::S_UADDO_PSEUDO:
7838 case AMDGPU::S_USUBO_PSEUDO: {
7839 const DebugLoc &DL = Inst.getDebugLoc();
7840 MachineOperand &Dest0 = Inst.getOperand(0);
7841 MachineOperand &Dest1 = Inst.getOperand(1);
7842 MachineOperand &Src0 = Inst.getOperand(2);
7843 MachineOperand &Src1 = Inst.getOperand(3);
7844
7845 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7846 ? AMDGPU::V_ADD_CO_U32_e64
7847 : AMDGPU::V_SUB_CO_U32_e64;
7848 const TargetRegisterClass *NewRC =
7849 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7850 Register DestReg = MRI.createVirtualRegister(NewRC);
7851 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7852 .addReg(Dest1.getReg(), RegState::Define)
7853 .add(Src0)
7854 .add(Src1)
7855 .addImm(0); // clamp bit
7856
7857 legalizeOperands(*NewInstr, MDT);
7858 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7859 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7860 Worklist);
7861 Inst.eraseFromParent();
7862 }
7863 return;
7864
7865 case AMDGPU::S_CSELECT_B32:
7866 case AMDGPU::S_CSELECT_B64:
7867 lowerSelect(Worklist, Inst, MDT);
7868 Inst.eraseFromParent();
7869 return;
7870 case AMDGPU::S_CMP_EQ_I32:
7871 case AMDGPU::S_CMP_LG_I32:
7872 case AMDGPU::S_CMP_GT_I32:
7873 case AMDGPU::S_CMP_GE_I32:
7874 case AMDGPU::S_CMP_LT_I32:
7875 case AMDGPU::S_CMP_LE_I32:
7876 case AMDGPU::S_CMP_EQ_U32:
7877 case AMDGPU::S_CMP_LG_U32:
7878 case AMDGPU::S_CMP_GT_U32:
7879 case AMDGPU::S_CMP_GE_U32:
7880 case AMDGPU::S_CMP_LT_U32:
7881 case AMDGPU::S_CMP_LE_U32:
7882 case AMDGPU::S_CMP_EQ_U64:
7883 case AMDGPU::S_CMP_LG_U64:
7884 case AMDGPU::S_CMP_LT_F32:
7885 case AMDGPU::S_CMP_EQ_F32:
7886 case AMDGPU::S_CMP_LE_F32:
7887 case AMDGPU::S_CMP_GT_F32:
7888 case AMDGPU::S_CMP_LG_F32:
7889 case AMDGPU::S_CMP_GE_F32:
7890 case AMDGPU::S_CMP_O_F32:
7891 case AMDGPU::S_CMP_U_F32:
7892 case AMDGPU::S_CMP_NGE_F32:
7893 case AMDGPU::S_CMP_NLG_F32:
7894 case AMDGPU::S_CMP_NGT_F32:
7895 case AMDGPU::S_CMP_NLE_F32:
7896 case AMDGPU::S_CMP_NEQ_F32:
7897 case AMDGPU::S_CMP_NLT_F32: {
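    // The scalar compare wrote SCC; its VALU counterpart writes a wave-wide
    // mask instead, so users of SCC are redirected to CondReg below.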
7898 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7899 auto NewInstr =
7900 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7901 .setMIFlags(Inst.getFlags());
7902 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
7903 0) {
7904 NewInstr
7905 .addImm(0) // src0_modifiers
7906 .add(Inst.getOperand(0)) // src0
7907 .addImm(0) // src1_modifiers
7908 .add(Inst.getOperand(1)) // src1
7909 .addImm(0); // clamp
7910 } else {
7911 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
7912 }
7913 legalizeOperands(*NewInstr, MDT);
7914 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7915 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7916 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7917 Inst.eraseFromParent();
7918 return;
7919 }
7920 case AMDGPU::S_CMP_LT_F16:
7921 case AMDGPU::S_CMP_EQ_F16:
7922 case AMDGPU::S_CMP_LE_F16:
7923 case AMDGPU::S_CMP_GT_F16:
7924 case AMDGPU::S_CMP_LG_F16:
7925 case AMDGPU::S_CMP_GE_F16:
7926 case AMDGPU::S_CMP_O_F16:
7927 case AMDGPU::S_CMP_U_F16:
7928 case AMDGPU::S_CMP_NGE_F16:
7929 case AMDGPU::S_CMP_NLG_F16:
7930 case AMDGPU::S_CMP_NGT_F16:
7931 case AMDGPU::S_CMP_NLE_F16:
7932 case AMDGPU::S_CMP_NEQ_F16:
7933 case AMDGPU::S_CMP_NLT_F16: {
7934 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7935 auto NewInstr =
7936 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7937 .setMIFlags(Inst.getFlags());
7938 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
7939 NewInstr
7940 .addImm(0) // src0_modifiers
7941 .add(Inst.getOperand(0)) // src0
7942 .addImm(0) // src1_modifiers
7943 .add(Inst.getOperand(1)) // src1
7944 .addImm(0); // clamp
7945 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
7946 NewInstr.addImm(0); // op_sel0
7947 } else {
7948 NewInstr
7949 .add(Inst.getOperand(0))
7950 .add(Inst.getOperand(1));
7951 }
7952 legalizeOperandsVALUt16(*NewInstr, MRI);
7953 legalizeOperands(*NewInstr, MDT);
7954 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7955 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7956 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7957 Inst.eraseFromParent();
7958 return;
7959 }
7960 case AMDGPU::S_CVT_HI_F32_F16: {
7961 const DebugLoc &DL = Inst.getDebugLoc();
7962 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7963 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7964 if (ST.useRealTrue16Insts()) {
7965 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
7966 .add(Inst.getOperand(1));
7967 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7968 .addImm(0) // src0_modifiers
7969 .addReg(TmpReg, 0, AMDGPU::hi16)
7970 .addImm(0) // clamp
7971 .addImm(0) // omod
7972 .addImm(0); // op_sel0
7973 } else {
7974 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7975 .addImm(16)
7976 .add(Inst.getOperand(1));
7977 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7978 .addImm(0) // src0_modifiers
7979 .addReg(TmpReg)
7980 .addImm(0) // clamp
7981 .addImm(0); // omod
7982 }
7983
7984 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7985 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7986 Inst.eraseFromParent();
7987 return;
7988 }
7989 case AMDGPU::S_MINIMUM_F32:
7990 case AMDGPU::S_MAXIMUM_F32: {
7991 const DebugLoc &DL = Inst.getDebugLoc();
7992 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7993 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7994 .addImm(0) // src0_modifiers
7995 .add(Inst.getOperand(1))
7996 .addImm(0) // src1_modifiers
7997 .add(Inst.getOperand(2))
7998 .addImm(0) // clamp
7999 .addImm(0); // omod
8000 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8001
8002 legalizeOperands(*NewInstr, MDT);
8003 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8004 Inst.eraseFromParent();
8005 return;
8006 }
8007 case AMDGPU::S_MINIMUM_F16:
8008 case AMDGPU::S_MAXIMUM_F16: {
8009 const DebugLoc &DL = Inst.getDebugLoc();
8010 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8011 ? &AMDGPU::VGPR_16RegClass
8012 : &AMDGPU::VGPR_32RegClass);
8013 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8014 .addImm(0) // src0_modifiers
8015 .add(Inst.getOperand(1))
8016 .addImm(0) // src1_modifiers
8017 .add(Inst.getOperand(2))
8018 .addImm(0) // clamp
8019 .addImm(0) // omod
8020 .addImm(0); // opsel0
8021 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8022 legalizeOperandsVALUt16(*NewInstr, MRI);
8023 legalizeOperands(*NewInstr, MDT);
8024 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8025 Inst.eraseFromParent();
8026 return;
8027 }
8028 case AMDGPU::V_S_EXP_F16_e64:
8029 case AMDGPU::V_S_LOG_F16_e64:
8030 case AMDGPU::V_S_RCP_F16_e64:
8031 case AMDGPU::V_S_RSQ_F16_e64:
8032 case AMDGPU::V_S_SQRT_F16_e64: {
8033 const DebugLoc &DL = Inst.getDebugLoc();
8034 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8035 ? &AMDGPU::VGPR_16RegClass
8036 : &AMDGPU::VGPR_32RegClass);
8037 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8038 .add(Inst.getOperand(1)) // src0_modifiers
8039 .add(Inst.getOperand(2))
8040 .add(Inst.getOperand(3)) // clamp
8041 .add(Inst.getOperand(4)) // omod
8042 .setMIFlags(Inst.getFlags());
8043 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8044 NewInstr.addImm(0); // opsel0
8045 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8046 legalizeOperandsVALUt16(*NewInstr, MRI);
8047 legalizeOperands(*NewInstr, MDT);
8048 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8049 Inst.eraseFromParent();
8050 return;
8051 }
8052 }
8053
8054 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8055 // We cannot move this instruction to the VALU, so we should try to
8056 // legalize its operands instead.
8057 legalizeOperands(Inst, MDT);
8058 return;
8059 }
8060 // Handle converting generic instructions like COPY-to-SGPR into
8061 // COPY-to-VGPR.
8062 if (NewOpcode == Opcode) {
8063 Register DstReg = Inst.getOperand(0).getReg();
8064 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8065
8066 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8067 // hope for the best.
8068 if (Inst.isCopy() && DstReg.isPhysical() &&
8069 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8070 // TODO: Only works for 32 bit registers.
8071 if (MRI.constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass)) {
8072 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8073 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
8074 .add(Inst.getOperand(1));
8075 } else {
8076 Register NewDst =
8077 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8078 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8079 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
8080 .add(Inst.getOperand(1));
8081 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
8082 DstReg)
8083 .addReg(NewDst);
8084 }
8085 Inst.eraseFromParent();
8086 return;
8087 }
8088
8089 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
8090 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
8091 // Instead of creating a copy where src and dst are the same register
8092 // class, we just replace all uses of dst with src. These kinds of
8093 // copies interfere with the heuristics MachineSink uses to decide
8094      // whether or not to split a critical edge, since the pass assumes
8095 // that copies will end up as machine instructions and not be
8096 // eliminated.
8097 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8098 Register NewDstReg = Inst.getOperand(1).getReg();
8099 MRI.replaceRegWith(DstReg, NewDstReg);
8100 MRI.clearKillFlags(NewDstReg);
8101 Inst.getOperand(0).setReg(DstReg);
8102 Inst.eraseFromParent();
8103 // Legalize t16 operand since replaceReg is called after addUsersToVALU
8104 for (MachineOperand &MO :
8105 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8106 legalizeOperandsVALUt16(*MO.getParent(), MRI);
8107 }
8108 return;
8109 }
8110
8111    // If this is a v2s copy between a 16-bit and a 32-bit register, replace the
8112    // vgpr copy with a reg_sequence/extract_subreg.
8113    // This can be removed after we have sgpr16 in place.
8114 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8115 Inst.getOperand(1).getReg().isVirtual() &&
8116 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8117 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8118 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8119 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8120 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8121 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8122 get(AMDGPU::IMPLICIT_DEF), Undef);
8123 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8124 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8125 .addReg(Inst.getOperand(1).getReg())
8126 .addImm(AMDGPU::lo16)
8127 .addReg(Undef)
8128 .addImm(AMDGPU::hi16);
8129 Inst.eraseFromParent();
8130 MRI.replaceRegWith(DstReg, NewDstReg);
8131 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8132 return;
8133 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8134 AMDGPU::lo16)) {
8135 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8136 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8137 MRI.replaceRegWith(DstReg, NewDstReg);
8138 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8139 return;
8140 }
8141 }
8142
8143 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8144 MRI.replaceRegWith(DstReg, NewDstReg);
8145 legalizeOperands(Inst, MDT);
8146 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8147 return;
8148 }
8149
8150 // Use the new VALU Opcode.
8151 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8152 .setMIFlags(Inst.getFlags());
8153 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8154 // Intersperse VOP3 modifiers among the SALU operands.
8155 NewInstr->addOperand(Inst.getOperand(0));
8156 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8157 AMDGPU::OpName::src0_modifiers) >= 0)
8158 NewInstr.addImm(0);
8159 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8160 MachineOperand Src = Inst.getOperand(1);
8161 NewInstr->addOperand(Src);
8162 }
8163
8164 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8165 // We are converting these to a BFE, so we need to add the missing
8166 // operands for the size and offset.
8167 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8168 NewInstr.addImm(0);
8169 NewInstr.addImm(Size);
8170 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8171 // The VALU version adds the second operand to the result, so insert an
8172 // extra 0 operand.
8173 NewInstr.addImm(0);
8174 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8175 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8176 // If we need to move this to VGPRs, we need to unpack the second
8177 // operand back into the 2 separate ones for bit offset and width.
8178 assert(OffsetWidthOp.isImm() &&
8179 "Scalar BFE is only implemented for constant width and offset");
8180 uint32_t Imm = OffsetWidthOp.getImm();
8181
8182 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8183 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8184 NewInstr.addImm(Offset);
8185 NewInstr.addImm(BitWidth);
8186 } else {
8187 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8188 AMDGPU::OpName::src1_modifiers) >= 0)
8189 NewInstr.addImm(0);
8190 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8191 NewInstr->addOperand(Inst.getOperand(2));
8192 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8193 AMDGPU::OpName::src2_modifiers) >= 0)
8194 NewInstr.addImm(0);
8195 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8196 NewInstr->addOperand(Inst.getOperand(3));
8197 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8198 NewInstr.addImm(0);
8199 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8200 NewInstr.addImm(0);
8201 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8202 NewInstr.addImm(0);
8203 }
8204 } else {
8205 // Just copy the SALU operands.
8206 for (const MachineOperand &Op : Inst.explicit_operands())
8207 NewInstr->addOperand(Op);
8208 }
8209
8210 // Remove any references to SCC. Vector instructions can't read from it, and
8211  // we're just about to add the implicit use / defs of VCC, and we don't want
8212 // both.
8213 for (MachineOperand &Op : Inst.implicit_operands()) {
8214 if (Op.getReg() == AMDGPU::SCC) {
8215 // Only propagate through live-def of SCC.
8216 if (Op.isDef() && !Op.isDead())
8217 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8218 if (Op.isUse())
8219 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8220 }
8221 }
8222 Inst.eraseFromParent();
8223 Register NewDstReg;
8224 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8225 Register DstReg = NewInstr->getOperand(0).getReg();
8226 assert(DstReg.isVirtual());
8227 // Update the destination register class.
8228 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8229 assert(NewDstRC);
8230 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8231 MRI.replaceRegWith(DstReg, NewDstReg);
8232 }
8233 fixImplicitOperands(*NewInstr);
8234
8235 legalizeOperandsVALUt16(*NewInstr, MRI);
8236
8237 // Legalize the operands
8238 legalizeOperands(*NewInstr, MDT);
8239 if (NewDstReg)
8240 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8241}
8242
8243// Add/sub require special handling to deal with carry outs.
8244std::pair<bool, MachineBasicBlock *>
8245SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8246 MachineDominatorTree *MDT) const {
8247 if (ST.hasAddNoCarry()) {
8248 // Assume there is no user of scc since we don't select this in that case.
8249 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8250 // is used.
8251
8252 MachineBasicBlock &MBB = *Inst.getParent();
8253 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8254
8255 Register OldDstReg = Inst.getOperand(0).getReg();
8256 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8257
8258 unsigned Opc = Inst.getOpcode();
8259 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8260
8261 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8262 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8263
8264 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8265 Inst.removeOperand(3);
8266
8267 Inst.setDesc(get(NewOpc));
8268 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8269 Inst.addImplicitDefUseOperands(*MBB.getParent());
8270 MRI.replaceRegWith(OldDstReg, ResultReg);
8271 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8272
8273 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8274 return std::pair(true, NewBB);
8275 }
8276
8277 return std::pair(false, nullptr);
8278}
8279
8280void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8281 MachineDominatorTree *MDT) const {
8282
8283 MachineBasicBlock &MBB = *Inst.getParent();
8284 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8285 MachineBasicBlock::iterator MII = Inst;
8286 DebugLoc DL = Inst.getDebugLoc();
8287
8288 MachineOperand &Dest = Inst.getOperand(0);
8289 MachineOperand &Src0 = Inst.getOperand(1);
8290 MachineOperand &Src1 = Inst.getOperand(2);
8291 MachineOperand &Cond = Inst.getOperand(3);
8292
8293 Register CondReg = Cond.getReg();
8294 bool IsSCC = (CondReg == AMDGPU::SCC);
8295
8296 // If this is a trivial select where the condition is effectively not SCC
8297 // (CondReg is a source of copy to SCC), then the select is semantically
8298 // equivalent to copying CondReg. Hence, there is no need to create
8299  // a V_CNDMASK; we can just use CondReg and bail out.
8300 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8301 (Src1.getImm() == 0)) {
8302 MRI.replaceRegWith(Dest.getReg(), CondReg);
8303 return;
8304 }
8305
8306 Register NewCondReg = CondReg;
8307 if (IsSCC) {
8308 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8309 NewCondReg = MRI.createVirtualRegister(TC);
8310
8311    // Now look for the closest SCC def; if it is a copy,
8312    // replace the CondReg with the COPY source register.
8313 bool CopyFound = false;
8314 for (MachineInstr &CandI :
8315         make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
8316                    Inst.getParent()->rend())) {
8317 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8318 -1) {
8319 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8320 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8321 .addReg(CandI.getOperand(1).getReg());
8322 CopyFound = true;
8323 }
8324 break;
8325 }
8326 }
8327 if (!CopyFound) {
8328 // SCC def is not a copy
8329 // Insert a trivial select instead of creating a copy, because a copy from
8330 // SCC would semantically mean just copying a single bit, but we may need
8331 // the result to be a vector condition mask that needs preserving.
8332 unsigned Opcode =
8333 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8334 auto NewSelect =
8335 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8336 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8337 }
8338 }
8339
8340 Register NewDestReg = MRI.createVirtualRegister(
8341 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8342 MachineInstr *NewInst;
8343 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8344 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8345 .addImm(0)
8346 .add(Src1) // False
8347 .addImm(0)
8348 .add(Src0) // True
8349 .addReg(NewCondReg);
8350 } else {
8351 NewInst =
8352 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8353 .add(Src1) // False
8354 .add(Src0) // True
8355 .addReg(NewCondReg);
8356 }
8357 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8358 legalizeOperands(*NewInst, MDT);
8359 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8360}
8361
8362void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8363 MachineInstr &Inst) const {
8364 MachineBasicBlock &MBB = *Inst.getParent();
8365 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8366 MachineBasicBlock::iterator MII = Inst;
8367 DebugLoc DL = Inst.getDebugLoc();
8368
8369 MachineOperand &Dest = Inst.getOperand(0);
8370 MachineOperand &Src = Inst.getOperand(1);
8371 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8372 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8373
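  // Lower S_ABS_I32 on the VALU as abs(x) = max(x, 0 - x).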
8374 unsigned SubOp = ST.hasAddNoCarry() ?
8375 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8376
8377 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8378 .addImm(0)
8379 .addReg(Src.getReg());
8380
8381 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8382 .addReg(Src.getReg())
8383 .addReg(TmpReg);
8384
8385 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8386 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8387}
8388
8389void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8390 MachineInstr &Inst) const {
8391 MachineBasicBlock &MBB = *Inst.getParent();
8392 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8393 MachineBasicBlock::iterator MII = Inst;
8394 const DebugLoc &DL = Inst.getDebugLoc();
8395
8396 MachineOperand &Dest = Inst.getOperand(0);
8397 MachineOperand &Src0 = Inst.getOperand(1);
8398 MachineOperand &Src1 = Inst.getOperand(2);
8399
8400 if (ST.hasDLInsts()) {
8401 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8402 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8403 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8404
8405 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8406 .add(Src0)
8407 .add(Src1);
8408
8409 MRI.replaceRegWith(Dest.getReg(), NewDest);
8410 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8411 } else {
8412 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8413 // invert either source and then perform the XOR. If either source is a
8414 // scalar register, then we can leave the inversion on the scalar unit to
8415 // achieve a better distribution of scalar and vector instructions.
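// For example (illustrative reading of the cases below): xnor(sX, vY) can be
// lowered as (not sX) xor vY, so the S_NOT_B32 stays on the SALU while the
// XOR is queued for VALU lowering.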
8416 bool Src0IsSGPR = Src0.isReg() &&
8417 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8418 bool Src1IsSGPR = Src1.isReg() &&
8419 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8420 MachineInstr *Xor;
8421 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8422 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8423
8424 // Build a pair of scalar instructions and add them to the work list.
8425 // The next iteration over the work list will lower these to the vector
8426 // unit as necessary.
8427 if (Src0IsSGPR) {
8428 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8429 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8430 .addReg(Temp)
8431 .add(Src1);
8432 } else if (Src1IsSGPR) {
8433 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8434 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8435 .add(Src0)
8436 .addReg(Temp);
8437 } else {
8438 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8439 .add(Src0)
8440 .add(Src1);
8441 MachineInstr *Not =
8442 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8443 Worklist.insert(Not);
8444 }
8445
8446 MRI.replaceRegWith(Dest.getReg(), NewDest);
8447
8448 Worklist.insert(Xor);
8449
8450 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8451 }
8452}
8453
8454void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8455 MachineInstr &Inst,
8456 unsigned Opcode) const {
8457 MachineBasicBlock &MBB = *Inst.getParent();
8458 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8459 MachineBasicBlock::iterator MII = Inst;
8460 const DebugLoc &DL = Inst.getDebugLoc();
8461
8462 MachineOperand &Dest = Inst.getOperand(0);
8463 MachineOperand &Src0 = Inst.getOperand(1);
8464 MachineOperand &Src1 = Inst.getOperand(2);
8465
8466 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8467 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8468
8469 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8470 .add(Src0)
8471 .add(Src1);
8472
8473 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8474 .addReg(Interm);
8475
8476 Worklist.insert(&Op);
8477 Worklist.insert(&Not);
8478
8479 MRI.replaceRegWith(Dest.getReg(), NewDest);
8480 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8481}
8482
8483void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8484 MachineInstr &Inst,
8485 unsigned Opcode) const {
8486 MachineBasicBlock &MBB = *Inst.getParent();
8487 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8488 MachineBasicBlock::iterator MII = Inst;
8489 const DebugLoc &DL = Inst.getDebugLoc();
8490
8491 MachineOperand &Dest = Inst.getOperand(0);
8492 MachineOperand &Src0 = Inst.getOperand(1);
8493 MachineOperand &Src1 = Inst.getOperand(2);
8494
8495 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8496 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8497
8498 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8499 .add(Src1);
8500
8501 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8502 .add(Src0)
8503 .addReg(Interm);
8504
8505 Worklist.insert(&Not);
8506 Worklist.insert(&Op);
8507
8508 MRI.replaceRegWith(Dest.getReg(), NewDest);
8509 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8510}
8511
8512void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8513 MachineInstr &Inst, unsigned Opcode,
8514 bool Swap) const {
8515 MachineBasicBlock &MBB = *Inst.getParent();
8516 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8517
8518 MachineOperand &Dest = Inst.getOperand(0);
8519 MachineOperand &Src0 = Inst.getOperand(1);
8520 DebugLoc DL = Inst.getDebugLoc();
8521
8522 MachineBasicBlock::iterator MII = Inst;
8523
8524 const MCInstrDesc &InstDesc = get(Opcode);
8525 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8526 MRI.getRegClass(Src0.getReg()) :
8527 &AMDGPU::SGPR_32RegClass;
8528
8529 const TargetRegisterClass *Src0SubRC =
8530 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8531
8532 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8533 AMDGPU::sub0, Src0SubRC);
8534
8535 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8536 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8537 const TargetRegisterClass *NewDestSubRC =
8538 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8539
8540 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8541 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8542
8543 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8544 AMDGPU::sub1, Src0SubRC);
8545
8546 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8547 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8548
8549 if (Swap)
8550 std::swap(DestSub0, DestSub1);
8551
8552 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8553 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8554 .addReg(DestSub0)
8555 .addImm(AMDGPU::sub0)
8556 .addReg(DestSub1)
8557 .addImm(AMDGPU::sub1);
8558
8559 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8560
8561 Worklist.insert(&LoHalf);
8562 Worklist.insert(&HiHalf);
8563
8564 // We don't need to legalizeOperands here because for a single operand, src0
8565 // will support any kind of input.
8566
8567 // Move all users of this moved value.
8568 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8569}
8570
8571// There is no vector equivalent of s_mul_u64. For this reason, we need to
8572// split the s_mul_u64 into 32-bit vector multiplications.
8573void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8574 MachineInstr &Inst,
8575 MachineDominatorTree *MDT) const {
8576 MachineBasicBlock &MBB = *Inst.getParent();
8577 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8578
8579 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8580 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8581 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8582
8583 MachineOperand &Dest = Inst.getOperand(0);
8584 MachineOperand &Src0 = Inst.getOperand(1);
8585 MachineOperand &Src1 = Inst.getOperand(2);
8586 const DebugLoc &DL = Inst.getDebugLoc();
8587 MachineBasicBlock::iterator MII = Inst;
8588
8589 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8590 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8591 const TargetRegisterClass *Src0SubRC =
8592 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8593 if (RI.isSGPRClass(Src0SubRC))
8594 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8595 const TargetRegisterClass *Src1SubRC =
8596 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8597 if (RI.isSGPRClass(Src1SubRC))
8598 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8599
8600 // First, we extract the low 32-bit and high 32-bit values from each of the
8601 // operands.
8602 MachineOperand Op0L =
8603 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8604 MachineOperand Op1L =
8605 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8606 MachineOperand Op0H =
8607 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8608 MachineOperand Op1H =
8609 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8610
8611 // The multiplication is done as follows:
8612 //
8613 // Op1H Op1L
8614 // * Op0H Op0L
8615 // --------------------
8616 // Op1H*Op0L Op1L*Op0L
8617 // + Op1H*Op0H Op1L*Op0H
8618 // -----------------------------------------
8619 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8620 //
8621 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
8622 // value and that would overflow.
8623 // The low 32-bit value is Op1L*Op0L.
8624 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
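// Illustrative example: Src0 = 2^32 + 3 (Op0H = 1, Op0L = 3) and Src1 = 5
// (Op1H = 0, Op1L = 5). Low half = 3*5 = 15; high half = 0*3 + 5*1 +
// mul_hi(5,3) = 5; result = 5*2^32 + 15 = (2^32 + 3) * 5, as expected.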
8625
8626 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8627 MachineInstr *Op1L_Op0H =
8628 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8629 .add(Op1L)
8630 .add(Op0H);
8631
8632 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8633 MachineInstr *Op1H_Op0L =
8634 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8635 .add(Op1H)
8636 .add(Op0L);
8637
8638 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8639 MachineInstr *Carry =
8640 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8641 .add(Op1L)
8642 .add(Op0L);
8643
8644 MachineInstr *LoHalf =
8645 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8646 .add(Op1L)
8647 .add(Op0L);
8648
8649 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8650 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8651 .addReg(Op1L_Op0H_Reg)
8652 .addReg(Op1H_Op0L_Reg);
8653
8654 MachineInstr *HiHalf =
8655 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8656 .addReg(AddReg)
8657 .addReg(CarryReg);
8658
8659 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8660 .addReg(DestSub0)
8661 .addImm(AMDGPU::sub0)
8662 .addReg(DestSub1)
8663 .addImm(AMDGPU::sub1);
8664
8665 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8666
8667 // Try to legalize the operands in case we need to swap the order to keep it
8668 // valid.
8669 legalizeOperands(*Op1L_Op0H, MDT);
8670 legalizeOperands(*Op1H_Op0L, MDT);
8671 legalizeOperands(*Carry, MDT);
8672 legalizeOperands(*LoHalf, MDT);
8673 legalizeOperands(*Add, MDT);
8674 legalizeOperands(*HiHalf, MDT);
8675
8676 // Move all users of this moved value.
8677 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8678}
8679
8680// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8681// multiplications.
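// Note: these pseudos carry operands whose values are already zero- or
// sign-extended 32-bit quantities, so only the low halves are multiplied:
// V_MUL_LO_U32 produces the low 32 bits and V_MUL_HI_{U32|I32} the high bits.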
8682void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8683 MachineInstr &Inst,
8684 MachineDominatorTree *MDT) const {
8685 MachineBasicBlock &MBB = *Inst.getParent();
8686 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8687
8688 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8689 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8690 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8691
8692 MachineOperand &Dest = Inst.getOperand(0);
8693 MachineOperand &Src0 = Inst.getOperand(1);
8694 MachineOperand &Src1 = Inst.getOperand(2);
8695 const DebugLoc &DL = Inst.getDebugLoc();
8696 MachineBasicBlock::iterator MII = Inst;
8697
8698 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8699 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8700 const TargetRegisterClass *Src0SubRC =
8701 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8702 if (RI.isSGPRClass(Src0SubRC))
8703 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8704 const TargetRegisterClass *Src1SubRC =
8705 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8706 if (RI.isSGPRClass(Src1SubRC))
8707 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8708
8709 // First, we extract the low 32-bit and high 32-bit values from each of the
8710 // operands.
8711 MachineOperand Op0L =
8712 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8713 MachineOperand Op1L =
8714 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8715
8716 unsigned Opc = Inst.getOpcode();
8717 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8718 ? AMDGPU::V_MUL_HI_U32_e64
8719 : AMDGPU::V_MUL_HI_I32_e64;
8720 MachineInstr *HiHalf =
8721 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8722
8723 MachineInstr *LoHalf =
8724 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8725 .add(Op1L)
8726 .add(Op0L);
8727
8728 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8729 .addReg(DestSub0)
8730 .addImm(AMDGPU::sub0)
8731 .addReg(DestSub1)
8732 .addImm(AMDGPU::sub1);
8733
8734 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8735
8736 // Try to legalize the operands in case we need to swap the order to keep it
8737 // valid.
8738 legalizeOperands(*HiHalf, MDT);
8739 legalizeOperands(*LoHalf, MDT);
8740
8741 // Move all users of this moved value.
8742 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8743}
8744
8745void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8746 MachineInstr &Inst, unsigned Opcode,
8747 MachineDominatorTree *MDT) const {
8748 MachineBasicBlock &MBB = *Inst.getParent();
8749 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8750
8751 MachineOperand &Dest = Inst.getOperand(0);
8752 MachineOperand &Src0 = Inst.getOperand(1);
8753 MachineOperand &Src1 = Inst.getOperand(2);
8754 DebugLoc DL = Inst.getDebugLoc();
8755
8756 MachineBasicBlock::iterator MII = Inst;
8757
8758 const MCInstrDesc &InstDesc = get(Opcode);
8759 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8760 MRI.getRegClass(Src0.getReg()) :
8761 &AMDGPU::SGPR_32RegClass;
8762
8763 const TargetRegisterClass *Src0SubRC =
8764 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8765 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8766 MRI.getRegClass(Src1.getReg()) :
8767 &AMDGPU::SGPR_32RegClass;
8768
8769 const TargetRegisterClass *Src1SubRC =
8770 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8771
8772 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8773 AMDGPU::sub0, Src0SubRC);
8774 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8775 AMDGPU::sub0, Src1SubRC);
8776 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8777 AMDGPU::sub1, Src0SubRC);
8778 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8779 AMDGPU::sub1, Src1SubRC);
8780
8781 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8782 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8783 const TargetRegisterClass *NewDestSubRC =
8784 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8785
8786 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8787 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8788 .add(SrcReg0Sub0)
8789 .add(SrcReg1Sub0);
8790
8791 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8792 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8793 .add(SrcReg0Sub1)
8794 .add(SrcReg1Sub1);
8795
8796 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8797 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8798 .addReg(DestSub0)
8799 .addImm(AMDGPU::sub0)
8800 .addReg(DestSub1)
8801 .addImm(AMDGPU::sub1);
8802
8803 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8804
8805 Worklist.insert(&LoHalf);
8806 Worklist.insert(&HiHalf);
8807
8808 // Move all users of this moved value.
8809 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8810}
8811
8812void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8813 MachineInstr &Inst,
8814 MachineDominatorTree *MDT) const {
8815 MachineBasicBlock &MBB = *Inst.getParent();
8816 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8817
8818 MachineOperand &Dest = Inst.getOperand(0);
8819 MachineOperand &Src0 = Inst.getOperand(1);
8820 MachineOperand &Src1 = Inst.getOperand(2);
8821 const DebugLoc &DL = Inst.getDebugLoc();
8822
8823 MachineBasicBlock::iterator MII = Inst;
8824
8825 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8826
8827 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8828
8829 MachineOperand* Op0;
8830 MachineOperand* Op1;
8831
8832 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8833 Op0 = &Src0;
8834 Op1 = &Src1;
8835 } else {
8836 Op0 = &Src1;
8837 Op1 = &Src0;
8838 }
8839
8840 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8841 .add(*Op0);
8842
8843 Register NewDest = MRI.createVirtualRegister(DestRC);
8844
8845 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8846 .addReg(Interm)
8847 .add(*Op1);
8848
8849 MRI.replaceRegWith(Dest.getReg(), NewDest);
8850
8851 Worklist.insert(&Xor);
8852}
8853
8854void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8855 MachineInstr &Inst) const {
8856 MachineBasicBlock &MBB = *Inst.getParent();
8857 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8858
8859 MachineBasicBlock::iterator MII = Inst;
8860 const DebugLoc &DL = Inst.getDebugLoc();
8861
8862 MachineOperand &Dest = Inst.getOperand(0);
8863 MachineOperand &Src = Inst.getOperand(1);
8864
8865 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8866 const TargetRegisterClass *SrcRC = Src.isReg() ?
8867 MRI.getRegClass(Src.getReg()) :
8868 &AMDGPU::SGPR_32RegClass;
8869
8870 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8871 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8872
8873 const TargetRegisterClass *SrcSubRC =
8874 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8875
8876 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8877 AMDGPU::sub0, SrcSubRC);
8878 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8879 AMDGPU::sub1, SrcSubRC);
8880
8881 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8882
8883 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8884
8885 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8886
8887 // We don't need to legalize operands here. src0 for either instruction can be
8888 // an SGPR, and the second input is unused or determined here.
8889 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8890}
8891
8892void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8893 MachineInstr &Inst) const {
8894 MachineBasicBlock &MBB = *Inst.getParent();
8895 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8896 MachineBasicBlock::iterator MII = Inst;
8897 const DebugLoc &DL = Inst.getDebugLoc();
8898
8899 MachineOperand &Dest = Inst.getOperand(0);
8900 uint32_t Imm = Inst.getOperand(2).getImm();
8901 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8902 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8903
8904 (void) Offset;
8905
8906 // Only sext_inreg cases handled.
8907 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8908 Offset == 0 && "Not implemented");
8909
8910 if (BitWidth < 32) {
8911 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8912 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8913 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8914
8915 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8916 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8917 .addImm(0)
8918 .addImm(BitWidth);
8919
8920 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8921 .addImm(31)
8922 .addReg(MidRegLo);
8923
8924 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8925 .addReg(MidRegLo)
8926 .addImm(AMDGPU::sub0)
8927 .addReg(MidRegHi)
8928 .addImm(AMDGPU::sub1);
8929
8930 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8931 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8932 return;
8933 }
8934
8935 MachineOperand &Src = Inst.getOperand(1);
8936 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8937 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8938
8939 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8940 .addImm(31)
8941 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8942
8943 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8944 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8945 .addImm(AMDGPU::sub0)
8946 .addReg(TmpReg)
8947 .addImm(AMDGPU::sub1);
8948
8949 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8950 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8951}
8952
8953void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8954 MachineInstr &Inst, unsigned Opcode,
8955 MachineDominatorTree *MDT) const {
8956 // (S_FLBIT_I32_B64 hi:lo) ->
8957 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8958 // (S_FF1_I32_B64 hi:lo) ->
8959 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
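// Rationale (illustrative): V_FFBH_U32/V_FFBL_B32 return -1 (0xffffffff) for
// a zero input. If the half that should win is non-zero, its count is < 32
// while the other term is clamped to >= 32, so the umin selects the right
// half; if both halves are zero, the result saturates to -1 like the scalar op.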
8960
8961 MachineBasicBlock &MBB = *Inst.getParent();
8962 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8963 MachineBasicBlock::iterator MII = Inst;
8964 const DebugLoc &DL = Inst.getDebugLoc();
8965
8966 MachineOperand &Dest = Inst.getOperand(0);
8967 MachineOperand &Src = Inst.getOperand(1);
8968
8969 const MCInstrDesc &InstDesc = get(Opcode);
8970
8971 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8972 unsigned OpcodeAdd =
8973 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8974
8975 const TargetRegisterClass *SrcRC =
8976 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
8977 const TargetRegisterClass *SrcSubRC =
8978 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8979
8980 MachineOperand SrcRegSub0 =
8981 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
8982 MachineOperand SrcRegSub1 =
8983 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
8984
8985 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8986 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8987 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8988 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8989
8990 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
8991
8992 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
8993
8994 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
8995 .addReg(IsCtlz ? MidReg1 : MidReg2)
8996 .addImm(32)
8997 .addImm(1); // enable clamp
8998
8999 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9000 .addReg(MidReg3)
9001 .addReg(IsCtlz ? MidReg2 : MidReg1);
9002
9003 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9004
9005 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9006}
9007
9008void SIInstrInfo::addUsersToMoveToVALUWorklist(
9009 Register DstReg, MachineRegisterInfo &MRI,
9010 SIInstrWorklist &Worklist) const {
9011 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9012 MachineInstr &UseMI = *MO.getParent();
9013
9014 unsigned OpNo = 0;
9015
9016 switch (UseMI.getOpcode()) {
9017 case AMDGPU::COPY:
9018 case AMDGPU::WQM:
9019 case AMDGPU::SOFT_WQM:
9020 case AMDGPU::STRICT_WWM:
9021 case AMDGPU::STRICT_WQM:
9022 case AMDGPU::REG_SEQUENCE:
9023 case AMDGPU::PHI:
9024 case AMDGPU::INSERT_SUBREG:
9025 break;
9026 default:
9027 OpNo = MO.getOperandNo();
9028 break;
9029 }
9030
9031 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo)))
9032 Worklist.insert(&UseMI);
9033 else
9034 // Legalization could change user list.
9036 }
9037}
9038
9039void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9040 MachineRegisterInfo &MRI,
9041 MachineInstr &Inst) const {
9042 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9043 MachineBasicBlock *MBB = Inst.getParent();
9044 MachineOperand &Src0 = Inst.getOperand(1);
9045 MachineOperand &Src1 = Inst.getOperand(2);
9046 const DebugLoc &DL = Inst.getDebugLoc();
9047
9048 switch (Inst.getOpcode()) {
9049 case AMDGPU::S_PACK_LL_B32_B16: {
9050 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9051 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9052
9053 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9054 // 0.
9055 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9056 .addImm(0xffff);
9057
9058 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9059 .addReg(ImmReg, RegState::Kill)
9060 .add(Src0);
9061
9062 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9063 .add(Src1)
9064 .addImm(16)
9065 .addReg(TmpReg, RegState::Kill);
9066 break;
9067 }
9068 case AMDGPU::S_PACK_LH_B32_B16: {
9069 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9070 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9071 .addImm(0xffff);
9072 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9073 .addReg(ImmReg, RegState::Kill)
9074 .add(Src0)
9075 .add(Src1);
9076 break;
9077 }
9078 case AMDGPU::S_PACK_HL_B32_B16: {
9079 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9080 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9081 .addImm(16)
9082 .add(Src0);
9083 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9084 .add(Src1)
9085 .addImm(16)
9086 .addReg(TmpReg, RegState::Kill);
9087 break;
9088 }
9089 case AMDGPU::S_PACK_HH_B32_B16: {
9090 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9091 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9092 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9093 .addImm(16)
9094 .add(Src0);
9095 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9096 .addImm(0xffff0000);
9097 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9098 .add(Src1)
9099 .addReg(ImmReg, RegState::Kill)
9100 .addReg(TmpReg, RegState::Kill);
9101 break;
9102 }
9103 default:
9104 llvm_unreachable("unhandled s_pack_* instruction");
9105 }
9106
9107 MachineOperand &Dest = Inst.getOperand(0);
9108 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9109 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9110}
9111
9112void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
9113 MachineInstr &SCCDefInst,
9114 SIInstrWorklist &Worklist,
9115 Register NewCond) const {
9116
9117 // Ensure that def inst defines SCC, which is still live.
9118 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9119 !Op.isDead() && Op.getParent() == &SCCDefInst);
9120 SmallVector<MachineInstr *, 4> CopyToDelete;
9121 // This assumes that all the users of SCC are in the same block
9122 // as the SCC def.
9123 for (MachineInstr &MI : // Skip the def inst itself.
9124 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9125 SCCDefInst.getParent()->end())) {
9126 // Check if SCC is used first.
9127 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9128 if (SCCIdx != -1) {
9129 if (MI.isCopy()) {
9130 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9131 Register DestReg = MI.getOperand(0).getReg();
9132
9133 MRI.replaceRegWith(DestReg, NewCond);
9134 CopyToDelete.push_back(&MI);
9135 } else {
9136
9137 if (NewCond.isValid())
9138 MI.getOperand(SCCIdx).setReg(NewCond);
9139
9140 Worklist.insert(&MI);
9141 }
9142 }
9143 // Exit if we find another SCC def.
9144 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9145 break;
9146 }
9147 for (auto &Copy : CopyToDelete)
9148 Copy->eraseFromParent();
9149}
9150
9151// Instructions that use SCC may be converted to VALU instructions. When that
9152// happens, the SCC register is changed to VCC_LO. The instruction that defines
9153// SCC must be changed to an instruction that defines VCC. This function makes
9154// sure that the instruction that defines SCC is added to the moveToVALU
9155// worklist.
9156void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9157 SIInstrWorklist &Worklist) const {
9158 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9159 // then there is nothing to do because the defining instruction has been
9160 // converted to a VALU already. If SCC then that instruction needs to be
9161 // converted to a VALU.
9162 for (MachineInstr &MI :
9163 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9164 SCCUseInst->getParent()->rend())) {
9165 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9166 break;
9167 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9168 Worklist.insert(&MI);
9169 break;
9170 }
9171 }
9172}
9173
9174const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9175 const MachineInstr &Inst) const {
9176 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9177
9178 switch (Inst.getOpcode()) {
9179 // For target instructions, getOpRegClass just returns the virtual register
9180 // class associated with the operand, so we need to find an equivalent VGPR
9181 // register class in order to move the instruction to the VALU.
9182 case AMDGPU::COPY:
9183 case AMDGPU::PHI:
9184 case AMDGPU::REG_SEQUENCE:
9185 case AMDGPU::INSERT_SUBREG:
9186 case AMDGPU::WQM:
9187 case AMDGPU::SOFT_WQM:
9188 case AMDGPU::STRICT_WWM:
9189 case AMDGPU::STRICT_WQM: {
9190 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9191 if (RI.isAGPRClass(SrcRC)) {
9192 if (RI.isAGPRClass(NewDstRC))
9193 return nullptr;
9194
9195 switch (Inst.getOpcode()) {
9196 case AMDGPU::PHI:
9197 case AMDGPU::REG_SEQUENCE:
9198 case AMDGPU::INSERT_SUBREG:
9199 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9200 break;
9201 default:
9202 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9203 }
9204
9205 if (!NewDstRC)
9206 return nullptr;
9207 } else {
9208 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9209 return nullptr;
9210
9211 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9212 if (!NewDstRC)
9213 return nullptr;
9214 }
9215
9216 return NewDstRC;
9217 }
9218 default:
9219 return NewDstRC;
9220 }
9221}
9222
9223// Find the one SGPR operand we are allowed to use.
9224Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9225 int OpIndices[3]) const {
9226 const MCInstrDesc &Desc = MI.getDesc();
9227
9228 // Find the one SGPR operand we are allowed to use.
9229 //
9230 // First we need to consider the instruction's operand requirements before
9231 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9232 // of VCC, but we are still bound by the constant bus requirement to only use
9233 // one.
9234 //
9235 // If the operand's class is an SGPR, we can never move it.
9236
9237 Register SGPRReg = findImplicitSGPRRead(MI);
9238 if (SGPRReg)
9239 return SGPRReg;
9240
9241 Register UsedSGPRs[3] = {Register()};
9242 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9243
9244 for (unsigned i = 0; i < 3; ++i) {
9245 int Idx = OpIndices[i];
9246 if (Idx == -1)
9247 break;
9248
9249 const MachineOperand &MO = MI.getOperand(Idx);
9250 if (!MO.isReg())
9251 continue;
9252
9253 // Is this operand statically required to be an SGPR based on the operand
9254 // constraints?
9255 const TargetRegisterClass *OpRC =
9256 RI.getRegClass(Desc.operands()[Idx].RegClass);
9257 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9258 if (IsRequiredSGPR)
9259 return MO.getReg();
9260
9261 // If this could be a VGPR or an SGPR, Check the dynamic register class.
9262 Register Reg = MO.getReg();
9263 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9264 if (RI.isSGPRClass(RegRC))
9265 UsedSGPRs[i] = Reg;
9266 }
9267
9268 // We don't have a required SGPR operand, so we have a bit more freedom in
9269 // selecting operands to move.
9270
9271 // Try to select the most used SGPR. If an SGPR is equal to one of the
9272 // others, we choose that.
9273 //
9274 // e.g.
9275 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9276 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9277
9278 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9279 // prefer those.
9280
9281 if (UsedSGPRs[0]) {
9282 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9283 SGPRReg = UsedSGPRs[0];
9284 }
9285
9286 if (!SGPRReg && UsedSGPRs[1]) {
9287 if (UsedSGPRs[1] == UsedSGPRs[2])
9288 SGPRReg = UsedSGPRs[1];
9289 }
9290
9291 return SGPRReg;
9292}
9293
9294MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9295 AMDGPU::OpName OperandName) const {
9296 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9297 return nullptr;
9298
9299 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9300 if (Idx == -1)
9301 return nullptr;
9302
9303 return &MI.getOperand(Idx);
9304}
9305
9306uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9307 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9308 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9309 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9310 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9311 return (Format << 44) |
9312 (1ULL << 56) | // RESOURCE_LEVEL = 1
9313 (3ULL << 60); // OOB_SELECT = 3
9314 }
9315
9316 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9317 if (ST.isAmdHsaOS()) {
9318 // Set ATC = 1. GFX9 doesn't have this bit.
9319 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9320 RsrcDataFormat |= (1ULL << 56);
9321
9322 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9323 // BTW, it disables TC L2 and therefore decreases performance.
9324 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9325 RsrcDataFormat |= (2ULL << 59);
9326 }
9327
9328 return RsrcDataFormat;
9329}
9330
9331uint64_t SIInstrInfo::getScratchRsrcWords23() const {
9332 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
9333 AMDGPU::RSRC_TID_ENABLE |
9334 0xffffffff; // Size;
9335
9336 // GFX9 doesn't have ELEMENT_SIZE.
9337 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9338 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9339 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9340 }
9341
9342 // IndexStride = 64 / 32.
9343 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9344 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9345
9346 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9347 // Clear them unless we want a huge stride.
9348 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9349 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9350 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9351
9352 return Rsrc23;
9353}
9354
9355bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
9356 unsigned Opc = MI.getOpcode();
9357
9358 return isSMRD(Opc);
9359}
9360
9361bool SIInstrInfo::isHighLatencyDef(int Opc) const {
9362 return get(Opc).mayLoad() &&
9363 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9364}
9365
9366Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
9367 int &FrameIndex) const {
9368 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9369 if (!Addr || !Addr->isFI())
9370 return Register();
9371
9372 assert(!MI.memoperands_empty() &&
9373 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9374
9375 FrameIndex = Addr->getIndex();
9376 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
9377}
9378
9379Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
9380 int &FrameIndex) const {
9381 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9382 assert(Addr && Addr->isFI());
9383 FrameIndex = Addr->getIndex();
9384 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
9385}
9386
9387Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9388 int &FrameIndex) const {
9389 if (!MI.mayLoad())
9390 return Register();
9391
9392 if (isMUBUF(MI) || isVGPRSpill(MI))
9393 return isStackAccess(MI, FrameIndex);
9394
9395 if (isSGPRSpill(MI))
9396 return isSGPRStackAccess(MI, FrameIndex);
9397
9398 return Register();
9399}
9400
9401Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9402 int &FrameIndex) const {
9403 if (!MI.mayStore())
9404 return Register();
9405
9406 if (isMUBUF(MI) || isVGPRSpill(MI))
9407 return isStackAccess(MI, FrameIndex);
9408
9409 if (isSGPRSpill(MI))
9410 return isSGPRStackAccess(MI, FrameIndex);
9411
9412 return Register();
9413}
9414
9415unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9416 unsigned Size = 0;
9417 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9418 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9419 while (++I != E && I->isInsideBundle()) {
9420 assert(!I->isBundle() && "No nested bundle!");
9421 Size += getInstSizeInBytes(*I);
9422 }
9423
9424 return Size;
9425}
9426
9427unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9428 unsigned Opc = MI.getOpcode();
9429 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
9430 unsigned DescSize = Desc.getSize();
9431
9432 // If we have a definitive size, we can use it. Otherwise we need to inspect
9433 // the operands to know the size.
9434 if (isFixedSize(MI)) {
9435 unsigned Size = DescSize;
9436
9437 // If we hit the buggy offset, an extra nop will be inserted in MC so
9438 // estimate the worst case.
9439 if (MI.isBranch() && ST.hasOffset3fBug())
9440 Size += 4;
9441
9442 return Size;
9443 }
9444
9445 // Instructions may have a 32-bit literal encoded after them. Check
9446 // operands that could ever be literals.
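// For example, a V_MOV_B32 with a non-inline immediate such as 0x12345
// carries a trailing 32-bit literal dword, so its size is DescSize + 4
// (illustrative of the common case handled below).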
9447 if (isVALU(MI) || isSALU(MI)) {
9448 if (isDPP(MI))
9449 return DescSize;
9450 bool HasLiteral = false;
9451 unsigned LiteralSize = 4;
9452 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9453 const MachineOperand &Op = MI.getOperand(I);
9454 const MCOperandInfo &OpInfo = Desc.operands()[I];
9455 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9456 HasLiteral = true;
9457 if (ST.has64BitLiterals()) {
9458 switch (OpInfo.OperandType) {
9459 default:
9460 break;
9462 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9463 LiteralSize = 8;
9464 break;
9466 if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
9467 LiteralSize = 8;
9468 break;
9469 }
9470 }
9471 break;
9472 }
9473 }
9474 return HasLiteral ? DescSize + LiteralSize : DescSize;
9475 }
9476
9477 // Check whether we have extra NSA words.
9478 if (isMIMG(MI)) {
9479 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9480 if (VAddr0Idx < 0)
9481 return 8;
9482
9483 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9484 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9485 }
9486
9487 switch (Opc) {
9488 case TargetOpcode::BUNDLE:
9489 return getInstBundleSize(MI);
9490 case TargetOpcode::INLINEASM:
9491 case TargetOpcode::INLINEASM_BR: {
9492 const MachineFunction *MF = MI.getParent()->getParent();
9493 const char *AsmStr = MI.getOperand(0).getSymbolName();
9494 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9495 }
9496 default:
9497 if (MI.isMetaInstruction())
9498 return 0;
9499
9500 // If D16 Pseudo inst, get correct MC code size
9501 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9502 if (D16Info) {
9503 // Assume d16_lo/hi insts are always the same size.
9504 unsigned LoInstOpcode = D16Info->LoOp;
9505 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9506 DescSize = Desc.getSize();
9507 }
9508
9509 return DescSize;
9510 }
9511}
9512
9513bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
9514 if (!isFLAT(MI))
9515 return false;
9516
9517 if (MI.memoperands_empty())
9518 return true;
9519
9520 for (const MachineMemOperand *MMO : MI.memoperands()) {
9521 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9522 return true;
9523 }
9524 return false;
9525}
9526
9527ArrayRef<std::pair<int, const char *>>
9528SIInstrInfo::getSerializableTargetIndices() const {
9529 static const std::pair<int, const char *> TargetIndices[] = {
9530 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9531 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9532 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9533 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9534 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9535 return ArrayRef(TargetIndices);
9536}
9537
9538/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
9539/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9540ScheduleHazardRecognizer *
9541SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
9542 const ScheduleDAG *DAG) const {
9543 return new GCNHazardRecognizer(DAG->MF);
9544}
9545
9546/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9547/// pass.
9548ScheduleHazardRecognizer *SIInstrInfo::CreateTargetPostRAHazardRecognizer(
9549 const MachineFunction &MF) const {
9550 return new GCNHazardRecognizer(MF);
9551}
9552
9553// Called during:
9554// - pre-RA scheduling and post-RA scheduling
9555ScheduleHazardRecognizer *
9556SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
9557 const ScheduleDAGMI *DAG) const {
9558 // Borrowed from Arm Target
9559 // We would like to restrict this hazard recognizer to only
9560 // post-RA scheduling; we can tell that we're post-RA because we don't
9561 // track VRegLiveness.
9562 if (!DAG->hasVRegLiveness())
9563 return new GCNHazardRecognizer(DAG->MF);
9564 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
9565}
9566
9567std::pair<unsigned, unsigned>
9568SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9569 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9570}
9571
9572ArrayRef<std::pair<unsigned, const char *>>
9573SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9574 static const std::pair<unsigned, const char *> TargetFlags[] = {
9575 {MO_GOTPCREL, "amdgpu-gotprel"},
9576 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9577 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9578 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9579 {MO_REL32_LO, "amdgpu-rel32-lo"},
9580 {MO_REL32_HI, "amdgpu-rel32-hi"},
9581 {MO_REL64, "amdgpu-rel64"},
9582 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9583 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9584 {MO_ABS64, "amdgpu-abs64"},
9585 };
9586
9587 return ArrayRef(TargetFlags);
9588}
9589
9590ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9591SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9592 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9593 {
9594 {MONoClobber, "amdgpu-noclobber"},
9595 {MOLastUse, "amdgpu-last-use"},
9596 {MOCooperative, "amdgpu-cooperative"},
9597 };
9598
9599 return ArrayRef(TargetFlags);
9600}
9601
9602unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
9603 const MachineFunction &MF) const {
9604 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9605 assert(SrcReg.isVirtual());
9606 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9607 return AMDGPU::WWM_COPY;
9608
9609 return AMDGPU::COPY;
9610}
9611
9612bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
9613 Register Reg) const {
9614 // We need to handle instructions which may be inserted during register
9615 // allocation to handle the prolog. The initial prolog instruction may have
9616 // been separated from the start of the block by spills and copies inserted
9617 // needed by the prolog. However, the insertions for scalar registers can
9618 // always be placed at the BB top as they are independent of the exec mask
9619 // value.
9620 const MachineFunction *MF = MI.getParent()->getParent();
9621 bool IsNullOrVectorRegister = true;
9622 if (Reg) {
9623 const MachineRegisterInfo &MRI = MF->getRegInfo();
9624 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9625 }
9626
9627 uint16_t Opcode = MI.getOpcode();
9628 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
9629 return IsNullOrVectorRegister &&
9630 (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
9631 (Opcode == AMDGPU::IMPLICIT_DEF &&
9632 MFI->isWWMReg(MI.getOperand(0).getReg())) ||
9633 (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
9634 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9635}
9636
9637MachineInstrBuilder
9638SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9639 MachineBasicBlock::iterator I,
9640 const DebugLoc &DL,
9641 Register DestReg) const {
9642 if (ST.hasAddNoCarry())
9643 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9644
9645 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9646 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9647 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9648
9649 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9650 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9651}
9652
9653MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9654 MachineBasicBlock::iterator I,
9655 const DebugLoc &DL,
9656 Register DestReg,
9657 RegScavenger &RS) const {
9658 if (ST.hasAddNoCarry())
9659 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9660
9661 // If available, prefer to use vcc.
9662 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9663 ? Register(RI.getVCC())
9664 : RS.scavengeRegisterBackwards(
9665 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9666 0, /* AllowSpill */ false);
9667
9668 // TODO: Users need to deal with this.
9669 if (!UnusedCarry.isValid())
9670 return MachineInstrBuilder();
9671
9672 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9673 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9674}
9675
9676bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9677 switch (Opcode) {
9678 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9679 case AMDGPU::SI_KILL_I1_TERMINATOR:
9680 return true;
9681 default:
9682 return false;
9683 }
9684}
9685
9686const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
9687 switch (Opcode) {
9688 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9689 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9690 case AMDGPU::SI_KILL_I1_PSEUDO:
9691 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9692 default:
9693 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9694 }
9695}
9696
9697bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9698 return Imm <= getMaxMUBUFImmOffset(ST);
9699}
9700
9701unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
9702 // GFX12 has a 24-bit signed byte-offset field; only the non-negative range is usable here.
9703 const unsigned OffsetBits =
9704 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9705 return (1 << OffsetBits) - 1;
9706}
9707
9708void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
9709 if (!ST.isWave32())
9710 return;
9711
9712 if (MI.isInlineAsm())
9713 return;
9714
9715 for (auto &Op : MI.implicit_operands()) {
9716 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9717 Op.setReg(AMDGPU::VCC_LO);
9718 }
9719}
9720
9721bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
9722 if (!isSMRD(MI))
9723 return false;
9724
9725 // Check that it is using a buffer resource.
9726 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9727 if (Idx == -1) // e.g. s_memtime
9728 return false;
9729
9730 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
9731 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9732}
9733
9734// Given Imm, split it into the values to put into the SOffset and ImmOffset
9735// fields in an MUBUF instruction. Return false if it is not possible (due to a
9736// hardware bug needing a workaround).
9737//
9738// The required alignment ensures that individual address components remain
9739// aligned if they are aligned to begin with. It also ensures that additional
9740// offsets within the given alignment can be added to the resulting ImmOffset.
9741bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
9742 uint32_t &ImmOffset, Align Alignment) const {
9743 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9744 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9745 uint32_t Overflow = 0;
9746
9747 if (Imm > MaxImm) {
9748 if (Imm <= MaxImm + 64) {
9749 // Use an SOffset inline constant for 4..64
9750 Overflow = Imm - MaxImm;
9751 Imm = MaxImm;
9752 } else {
9753 // Try to keep the same value in SOffset for adjacent loads, so that
9754 // the corresponding register contents can be re-used.
9755 //
9756 // Load values with all low-bits (except for alignment bits) set into
9757 // SOffset, so that a larger range of values can be covered using
9758 // s_movk_i32.
9759 //
9760 // Atomic operations fail to work correctly when individual address
9761 // components are unaligned, even if their sum is aligned.
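// Worked example (illustrative, 12-bit ImmOffset, 4-byte alignment):
// Imm = 0x5000 gives High = 0x5000 and Low = 0x4, so SOffset = 0x4ffc and
// ImmOffset = 0x4; a nearby Imm = 0x5abc reuses SOffset = 0x4ffc with
// ImmOffset = 0xac0.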
9762 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9763 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9764 Imm = Low;
9765 Overflow = High - Alignment.value();
9766 }
9767 }
9768
9769 if (Overflow > 0) {
9770 // There is a hardware bug in SI and CI which prevents address clamping in
9771 // MUBUF instructions from working correctly with SOffsets. The immediate
9772 // offset is unaffected.
9773 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
9774 return false;
9775
9776 // It is not possible to set immediate in SOffset field on some targets.
9777 if (ST.hasRestrictedSOffset())
9778 return false;
9779 }
9780
9781 ImmOffset = Imm;
9782 SOffset = Overflow;
9783 return true;
9784}
9785
9786// Depending on the used address space and instructions, some immediate offsets
9787// are allowed and some are not.
9788// Pre-GFX12, flat instruction offsets can only be non-negative, global and
9789// scratch instruction offsets can also be negative. On GFX12, offsets can be
9790// negative for all variants.
9791//
9792// There are several bugs related to these offsets:
9793// On gfx10.1, flat instructions that go into the global address space cannot
9794// use an offset.
9795//
9796// For scratch instructions, the address can be either an SGPR or a VGPR.
9797// The following offsets can be used, depending on the architecture (x means
9798// cannot be used):
9799// +----------------------------+------+------+
9800// | Address-Mode | SGPR | VGPR |
9801// +----------------------------+------+------+
9802// | gfx9 | | |
9803// | negative, 4-aligned offset | x | ok |
9804// | negative, unaligned offset | x | ok |
9805// +----------------------------+------+------+
9806// | gfx10 | | |
9807// | negative, 4-aligned offset | ok | ok |
9808// | negative, unaligned offset | ok | x |
9809// +----------------------------+------+------+
9810// | gfx10.3 | | |
9811// | negative, 4-aligned offset | ok | ok |
9812// | negative, unaligned offset | ok | ok |
9813// +----------------------------+------+------+
9814//
9815// This function ignores the addressing mode, so if an offset cannot be used in
9816// one addressing mode, it is considered illegal.
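// For example (illustrative): on a target with the negative-unaligned-scratch
// bug, a scratch offset of -6 is rejected below, while -8 is accepted as long
// as it fits in the signed offset field.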
9817bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9818 uint64_t FlatVariant) const {
9819 // TODO: Should 0 be special cased?
9820 if (!ST.hasFlatInstOffsets())
9821 return false;
9822
9823 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9824 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9825 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9826 return false;
9827
9828 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9829 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9830 (Offset % 4) != 0) {
9831 return false;
9832 }
9833
9834 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9835 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9836 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9837}
9838
9839// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
9840std::pair<int64_t, int64_t>
9841SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9842 uint64_t FlatVariant) const {
9843 int64_t RemainderOffset = COffsetVal;
9844 int64_t ImmField = 0;
9845
9846 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9847 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9848
9849 if (AllowNegative) {
9850 // Use signed division by a power of two to truncate towards 0.
9851 int64_t D = 1LL << NumBits;
9852 RemainderOffset = (COffsetVal / D) * D;
9853 ImmField = COffsetVal - RemainderOffset;
9854
9855 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9856 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9857 (ImmField % 4) != 0) {
9858 // Make ImmField a multiple of 4
9859 RemainderOffset += ImmField % 4;
9860 ImmField -= ImmField % 4;
9861 }
9862 } else if (COffsetVal >= 0) {
9863 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9864 RemainderOffset = COffsetVal - ImmField;
9865 }
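// Illustrative example: with a 13-bit signed offset field (NumBits == 12) and
// COffsetVal == 0x12345, the non-negative path yields ImmField = 0x345 and
// RemainderOffset = 0x12000.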
9866
9867 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9868 assert(RemainderOffset + ImmField == COffsetVal);
9869 return {ImmField, RemainderOffset};
9870}
9871
9872bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
9873 if (ST.hasNegativeScratchOffsetBug() &&
9874 FlatVariant == SIInstrFlags::FlatScratch)
9875 return false;
9876
9877 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9878}
9879
9880static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9881 switch (ST.getGeneration()) {
9882 default:
9883 break;
9884 case AMDGPUSubtarget::SOUTHERN_ISLANDS:
9885 case AMDGPUSubtarget::SEA_ISLANDS:
9886 return SIEncodingFamily::SI;
9887 case AMDGPUSubtarget::VOLCANIC_ISLANDS:
9888 case AMDGPUSubtarget::GFX9:
9889 return SIEncodingFamily::VI;
9890 case AMDGPUSubtarget::GFX10:
9891 return SIEncodingFamily::GFX10;
9892 case AMDGPUSubtarget::GFX11:
9893 return SIEncodingFamily::GFX11;
9894 case AMDGPUSubtarget::GFX12:
9895 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
9896 : SIEncodingFamily::GFX12;
9897 }
9898 llvm_unreachable("Unknown subtarget generation!");
9899}
9900
9901bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9902 switch(MCOp) {
9903 // These opcodes use indirect register addressing so
9904 // they need special handling by codegen (currently missing).
9905 // Therefore it is too risky to allow these opcodes
9906 // to be selected by dpp combiner or sdwa peepholer.
9907 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9908 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9909 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9910 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9911 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9912 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9913 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9914 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9915 return true;
9916 default:
9917 return false;
9918 }
9919}
9920
9921#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
9922 case OPCODE##_dpp: \
9923 case OPCODE##_e32: \
9924 case OPCODE##_e64: \
9925 case OPCODE##_e64_dpp: \
9926 case OPCODE##_sdwa:
9927
9928static bool isRenamedInGFX9(int Opcode) {
9929 switch (Opcode) {
9930 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
9931 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
9932 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
9933 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
9934 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
9935 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
9936 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
9937 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
9938 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
9939 //
9940 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
9941 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
9942 case AMDGPU::V_FMA_F16_gfx9_e64:
9943 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
9944 case AMDGPU::V_INTERP_P2_F16:
9945 case AMDGPU::V_MAD_F16_e64:
9946 case AMDGPU::V_MAD_U16_e64:
9947 case AMDGPU::V_MAD_I16_e64:
9948 return true;
9949 default:
9950 return false;
9951 }
9952}
9953
9954int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9955 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9956
9957 unsigned Gen = subtargetEncodingFamily(ST);
9958
9959 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
9960 Gen = SIEncodingFamily::GFX9;
9961
9962 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9963 // subtarget has UnpackedD16VMem feature.
9964 // TODO: remove this when we discard GFX80 encoding.
9965 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
9966 Gen = SIEncodingFamily::GFX80;
9967
9968 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
9969 switch (ST.getGeneration()) {
9970 default:
9971 Gen = SIEncodingFamily::SDWA;
9972 break;
9973 case AMDGPUSubtarget::GFX9:
9974 Gen = SIEncodingFamily::SDWA9;
9975 break;
9976 case AMDGPUSubtarget::GFX10:
9977 Gen = SIEncodingFamily::SDWA10;
9978 break;
9979 }
9980 }
9981
9982 if (isMAI(Opcode)) {
9983 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
9984 if (MFMAOp != -1)
9985 Opcode = MFMAOp;
9986 }
9987
9988 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
9989
9990 if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
9992
9993 // -1 means that Opcode is already a native instruction.
9994 if (MCOp == -1)
9995 return Opcode;
9996
9997 if (ST.hasGFX90AInsts()) {
9998 uint16_t NMCOp = (uint16_t)-1;
9999 if (ST.hasGFX940Insts())
10000 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
10001 if (NMCOp == (uint16_t)-1)
10002 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
10003 if (NMCOp == (uint16_t)-1)
10004 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
10005 if (NMCOp != (uint16_t)-1)
10006 MCOp = NMCOp;
10007 }
10008
10009 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
10010 // no encoding in the given subtarget generation.
10011 if (MCOp == (uint16_t)-1)
10012 return -1;
10013
10014 if (isAsmOnlyOpcode(MCOp))
10015 return -1;
10016
10017 return MCOp;
10018}
10019
10020static
10021TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
10022 assert(RegOpnd.isReg());
10023 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10024 getRegSubRegPair(RegOpnd);
10025}
10026
10027TargetInstrInfo::RegSubRegPair llvm::getRegSequenceSubReg(MachineInstr &MI,
10028 unsigned SubReg) {
10029 assert(MI.isRegSequence());
10030 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10031 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10032 auto &RegOp = MI.getOperand(1 + 2 * I);
10033 return getRegOrUndef(RegOp);
10034 }
10035 return TargetInstrInfo::RegSubRegPair();
10036}
10037
10038// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10039// Following a subreg of reg:subreg isn't supported
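// For example, for %2:sub1 where %2 = REG_SEQUENCE %0, sub0, %1, sub1, the
// walk continues with %1 (illustrative).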
10040static bool followSubRegDef(MachineInstr &MI,
10041 TargetInstrInfo::RegSubRegPair &RSR) {
10042 if (!RSR.SubReg)
10043 return false;
10044 switch (MI.getOpcode()) {
10045 default: break;
10046 case AMDGPU::REG_SEQUENCE:
10047 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10048 return true;
10049 // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
10050 case AMDGPU::INSERT_SUBREG:
10051 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10052 // inserted the subreg we're looking for
10053 RSR = getRegOrUndef(MI.getOperand(2));
10054 else { // the subreg in the rest of the reg
10055 auto R1 = getRegOrUndef(MI.getOperand(1));
10056 if (R1.SubReg) // subreg of subreg isn't supported
10057 return false;
10058 RSR.Reg = R1.Reg;
10059 }
10060 return true;
10061 }
10062 return false;
10063}
10064
10065MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
10066 MachineRegisterInfo &MRI) {
10067 assert(MRI.isSSA());
10068 if (!P.Reg.isVirtual())
10069 return nullptr;
10070
10071 auto RSR = P;
10072 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10073 while (auto *MI = DefInst) {
10074 DefInst = nullptr;
10075 switch (MI->getOpcode()) {
10076 case AMDGPU::COPY:
10077 case AMDGPU::V_MOV_B32_e32: {
10078 auto &Op1 = MI->getOperand(1);
10079 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10080 if (Op1.isUndef())
10081 return nullptr;
10082 RSR = getRegSubRegPair(Op1);
10083 DefInst = MRI.getVRegDef(RSR.Reg);
10084 }
10085 break;
10086 }
10087 default:
10088 if (followSubRegDef(*MI, RSR)) {
10089 if (!RSR.Reg)
10090 return nullptr;
10091 DefInst = MRI.getVRegDef(RSR.Reg);
10092 }
10093 }
10094 if (!DefInst)
10095 return MI;
10096 }
10097 return nullptr;
10098}
10099
10100bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
10101 Register VReg,
10102 const MachineInstr &DefMI,
10103 const MachineInstr &UseMI) {
10104 assert(MRI.isSSA() && "Must be run on SSA");
10105
10106 auto *TRI = MRI.getTargetRegisterInfo();
10107 auto *DefBB = DefMI.getParent();
10108
10109 // Don't bother searching between blocks, although it is possible this block
10110 // doesn't modify exec.
10111 if (UseMI.getParent() != DefBB)
10112 return true;
10113
10114 const int MaxInstScan = 20;
10115 int NumInst = 0;
10116
10117 // Stop scan at the use.
10118 auto E = UseMI.getIterator();
10119 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10120 if (I->isDebugInstr())
10121 continue;
10122
10123 if (++NumInst > MaxInstScan)
10124 return true;
10125
10126 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10127 return true;
10128 }
10129
10130 return false;
10131}
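// Illustrative sketch, not from the original file: a typical caller only
// folds or sinks a VALU-produced value when EXEC is provably unchanged
// between the definition and the single use being rewritten:
//
//   if (!execMayBeModifiedBeforeUse(MRI, Reg, *DefMI, *UseMI)) {
//     // Safe: UseMI observes DefMI's result under the same exec mask
//     // (conservatively limited to the same block and a short scan).
//   }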
10132
10133 bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
10134 Register VReg,
10135 const MachineInstr &DefMI) {
10136 assert(MRI.isSSA() && "Must be run on SSA");
10137
10138 auto *TRI = MRI.getTargetRegisterInfo();
10139 auto *DefBB = DefMI.getParent();
10140
10141 const int MaxUseScan = 10;
10142 int NumUse = 0;
10143
10144 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10145 auto &UseInst = *Use.getParent();
10146 // Don't bother searching between blocks, although it is possible this block
10147 // doesn't modify exec.
10148 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10149 return true;
10150
10151 if (++NumUse > MaxUseScan)
10152 return true;
10153 }
10154
10155 if (NumUse == 0)
10156 return false;
10157
10158 const int MaxInstScan = 20;
10159 int NumInst = 0;
10160
10161 // Stop scan when we have seen all the uses.
10162 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10163 assert(I != DefBB->end());
10164
10165 if (I->isDebugInstr())
10166 continue;
10167
10168 if (++NumInst > MaxInstScan)
10169 return true;
10170
10171 for (const MachineOperand &Op : I->operands()) {
10172 // We don't check reg masks here as they're used only on calls:
10173 // 1. EXEC is only considered const within one BB
10174 // 2. Call should be a terminator instruction if present in a BB
10175
10176 if (!Op.isReg())
10177 continue;
10178
10179 Register Reg = Op.getReg();
10180 if (Op.isUse()) {
10181 if (Reg == VReg && --NumUse == 0)
10182 return false;
10183 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10184 return true;
10185 }
10186 }
10187}
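// Illustrative sketch, not from the original file: this variant is the query
// to use when the definition itself is about to be rewritten and every use
// must still see it executed under the same exec mask:
//
//   if (!execMayBeModifiedBeforeAnyUse(MRI, DstReg, *DefMI)) {
//     // All non-debug uses of DstReg are in DefMI's block and are reached
//     // before EXEC can change (within the bounded scan above).
//   }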
10188
10189 MachineInstr *SIInstrInfo::createPHIDestinationCopy(
10190 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
10191 const DebugLoc &DL, Register Src, Register Dst) const {
10192 auto Cur = MBB.begin();
10193 if (Cur != MBB.end())
10194 do {
10195 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10196 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10197 ++Cur;
10198 } while (Cur != MBB.end() && Cur != LastPHIIt);
10199
10200 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10201 Dst);
10202}
10203
10204 MachineInstr *SIInstrInfo::createPHISourceCopy(
10205 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
10206 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10207 if (InsPt != MBB.end() &&
10208 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10209 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10210 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10211 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10212 InsPt++;
10213 return BuildMI(MBB, InsPt, DL,
10214 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
10215 : AMDGPU::S_MOV_B64_term),
10216 Dst)
10217 .addReg(Src, 0, SrcSubReg)
10218 .addReg(AMDGPU::EXEC, RegState::Implicit);
10219 }
10220 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10221 Dst);
10222}
10223
10224 bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10225
10226 MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
10227 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
10228 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10229 VirtRegMap *VRM) const {
10230 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10231 //
10232 // %0:sreg_32 = COPY $m0
10233 //
10234 // We explicitly chose SReg_32 for the virtual register so such a copy might
10235 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10236 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10237 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10238 // TargetInstrInfo::foldMemoryOperand() is going to try.
10239 // A similar issue also exists with spilling and reloading $exec registers.
10240 //
10241 // To prevent that, constrain the %0 register class here.
10242 if (isFullCopyInstr(MI)) {
10243 Register DstReg = MI.getOperand(0).getReg();
10244 Register SrcReg = MI.getOperand(1).getReg();
10245 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10246 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10247 MachineRegisterInfo &MRI = MF.getRegInfo();
10248 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10249 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10250 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10251 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10252 return nullptr;
10253 }
10254 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10255 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10256 return nullptr;
10257 }
10258 }
10259 }
10260
10261 return nullptr;
10262}
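// Illustrative outcome, not from the original file: for
//
//   %0:sreg_32 = COPY $m0
//
// a request to fold a spill of %0 produces no folded instruction; instead %0
// is narrowed to SReg_32_XM0_XEXEC (or SReg_64_XEXEC in the 64-bit case) and
// nullptr is returned, which keeps the generic copy-folding path in
// TargetInstrInfo::foldMemoryOperand() from spilling or reloading $m0 or
// $exec directly, since those registers are no longer in %0's class.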
10263
10264 unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
10265 const MachineInstr &MI,
10266 unsigned *PredCost) const {
10267 if (MI.isBundle()) {
10268 MachineBasicBlock::const_instr_iterator I(MI.getIterator());
10269 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10270 unsigned Lat = 0, Count = 0;
10271 for (++I; I != E && I->isBundledWithPred(); ++I) {
10272 ++Count;
10273 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10274 }
10275 return Lat + Count - 1;
10276 }
10277
10278 return SchedModel.computeInstrLatency(&MI);
10279}
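// Worked example, not from the original file: for a BUNDLE of three
// instructions with individual latencies 4, 2 and 1, the loop above computes
// Lat = 4 and Count = 3, so the reported latency is 4 + 3 - 1 = 6, i.e. the
// longest member latency plus one cycle per additional bundled instruction.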
10280
10281 InstructionUniformity
10282 SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
10283 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10284 unsigned opcode = MI.getOpcode();
10285
10286 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10287 Register Dst = MI.getOperand(0).getReg();
10288 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10289 : MI.getOperand(1).getReg();
10290 LLT DstTy = MRI.getType(Dst);
10291 LLT SrcTy = MRI.getType(Src);
10292 unsigned DstAS = DstTy.getAddressSpace();
10293 unsigned SrcAS = SrcTy.getAddressSpace();
10294 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10295 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10296 ST.hasGloballyAddressableScratch()
10297 ? InstructionUniformity::NeverUniform
10298 : InstructionUniformity::Default;
10299 };
10300
10301 // If the target supports globally addressable scratch, the mapping from
10302 // scratch memory to the flat aperture changes; therefore, an address space cast
10303 // is no longer uniform.
10304 if (opcode == TargetOpcode::G_ADDRSPACE_CAST)
10305 return HandleAddrSpaceCast(MI);
10306
10307 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10308 auto IID = GI->getIntrinsicID();
10309 if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
10310 return InstructionUniformity::NeverUniform;
10311 if (AMDGPU::isIntrinsicAlwaysUniform(IID))
10312 return InstructionUniformity::AlwaysUniform;
10313
10314 switch (IID) {
10315 case Intrinsic::amdgcn_addrspacecast_nonnull:
10316 return HandleAddrSpaceCast(MI);
10317 case Intrinsic::amdgcn_if:
10318 case Intrinsic::amdgcn_else:
10319 // FIXME: Uniform if second result
10320 break;
10321 }
10322
10323 return InstructionUniformity::Default;
10324 }
10325
10326 // Loads from the private and flat address spaces are divergent, because
10327 // threads can execute the load instruction with the same inputs and get
10328 // different results.
10329 //
10330 // All other loads are not divergent, because if threads issue loads with the
10331 // same arguments, they will always get the same result.
10332 if (opcode == AMDGPU::G_LOAD) {
10333 if (MI.memoperands_empty())
10334 return InstructionUniformity::NeverUniform; // conservative assumption
10335
10336 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10337 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10338 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10339 })) {
10340 // At least one MMO in a non-global address space.
10341 return InstructionUniformity::NeverUniform;
10342 }
10343 return InstructionUniformity::Default;
10344 }
10345
10346 if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
10347 opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10348 opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10349 AMDGPU::isGenericAtomic(opcode)) {
10350 return InstructionUniformity::NeverUniform;
10351 }
10352 return InstructionUniformity::Default;
10353}
10354
10355 InstructionUniformity
10356 SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
10357
10358 if (isNeverUniform(MI))
10359 return InstructionUniformity::NeverUniform;
10360
10361 unsigned opcode = MI.getOpcode();
10362 if (opcode == AMDGPU::V_READLANE_B32 ||
10363 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10364 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10365 return InstructionUniformity::AlwaysUniform;
10366
10367 if (isCopyInstr(MI)) {
10368 const MachineOperand &srcOp = MI.getOperand(1);
10369 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10370 const TargetRegisterClass *regClass =
10371 RI.getPhysRegBaseClass(srcOp.getReg());
10372 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10373 : InstructionUniformity::NeverUniform;
10374 }
10375 return InstructionUniformity::Default;
10376 }
10377
10378 // GMIR handling
10379 if (MI.isPreISelOpcode())
10380 return getGenericInstructionUniformity(MI);
10381
10382 // Atomics are divergent because they are executed sequentially: when an
10383 // atomic operation refers to the same address in each thread, then each
10384 // thread after the first sees the value written by the previous thread as
10385 // the original value.
10386
10387 if (isAtomic(MI))
10388 return InstructionUniformity::NeverUniform;
10389
10390 // Loads from the private and flat address spaces are divergent, because
10391 // threads can execute the load instruction with the same inputs and get
10392 // different results.
10393 if (isFLAT(MI) && MI.mayLoad()) {
10394 if (MI.memoperands_empty())
10395 return InstructionUniformity::NeverUniform; // conservative assumption
10396
10397 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10398 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10399 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10400 })) {
10401 // At least one MMO in a non-global address space.
10402 return InstructionUniformity::NeverUniform;
10403 }
10404
10405 return InstructionUniformity::Default;
10406 }
10407
10408 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
10409 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10410
10411 // FIXME: It's conceptually broken to report this for an instruction, and not
10412 // a specific def operand. For inline asm in particular, there could be mixed
10413 // uniform and divergent results.
10414 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10415 const MachineOperand &SrcOp = MI.getOperand(I);
10416 if (!SrcOp.isReg())
10417 continue;
10418
10419 Register Reg = SrcOp.getReg();
10420 if (!Reg || !SrcOp.readsReg())
10421 continue;
10422
10423 // If RegBank is null, this is unassigned or an unallocatable special
10424 // register, which are all scalars.
10425 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10426 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10427 return InstructionUniformity::NeverUniform;
10428
10429
10430 // TODO: Uniformity check conditions above can be rearranged for more
10431 // readability
10432
10433 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10434 // currently turned into no-op COPYs by SelectionDAG ISel and are
10435 // therefore no longer recognizable.
10436
10437 return InstructionUniformity::Default;
10438}
10439
10440 unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
10441 switch (MF.getFunction().getCallingConv()) {
10442 case CallingConv::AMDGPU_PS:
10443 return 1;
10444 case CallingConv::AMDGPU_VS:
10445 return 2;
10446 case CallingConv::AMDGPU_GS:
10447 return 3;
10448 case CallingConv::AMDGPU_HS:
10449 case CallingConv::AMDGPU_LS:
10450 case CallingConv::AMDGPU_ES: {
10451 const Function &F = MF.getFunction();
10452 F.getContext().diagnose(DiagnosticInfoUnsupported(
10453 F, "ds_ordered_count unsupported for this calling conv"));
10454 [[fallthrough]];
10455 }
10456 case CallingConv::AMDGPU_CS:
10457 case CallingConv::AMDGPU_KERNEL:
10458 case CallingConv::C:
10459 case CallingConv::Fast:
10460 default:
10461 // Assume other calling conventions are various compute callable functions
10462 return 0;
10463 }
10464}
10465
10466 bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
10467 Register &SrcReg2, int64_t &CmpMask,
10468 int64_t &CmpValue) const {
10469 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10470 return false;
10471
10472 switch (MI.getOpcode()) {
10473 default:
10474 break;
10475 case AMDGPU::S_CMP_EQ_U32:
10476 case AMDGPU::S_CMP_EQ_I32:
10477 case AMDGPU::S_CMP_LG_U32:
10478 case AMDGPU::S_CMP_LG_I32:
10479 case AMDGPU::S_CMP_LT_U32:
10480 case AMDGPU::S_CMP_LT_I32:
10481 case AMDGPU::S_CMP_GT_U32:
10482 case AMDGPU::S_CMP_GT_I32:
10483 case AMDGPU::S_CMP_LE_U32:
10484 case AMDGPU::S_CMP_LE_I32:
10485 case AMDGPU::S_CMP_GE_U32:
10486 case AMDGPU::S_CMP_GE_I32:
10487 case AMDGPU::S_CMP_EQ_U64:
10488 case AMDGPU::S_CMP_LG_U64:
10489 SrcReg = MI.getOperand(0).getReg();
10490 if (MI.getOperand(1).isReg()) {
10491 if (MI.getOperand(1).getSubReg())
10492 return false;
10493 SrcReg2 = MI.getOperand(1).getReg();
10494 CmpValue = 0;
10495 } else if (MI.getOperand(1).isImm()) {
10496 SrcReg2 = Register();
10497 CmpValue = MI.getOperand(1).getImm();
10498 } else {
10499 return false;
10500 }
10501 CmpMask = ~0;
10502 return true;
10503 case AMDGPU::S_CMPK_EQ_U32:
10504 case AMDGPU::S_CMPK_EQ_I32:
10505 case AMDGPU::S_CMPK_LG_U32:
10506 case AMDGPU::S_CMPK_LG_I32:
10507 case AMDGPU::S_CMPK_LT_U32:
10508 case AMDGPU::S_CMPK_LT_I32:
10509 case AMDGPU::S_CMPK_GT_U32:
10510 case AMDGPU::S_CMPK_GT_I32:
10511 case AMDGPU::S_CMPK_LE_U32:
10512 case AMDGPU::S_CMPK_LE_I32:
10513 case AMDGPU::S_CMPK_GE_U32:
10514 case AMDGPU::S_CMPK_GE_I32:
10515 SrcReg = MI.getOperand(0).getReg();
10516 SrcReg2 = Register();
10517 CmpValue = MI.getOperand(1).getImm();
10518 CmpMask = ~0;
10519 return true;
10520 }
10521
10522 return false;
10523}
10524
10525 bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
10526 Register SrcReg2, int64_t CmpMask,
10527 int64_t CmpValue,
10528 const MachineRegisterInfo *MRI) const {
10529 if (!SrcReg || SrcReg.isPhysical())
10530 return false;
10531
10532 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
10533 return false;
10534
10535 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
10536 this](int64_t ExpectedValue, unsigned SrcSize,
10537 bool IsReversible, bool IsSigned) -> bool {
10538 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10539 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10540 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10541 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10542 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
10543 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10544 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10545 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10546 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10547 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
10548 //
10549 // Signed ge/gt are not used for the sign bit.
10550 //
10551 // If result of the AND is unused except in the compare:
10552 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
10553 //
10554 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10555 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10556 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
10557 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10558 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10559 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
10560
10561 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
10562 if (!Def || Def->getParent() != CmpInstr.getParent())
10563 return false;
10564
10565 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
10566 Def->getOpcode() != AMDGPU::S_AND_B64)
10567 return false;
10568
10569 int64_t Mask;
10570 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
10571 if (MO->isImm())
10572 Mask = MO->getImm();
10573 else if (!getFoldableImm(MO, Mask))
10574 return false;
10575 Mask &= maxUIntN(SrcSize);
10576 return isPowerOf2_64(Mask);
10577 };
10578
10579 MachineOperand *SrcOp = &Def->getOperand(1);
10580 if (isMask(SrcOp))
10581 SrcOp = &Def->getOperand(2);
10582 else if (isMask(&Def->getOperand(2)))
10583 SrcOp = &Def->getOperand(1);
10584 else
10585 return false;
10586
10587 // A valid Mask is required to have a single bit set, hence a non-zero and
10588 // power-of-two value. This verifies that we will not do a 64-bit shift below.
10589 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
10590 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
10591 if (IsSigned && BitNo == SrcSize - 1)
10592 return false;
10593
10594 ExpectedValue <<= BitNo;
10595
10596 bool IsReversedCC = false;
10597 if (CmpValue != ExpectedValue) {
10598 if (!IsReversible)
10599 return false;
10600 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
10601 if (!IsReversedCC)
10602 return false;
10603 }
10604
10605 Register DefReg = Def->getOperand(0).getReg();
10606 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
10607 return false;
10608
10609 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
10610 I != E; ++I) {
10611 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
10612 I->killsRegister(AMDGPU::SCC, &RI))
10613 return false;
10614 }
10615
10616 MachineOperand *SccDef =
10617 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
10618 SccDef->setIsDead(false);
10619 CmpInstr.eraseFromParent();
10620
10621 if (!MRI->use_nodbg_empty(DefReg)) {
10622 assert(!IsReversedCC);
10623 return true;
10624 }
10625
10626 // Replace AND with unused result with a S_BITCMP.
10627 MachineBasicBlock *MBB = Def->getParent();
10628
10629 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
10630 : AMDGPU::S_BITCMP1_B32
10631 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
10632 : AMDGPU::S_BITCMP1_B64;
10633
10634 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
10635 .add(*SrcOp)
10636 .addImm(BitNo);
10637 Def->eraseFromParent();
10638
10639 return true;
10640 };
10641
10642 switch (CmpInstr.getOpcode()) {
10643 default:
10644 break;
10645 case AMDGPU::S_CMP_EQ_U32:
10646 case AMDGPU::S_CMP_EQ_I32:
10647 case AMDGPU::S_CMPK_EQ_U32:
10648 case AMDGPU::S_CMPK_EQ_I32:
10649 return optimizeCmpAnd(1, 32, true, false);
10650 case AMDGPU::S_CMP_GE_U32:
10651 case AMDGPU::S_CMPK_GE_U32:
10652 return optimizeCmpAnd(1, 32, false, false);
10653 case AMDGPU::S_CMP_GE_I32:
10654 case AMDGPU::S_CMPK_GE_I32:
10655 return optimizeCmpAnd(1, 32, false, true);
10656 case AMDGPU::S_CMP_EQ_U64:
10657 return optimizeCmpAnd(1, 64, true, false);
10658 case AMDGPU::S_CMP_LG_U32:
10659 case AMDGPU::S_CMP_LG_I32:
10660 case AMDGPU::S_CMPK_LG_U32:
10661 case AMDGPU::S_CMPK_LG_I32:
10662 return optimizeCmpAnd(0, 32, true, false);
10663 case AMDGPU::S_CMP_GT_U32:
10664 case AMDGPU::S_CMPK_GT_U32:
10665 return optimizeCmpAnd(0, 32, false, false);
10666 case AMDGPU::S_CMP_GT_I32:
10667 case AMDGPU::S_CMPK_GT_I32:
10668 return optimizeCmpAnd(0, 32, false, true);
10669 case AMDGPU::S_CMP_LG_U64:
10670 return optimizeCmpAnd(0, 64, true, false);
10671 }
10672
10673 return false;
10674}
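// Illustrative example, not from the original file, of the optimizeCmpAnd
// rewrite above in MIR form (register names are made up):
//
//   %1:sreg_32 = S_AND_B32 %0:sreg_32, 4, implicit-def dead $scc
//   S_CMP_LG_U32 %1, 0, implicit-def $scc
//   S_CBRANCH_SCC1 %bb.1, implicit $scc
//
// The compare is deleted and the S_AND_B32's $scc def is marked live. If %1
// then has no remaining uses, the AND itself is further replaced by
//
//   S_BITCMP1_B32 %0:sreg_32, 2, implicit-def $scc
//
// since the mask 4 == 1 << 2 selects bit number 2.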
10675
10676 void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
10677 AMDGPU::OpName OpName) const {
10678 if (!ST.needsAlignedVGPRs())
10679 return;
10680
10681 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
10682 if (OpNo < 0)
10683 return;
10684 MachineOperand &Op = MI.getOperand(OpNo);
10685 if (getOpSize(MI, OpNo) > 4)
10686 return;
10687
10688 // Add implicit aligned super-reg to force alignment on the data operand.
10689 const DebugLoc &DL = MI.getDebugLoc();
10690 MachineBasicBlock *BB = MI.getParent();
10691 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
10692 Register DataReg = Op.getReg();
10693 bool IsAGPR = RI.isAGPR(MRI, DataReg);
10694 Register Undef = MRI.createVirtualRegister(
10695 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
10696 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
10697 Register NewVR =
10698 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
10699 : &AMDGPU::VReg_64_Align2RegClass);
10700 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
10701 .addReg(DataReg, 0, Op.getSubReg())
10702 .addImm(AMDGPU::sub0)
10703 .addReg(Undef)
10704 .addImm(AMDGPU::sub1);
10705 Op.setReg(NewVR);
10706 Op.setSubReg(AMDGPU::sub0);
10707 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
10708}
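// Illustrative before/after, not from the original file (virtual registers
// are made up), for a subtarget that needs even-aligned VGPR tuples: a 32-bit
// data operand %data:vgpr_32 is wrapped as
//
//   %undef:vgpr_32 = IMPLICIT_DEF
//   %pair:vreg_64_align2 = REG_SEQUENCE %data, %subreg.sub0, %undef, %subreg.sub1
//
// and the original operand is rewritten to %pair.sub0 with an extra implicit
// use of %pair, which forces %data into an even-numbered VGPR at register
// allocation.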
10709
10710 bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
10711 if (isIGLP(*MI))
10712 return false;
10713
10714 return TargetInstrInfo::isGlobalMemoryObject(MI);
10715}
10716
10717 bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
10718 if (!isWMMA(MI) && !isSWMMAC(MI))
10719 return false;
10720
10721 if (AMDGPU::isGFX1250(ST))
10722 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
10723
10724 return true;
10725}
10726
10727 bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
10728 unsigned Opcode = MI.getOpcode();
10729
10730 if (AMDGPU::isGFX12Plus(ST))
10731 return isDOT(MI) || isXDLWMMA(MI);
10732
10733 if (!isMAI(MI) || isDGEMM(Opcode) ||
10734 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
10735 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
10736 return false;
10737
10738 if (!ST.hasGFX940Insts())
10739 return true;
10740
10741 return AMDGPU::getMAIIsGFX940XDL(Opcode);
10742}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MCInstrDesc &TID, unsigned RCID, bool IsAllocatable)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
#define LLVM_DEBUG(...)
Definition Debug.h:119
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1562
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition ArrayRef.h:150
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasAddNoCarry() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
bool mayStore() const
Return true if this instruction could possibly modify memory.
bool mayLoad() const
Return true if this instruction could possibly read memory.
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:92
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void enterBasicBlockEnd(MachineBasicBlock &MBB)
Start tracking liveness from the end of basic block MBB.
bool isRegUsed(Register Reg, bool includeReserved=true) const
Return if a specific register is currently used.
void setRegUsed(Register Reg, LaneBitmask LaneMask=LaneBitmask::getAll())
Tell the scavenger a register is used.
void backward()
Update internal register state and move MBB iterator backwards.
void enterBasicBlock(MachineBasicBlock &MBB)
Start tracking liveness from the begin of basic block MBB.
Register scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill=true)
Make a register of the specific register class available from the current position backwards to the p...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:102
constexpr bool isValid() const
Definition Register.h:107
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:74
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given.
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instructions opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool mayAccessScratchThroughFlat(const MachineInstr &MI) const
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override
static bool isFLATGlobal(const MachineInstr &MI)
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of a s_trap 2 instructions for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
static bool isVOP3(const MCInstrDesc &Desc)
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is a instruction that moves/copies value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst to fix 16bit SALU to VALU lowering.
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
static bool isWWMRegSpillOpcode(uint16_t Opcode)
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) const override
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to an SGPR.
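A minimal sketch of how this helper is typically used when an operand must live in an SGPR; TII, MRI, UseMI, VSrc, and OpIdx are assumed to come from the surrounding pass, not from this file.
  // Hypothetical legalization step: operand OpIdx of UseMI requires an SGPR,
  // so copy the VGPR value to an SGPR and rewrite the operand to use it.
  Register SSrc = TII->readlaneVGPRToSGPR(VSrc, UseMI, MRI);
  UseMI.getOperand(OpIdx).setReg(SSrc);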
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
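A sketch of how the pseudo-to-MC mapping is usually consulted when lowering MachineInstrs to MCInsts; treating -1 as "no real encoding for this subtarget" is the convention assumed here, and TII/MI come from the caller.
  int MCOp = TII->pseudoToMCOpcode(MI.getOpcode());
  if (MCOp == -1)
    // Assumed convention: this pseudo has no MC counterpart on this subtarget.
    llvm_unreachable("pseudo instruction has no MC counterpart");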
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
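A short sketch of the named-operand accessors (getNamedOperand above, getNamedImmOperand earlier in this index); the soffset/offset operand names are illustrative, and TII/MI are assumed from the caller.
  // Look up operands by name instead of positional index; getNamedOperand
  // returns nullptr when the instruction has no such operand.
  if (const MachineOperand *SOff =
          TII->getNamedOperand(MI, AMDGPU::OpName::soffset))
    (void)SOff->getReg();
  // getNamedImmOperand expects the operand to be present and an immediate.
  int64_t Off = TII->getNamedImmOperand(MI, AMDGPU::OpName::offset);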
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were operand OpIdx of MI.
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
const TargetRegisterClass * getRegClass(unsigned RCID) const
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
const TargetRegisterClass * getProperlyAlignedRC(const TargetRegisterClass *RC) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:281
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:194
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:174
self_iterator getIterator()
Definition ilist_node.h:134
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
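A sketch of the kind of range check isLegalFLATOffset (listed earlier) performs with this bit count; ST, Offset, and AllowNegative are assumed inputs, and the signed/unsigned split is an assumption based on allowNegativeFlatOffset. The llvm namespace is assumed to be in scope.
  unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST);
  // Pre-GFX12 variants only accept non-negative offsets; otherwise the
  // offset is a signed, NumBits-wide immediate.
  bool Legal = AllowNegative ? isIntN(NumBits, Offset)
                             : isUIntN(NumBits, Offset);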
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU-specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the inline values reserved for floating-point constants.
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST)
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:231
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:202
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:209
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:219
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:224
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:206
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:201
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:218
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:216
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:205
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:236
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:237
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:212
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:248
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:204
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:217
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:223
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:242
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:213
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:238
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:220
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:203
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:228
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:569
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:571
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:568
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:570
@ TI_CONSTDATA_START
Definition AMDGPU.h:567
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool isGFX1250(const MCSubtargetInfo &STI)
int getMCOpcode(uint16_t Opcode, unsigned Gen)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
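A sketch of checking whether an immediate can be encoded as an inline constant rather than a literal; hasInv2PiInlineImm() is assumed to be the relevant subtarget query, and Imm is a caller-provided 64-bit value.
  bool HasInv2Pi = ST.hasInv2PiInlineImm(); // assumed subtarget hook
  bool Inline64  = AMDGPU::isInlinableLiteral64(Imm, HasInv2Pi);
  bool Inline32  =
      AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Lo_32(Imm)), HasInv2Pi);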
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:61
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:72
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:330
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:262
@ Offset
Definition DWP.cpp:477
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1727
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:307
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:216
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
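A small sketch of the builder interface together with getKillRegState (listed further down in this index); MBB, I, DL, TII, DestReg, SrcReg, and KillSrc are assumed from the caller.
  // Emit "s_mov_b32 DestReg, SrcReg", marking SrcReg killed when the caller
  // says this is its last use.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));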
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2474
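A brief sketch of the range helpers indexed here (all_of, enumerate) applied to a MachineInstr's operands; MI is assumed, the predicate is purely illustrative, and the llvm namespace is assumed to be in scope.
  bool AllRegs = all_of(MI.operands(), [](const MachineOperand &MO) {
    return MO.isReg();
  });
  for (const auto &En : enumerate(MI.operands()))
    if (En.value().isImm())
      dbgs() << "operand " << En.index() << " is an immediate\n";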
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:646
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:551
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
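A few worked uses of the integer helpers collected in this index (alignDown, divideCeil, isPowerOf2_64, isInt, countr_zero); the values are illustrative only, and the llvm namespace is assumed to be in scope.
  uint64_t Size    = 100;
  uint64_t Aligned = alignDown(Size, 16);  // 96
  uint64_t Dwords  = divideCeil(Size, 4);  // 25
  bool     Pow2    = isPowerOf2_64(Size);  // false
  bool     Fits16  = isInt<16>(-32768);    // true
  unsigned TZ      = countr_zero(Aligned); // 5, since 96 == 0b1100000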
Op::Description Desc
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:157
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1734
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:336
auto reverse(ContainerTy &&C)
Definition STLExtras.h:420
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair, skipping copy-like instructions and subre...
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:399
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:257
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:127
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1899
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:577
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:86
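A sketch of the common mask-then-sign-extend idiom these two helpers support; the 21-bit width and the Encoded value are illustrative assumptions, not taken from this file.
  uint64_t Field = Encoded & maskTrailingOnes<uint64_t>(21);
  int64_t  Val   = SignExtend64<21>(Field); // interpret the field as signed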
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
GenericCycleInfo< MachineSSAContext > MachineCycleInfo
MachineCycleInfo::CycleT MachineCycle
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
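A sketch of the intended use: a value produced under one EXEC mask may only be propagated to its users if EXEC cannot change in between; MRI, Reg, and DefMI are assumed from the caller.
  if (!execMayBeModifiedBeforeAnyUse(MRI, Reg, DefMI)) {
    // EXEC is provably unchanged from the def to every use of Reg, so the
    // def's result can safely be folded into those uses.
  }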
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:219
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:56
MachineInstr * top() const
Definition SIInstrInfo.h:61
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:80
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.