1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
33#include "llvm/IR/IntrinsicsAMDGPU.h"
34#include "llvm/MC/MCContext.h"
37
38using namespace llvm;
39
40#define DEBUG_TYPE "si-instr-info"
41
42#define GET_INSTRINFO_CTOR_DTOR
43#include "AMDGPUGenInstrInfo.inc"
44
45namespace llvm::AMDGPU {
46#define GET_D16ImageDimIntrinsics_IMPL
47#define GET_ImageDimIntrinsicTable_IMPL
48#define GET_RsrcIntrinsics_IMPL
49#include "AMDGPUGenSearchableTables.inc"
50} // namespace llvm::AMDGPU
51
52// Must be at least 4 to be able to branch over minimum unconditional branch
53// code. This is only for making it possible to write reasonably small tests for
54// long branches.
55static cl::opt<unsigned>
56BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
57 cl::desc("Restrict range of branch instructions (DEBUG)"));
58
60 "amdgpu-fix-16-bit-physreg-copies",
61 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
62 cl::init(true),
64
66 : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
67 AMDGPU::ADJCALLSTACKDOWN),
68 RI(ST), ST(ST) {
69 SchedModel.init(&ST);
70}
71
72//===----------------------------------------------------------------------===//
73// TargetInstrInfo callbacks
74//===----------------------------------------------------------------------===//
75
76static unsigned getNumOperandsNoGlue(SDNode *Node) {
77 unsigned N = Node->getNumOperands();
78 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
79 --N;
80 return N;
81}
82
83/// Returns true if both nodes have the same value for the given
84/// operand \p Op, or if both nodes do not have this operand.
85static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
86 AMDGPU::OpName OpName) {
87 unsigned Opc0 = N0->getMachineOpcode();
88 unsigned Opc1 = N1->getMachineOpcode();
89
90 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
91 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
92
93 if (Op0Idx == -1 && Op1Idx == -1)
94 return true;
95
96
97 if ((Op0Idx == -1 && Op1Idx != -1) ||
98 (Op1Idx == -1 && Op0Idx != -1))
99 return false;
100
101 // getNamedOperandIdx returns the index for the MachineInstr's operands,
102 // which includes the result as the first operand. We are indexing into the
103 // MachineSDNode's operands, so we need to skip the result operand to get
104 // the real index.
105 --Op0Idx;
106 --Op1Idx;
107
108 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
109}
110
111static bool canRemat(const MachineInstr &MI) {
112
113 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
114 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
115 SIInstrInfo::isSALU(MI))
116 return true;
117
118 if (SIInstrInfo::isSMRD(MI)) {
119 return !MI.memoperands_empty() &&
120 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
121 return MMO->isLoad() && MMO->isInvariant();
122 });
123 }
124
125 return false;
126}
127
128bool SIInstrInfo::isReallyTriviallyReMaterializable(
129 const MachineInstr &MI) const {
130
131 if (canRemat(MI)) {
132 // Normally VALU use of exec would block the rematerialization, but that
133 // is OK in this case to have an implicit exec read as all VALU do.
134 // We really want all of the generic logic for this except for this.
135
136 // Another potential implicit use is mode register. The core logic of
137 // the RA will not attempt rematerialization if mode is set anywhere
138 // in the function, otherwise it is safe since mode is not changed.
139
140 // There is a difference from the generic method, which does not allow
141 // rematerialization if there are virtual register uses. We allow this,
142 // which is why this method covers SOP instructions as well.
143 if (!MI.hasImplicitDef() &&
144 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
145 !MI.mayRaiseFPException())
146 return true;
147 }
148
149 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
150}
151
152// Returns true if the scalar result of a VALU instruction depends on exec.
153bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
154 // Ignore comparisons which are only used masked with exec.
155 // This allows some hoisting/sinking of VALU comparisons.
156 if (MI.isCompare()) {
157 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
158 if (!Dst)
159 return true;
160
161 Register DstReg = Dst->getReg();
162 if (!DstReg.isVirtual())
163 return true;
164
165 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
166 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
167 switch (Use.getOpcode()) {
168 case AMDGPU::S_AND_SAVEEXEC_B32:
169 case AMDGPU::S_AND_SAVEEXEC_B64:
170 break;
171 case AMDGPU::S_AND_B32:
172 case AMDGPU::S_AND_B64:
173 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
174 return true;
175 break;
176 default:
177 return true;
178 }
179 }
180 return false;
181 }
182
183 switch (MI.getOpcode()) {
184 default:
185 break;
186 case AMDGPU::V_READFIRSTLANE_B32:
187 return true;
188 }
189
190 return false;
191}
192
193bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
194 // Any implicit use of exec by VALU is not a real register read.
195 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
196 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
197}
198
199bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
200 MachineBasicBlock *SuccToSinkTo,
201 MachineCycleInfo *CI) const {
202 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
203 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
204 return true;
205
206 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
207 // Check if sinking of MI would create temporal divergent use.
208 for (auto Op : MI.uses()) {
209 if (Op.isReg() && Op.getReg().isVirtual() &&
210 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
211 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
212
213 // SgprDef defined inside cycle
214 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
215 if (FromCycle == nullptr)
216 continue;
217
218 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
219 // Check if there is a FromCycle that contains SgprDef's basic block but
220 // does not contain SuccToSinkTo and also has divergent exit condition.
221 while (FromCycle && !FromCycle->contains(ToCycle)) {
222 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
223 FromCycle->getExitingBlocks(ExitingBlocks);
224
225 // FromCycle has divergent exit condition.
226 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
227 if (hasDivergentBranch(ExitingBlock))
228 return false;
229 }
230
231 FromCycle = FromCycle->getParentCycle();
232 }
233 }
234 }
235
236 return true;
237}
238
239bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
240 int64_t &Offset0,
241 int64_t &Offset1) const {
242 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
243 return false;
244
245 unsigned Opc0 = Load0->getMachineOpcode();
246 unsigned Opc1 = Load1->getMachineOpcode();
247
248 // Make sure both are actually loads.
249 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
250 return false;
251
252 // A mayLoad instruction without a def is not a load. Likely a prefetch.
253 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
254 return false;
255
256 if (isDS(Opc0) && isDS(Opc1)) {
257
258 // FIXME: Handle this case:
259 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
260 return false;
261
262 // Check base reg.
263 if (Load0->getOperand(0) != Load1->getOperand(0))
264 return false;
265
266 // Skip read2 / write2 variants for simplicity.
267 // TODO: We should report true if the used offsets are adjacent (excluding
268 // st64 versions).
269 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
270 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
271 if (Offset0Idx == -1 || Offset1Idx == -1)
272 return false;
273
274 // XXX - be careful of dataless loads
275 // getNamedOperandIdx returns the index for MachineInstrs. Since they
276 // include the output in the operand list, but SDNodes don't, we need to
277 // subtract the index by one.
278 Offset0Idx -= get(Opc0).NumDefs;
279 Offset1Idx -= get(Opc1).NumDefs;
280 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
281 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
282 return true;
283 }
284
285 if (isSMRD(Opc0) && isSMRD(Opc1)) {
286 // Skip time and cache invalidation instructions.
287 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
288 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
289 return false;
290
291 unsigned NumOps = getNumOperandsNoGlue(Load0);
292 if (NumOps != getNumOperandsNoGlue(Load1))
293 return false;
294
295 // Check base reg.
296 if (Load0->getOperand(0) != Load1->getOperand(0))
297 return false;
298
299 // Match register offsets, if both register and immediate offsets present.
300 assert(NumOps == 4 || NumOps == 5);
301 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
302 return false;
303
304 const ConstantSDNode *Load0Offset =
305 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
306 const ConstantSDNode *Load1Offset =
307 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
308
309 if (!Load0Offset || !Load1Offset)
310 return false;
311
312 Offset0 = Load0Offset->getZExtValue();
313 Offset1 = Load1Offset->getZExtValue();
314 return true;
315 }
316
317 // MUBUF and MTBUF can access the same addresses.
318 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
319
320 // MUBUF and MTBUF have vaddr at different indices.
321 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
322 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
323 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
324 return false;
325
326 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
327 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
328
329 if (OffIdx0 == -1 || OffIdx1 == -1)
330 return false;
331
332 // getNamedOperandIdx returns the index for MachineInstrs. Since they
333 // include the output in the operand list, but SDNodes don't, we need to
334 // subtract the index by one.
335 OffIdx0 -= get(Opc0).NumDefs;
336 OffIdx1 -= get(Opc1).NumDefs;
337
338 SDValue Off0 = Load0->getOperand(OffIdx0);
339 SDValue Off1 = Load1->getOperand(OffIdx1);
340
341 // The offset might be a FrameIndexSDNode.
342 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
343 return false;
344
345 Offset0 = Off0->getAsZExtVal();
346 Offset1 = Off1->getAsZExtVal();
347 return true;
348 }
349
350 return false;
351}
352
353static bool isStride64(unsigned Opc) {
354 switch (Opc) {
355 case AMDGPU::DS_READ2ST64_B32:
356 case AMDGPU::DS_READ2ST64_B64:
357 case AMDGPU::DS_WRITE2ST64_B32:
358 case AMDGPU::DS_WRITE2ST64_B64:
359 return true;
360 default:
361 return false;
362 }
363}
364
365bool SIInstrInfo::getMemOperandsWithOffsetWidth(
366 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
367 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
368 const TargetRegisterInfo *TRI) const {
369 if (!LdSt.mayLoadOrStore())
370 return false;
371
372 unsigned Opc = LdSt.getOpcode();
373 OffsetIsScalable = false;
374 const MachineOperand *BaseOp, *OffsetOp;
375 int DataOpIdx;
376
377 if (isDS(LdSt)) {
378 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
379 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
380 if (OffsetOp) {
381 // Normal, single offset LDS instruction.
382 if (!BaseOp) {
383 // DS_CONSUME/DS_APPEND use M0 for the base address.
384 // TODO: find the implicit use operand for M0 and use that as BaseOp?
385 return false;
386 }
387 BaseOps.push_back(BaseOp);
388 Offset = OffsetOp->getImm();
389 // Get appropriate operand, and compute width accordingly.
390 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
391 if (DataOpIdx == -1)
392 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
393 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
394 Width = LocationSize::precise(64);
395 else
396 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
397 } else {
398 // The 2 offset instructions use offset0 and offset1 instead. We can treat
399 // these as a load with a single offset if the 2 offsets are consecutive.
400 // We will use this for some partially aligned loads.
401 const MachineOperand *Offset0Op =
402 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
403 const MachineOperand *Offset1Op =
404 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
405
406 unsigned Offset0 = Offset0Op->getImm() & 0xff;
407 unsigned Offset1 = Offset1Op->getImm() & 0xff;
408 if (Offset0 + 1 != Offset1)
409 return false;
410
411 // Each of these offsets is in element sized units, so we need to convert
412 // to bytes of the individual reads.
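 // For example, a ds_read2_b64 (128-bit vdst, so EltSize = 8) with
 // offset0 = 4 starts at byte offset 8 * 4 = 32.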
413
414 unsigned EltSize;
415 if (LdSt.mayLoad())
416 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
417 else {
418 assert(LdSt.mayStore());
419 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
420 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
421 }
422
423 if (isStride64(Opc))
424 EltSize *= 64;
425
426 BaseOps.push_back(BaseOp);
427 Offset = EltSize * Offset0;
428 // Get appropriate operand(s), and compute width accordingly.
429 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
430 if (DataOpIdx == -1) {
431 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
432 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
433 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
434 Width = LocationSize::precise(
435 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
436 } else {
437 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
438 }
439 }
440 return true;
441 }
442
443 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
444 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
445 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
446 return false;
447 BaseOps.push_back(RSrc);
448 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
449 if (BaseOp && !BaseOp->isFI())
450 BaseOps.push_back(BaseOp);
451 const MachineOperand *OffsetImm =
452 getNamedOperand(LdSt, AMDGPU::OpName::offset);
453 Offset = OffsetImm->getImm();
454 const MachineOperand *SOffset =
455 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
456 if (SOffset) {
457 if (SOffset->isReg())
458 BaseOps.push_back(SOffset);
459 else
460 Offset += SOffset->getImm();
461 }
462 // Get appropriate operand, and compute width accordingly.
463 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
464 if (DataOpIdx == -1)
465 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
466 if (DataOpIdx == -1) // LDS DMA
467 return false;
468 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
469 return true;
470 }
471
472 if (isImage(LdSt)) {
473 auto RsrcOpName =
474 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
475 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
476 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
477 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
478 if (VAddr0Idx >= 0) {
479 // GFX10 possible NSA encoding.
480 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
481 BaseOps.push_back(&LdSt.getOperand(I));
482 } else {
483 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
484 }
485 Offset = 0;
486 // Get appropriate operand, and compute width accordingly.
487 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
488 if (DataOpIdx == -1)
489 return false; // no return sampler
490 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
491 return true;
492 }
493
494 if (isSMRD(LdSt)) {
495 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
496 if (!BaseOp) // e.g. S_MEMTIME
497 return false;
498 BaseOps.push_back(BaseOp);
499 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
500 Offset = OffsetOp ? OffsetOp->getImm() : 0;
501 // Get appropriate operand, and compute width accordingly.
502 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
503 if (DataOpIdx == -1)
504 return false;
505 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
506 return true;
507 }
508
509 if (isFLAT(LdSt)) {
510 // Instructions have either vaddr or saddr or both or none.
511 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
512 if (BaseOp)
513 BaseOps.push_back(BaseOp);
514 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
515 if (BaseOp)
516 BaseOps.push_back(BaseOp);
517 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
518 // Get appropriate operand, and compute width accordingly.
519 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
520 if (DataOpIdx == -1)
521 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
522 if (DataOpIdx == -1) // LDS DMA
523 return false;
524 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
525 return true;
526 }
527
528 return false;
529}
530
531static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
532 ArrayRef<const MachineOperand *> BaseOps1,
533 const MachineInstr &MI2,
534 ArrayRef<const MachineOperand *> BaseOps2) {
535 // Only examine the first "base" operand of each instruction, on the
536 // assumption that it represents the real base address of the memory access.
537 // Other operands are typically offsets or indices from this base address.
538 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
539 return true;
540
541 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
542 return false;
543
544 auto *MO1 = *MI1.memoperands_begin();
545 auto *MO2 = *MI2.memoperands_begin();
546 if (MO1->getAddrSpace() != MO2->getAddrSpace())
547 return false;
548
549 const auto *Base1 = MO1->getValue();
550 const auto *Base2 = MO2->getValue();
551 if (!Base1 || !Base2)
552 return false;
553 Base1 = getUnderlyingObject(Base1);
554 Base2 = getUnderlyingObject(Base2);
555
556 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
557 return false;
558
559 return Base1 == Base2;
560}
561
562bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
563 int64_t Offset1, bool OffsetIsScalable1,
564 ArrayRef<const MachineOperand *> BaseOps2,
565 int64_t Offset2, bool OffsetIsScalable2,
566 unsigned ClusterSize,
567 unsigned NumBytes) const {
568 // If the mem ops (to be clustered) do not have the same base ptr, then they
569 // should not be clustered
570 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
571 if (!BaseOps1.empty() && !BaseOps2.empty()) {
572 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
573 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
574 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
575 return false;
576
577 const SIMachineFunctionInfo *MFI =
578 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
579 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
580 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
581 // If only one base op is empty, they do not have the same base ptr
582 return false;
583 }
584
585 // To avoid register pressure, on average the number of DWORDs loaded
586 // together by all clustered mem ops should not exceed
587 // MaxMemoryClusterDWords. This is an empirical value based on certain
588 // observations and performance related experiments.
589 // The good thing about this heuristic is that it avoids clustering too many
590 // sub-word loads and also avoids clustering wide loads. Below is a brief
591 // summary of how the heuristic behaves for various `LoadSize` when
592 // MaxMemoryClusterDWords is 8.
593 //
594 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
595 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
596 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
597 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
598 // (5) LoadSize >= 17: do not cluster
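 //
 // For example, four 8-byte loads give LoadSize = 8 and NumDWords = 2 * 4 = 8,
 // which is still clustered when MaxMemoryClusterDWords is 8, while four
 // 16-byte loads give NumDWords = 16 and are not.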
599 const unsigned LoadSize = NumBytes / ClusterSize;
600 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
601 return NumDWords <= MaxMemoryClusterDWords;
602}
603
604// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
605// the first 16 loads will be interleaved with the stores, and the next 16 will
606// be clustered as expected. It should really split into 2 16 store batches.
607//
608// Loads are clustered until this returns false, rather than trying to schedule
609// groups of stores. This also means we have to deal with saying different
610// address space loads should be clustered, and ones which might cause bank
611// conflicts.
612//
613// This might be deprecated so it might not be worth that much effort to fix.
614bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
615 int64_t Offset0, int64_t Offset1,
616 unsigned NumLoads) const {
617 assert(Offset1 > Offset0 &&
618 "Second offset should be larger than first offset!");
619 // If we have less than 16 loads in a row, and the offsets are within 64
620 // bytes, then schedule together.
621
622 // A cacheline is 64 bytes (for global memory).
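 // For example, sixteen dword loads at offsets 0..60 from the same base all
 // fall within one cacheline and are scheduled near each other.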
623 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
624}
625
626static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
627 MachineBasicBlock::iterator MI,
628 const DebugLoc &DL, MCRegister DestReg,
629 MCRegister SrcReg, bool KillSrc,
630 const char *Msg = "illegal VGPR to SGPR copy") {
631 MachineFunction *MF = MBB.getParent();
632
633 LLVMContext &C = MF->getFunction().getContext();
634 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
635
636 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
637 .addReg(SrcReg, getKillRegState(KillSrc));
638}
639
640/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
641/// possible to have a direct copy in these cases on GFX908, so an intermediate
642/// VGPR copy is required.
643static void indirectCopyToAGPR(const SIInstrInfo &TII,
644 MachineBasicBlock &MBB,
645 MachineBasicBlock::iterator MI,
646 const DebugLoc &DL, MCRegister DestReg,
647 MCRegister SrcReg, bool KillSrc,
648 RegScavenger &RS, bool RegsOverlap,
649 Register ImpDefSuperReg = Register(),
650 Register ImpUseSuperReg = Register()) {
651 assert((TII.getSubtarget().hasMAIInsts() &&
652 !TII.getSubtarget().hasGFX90AInsts()) &&
653 "Expected GFX908 subtarget.");
654
655 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
656 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
657 "Source register of the copy should be either an SGPR or an AGPR.");
658
659 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
660 "Destination register of the copy should be an AGPR.");
661
662 const SIRegisterInfo &RI = TII.getRegisterInfo();
663
664 // First try to find defining accvgpr_write to avoid temporary registers.
665 // In the case of copies of overlapping AGPRs, we conservatively do not
666 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
667 // an accvgpr_write used for this same copy due to implicit-defs
668 if (!RegsOverlap) {
669 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
670 --Def;
671
672 if (!Def->modifiesRegister(SrcReg, &RI))
673 continue;
674
675 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
676 Def->getOperand(0).getReg() != SrcReg)
677 break;
678
679 MachineOperand &DefOp = Def->getOperand(1);
680 assert(DefOp.isReg() || DefOp.isImm());
681
682 if (DefOp.isReg()) {
683 bool SafeToPropagate = true;
684 // Check that register source operand is not clobbered before MI.
685 // Immediate operands are always safe to propagate.
686 for (auto I = Def; I != MI && SafeToPropagate; ++I)
687 if (I->modifiesRegister(DefOp.getReg(), &RI))
688 SafeToPropagate = false;
689
690 if (!SafeToPropagate)
691 break;
692
693 for (auto I = Def; I != MI; ++I)
694 I->clearRegisterKills(DefOp.getReg(), &RI);
695 }
696
697 MachineInstrBuilder Builder =
698 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
699 .add(DefOp);
700 if (ImpDefSuperReg)
701 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
702
703 if (ImpUseSuperReg) {
704 Builder.addReg(ImpUseSuperReg,
705 getKillRegState(KillSrc) | RegState::Implicit);
706 }
707
708 return;
709 }
710 }
711
712 RS.enterBasicBlockEnd(MBB);
713 RS.backward(std::next(MI));
714
715 // Ideally we want to have three registers for a long reg_sequence copy
716 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
717 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
718 *MBB.getParent());
719
720 // Registers in the sequence are allocated contiguously so we can just
721 // use register number to pick one of three round-robin temps.
722 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
723 Register Tmp =
724 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
725 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
726 "VGPR used for an intermediate copy should have been reserved.");
727
728 // Only loop through if there are any free registers left. We don't want to
729 // spill.
730 while (RegNo--) {
731 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
732 /* RestoreAfter */ false, 0,
733 /* AllowSpill */ false);
734 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
735 break;
736 Tmp = Tmp2;
737 RS.setRegUsed(Tmp);
738 }
739
740 // Insert copy to temporary VGPR.
741 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
742 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
743 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
744 } else {
745 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
746 }
747
748 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
749 .addReg(SrcReg, getKillRegState(KillSrc));
750 if (ImpUseSuperReg) {
751 UseBuilder.addReg(ImpUseSuperReg,
752 getKillRegState(KillSrc) | RegState::Implicit);
753 }
754
755 MachineInstrBuilder DefBuilder
756 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
757 .addReg(Tmp, RegState::Kill);
758
759 if (ImpDefSuperReg)
760 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
761}
762
763static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
764 MachineBasicBlock::iterator I, const DebugLoc &DL,
765 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
766 const TargetRegisterClass *RC, bool Forward) {
767 const SIRegisterInfo &RI = TII.getRegisterInfo();
768 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
770 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
771
772 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
773 int16_t SubIdx = BaseIndices[Idx];
774 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
775 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
776 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
777 unsigned Opcode = AMDGPU::S_MOV_B32;
778
779 // Is SGPR aligned? If so try to combine with next.
780 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
781 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
782 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
783 // Can use SGPR64 copy
784 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
785 SubIdx = RI.getSubRegFromChannel(Channel, 2);
786 DestSubReg = RI.getSubReg(DestReg, SubIdx);
787 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
788 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
789 Opcode = AMDGPU::S_MOV_B64;
790 Idx++;
791 }
792
793 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
794 .addReg(SrcSubReg)
795 .addReg(SrcReg, RegState::Implicit);
796
797 if (!FirstMI)
798 FirstMI = LastMI;
799
800 if (!Forward)
801 I--;
802 }
803
804 assert(FirstMI && LastMI);
805 if (!Forward)
806 std::swap(FirstMI, LastMI);
807
808 FirstMI->addOperand(
809 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
810
811 if (KillSrc)
812 LastMI->addRegisterKilled(SrcReg, &RI);
813}
814
815void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
816 MachineBasicBlock::iterator MI,
817 const DebugLoc &DL, Register DestReg,
818 Register SrcReg, bool KillSrc, bool RenamableDest,
819 bool RenamableSrc) const {
820 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
821 unsigned Size = RI.getRegSizeInBits(*RC);
822 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
823 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
824
825 // The rest of copyPhysReg assumes Src and Dst size are the same size.
826 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
827 // we remove Fix16BitCopies and this code block?
828 if (Fix16BitCopies) {
829 if (((Size == 16) != (SrcSize == 16))) {
830 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
831 assert(ST.useRealTrue16Insts());
832 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
833 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
834 RegToFix = SubReg;
835
836 if (DestReg == SrcReg) {
837 // Identity copy. Insert empty bundle since ExpandPostRA expects an
838 // instruction here.
839 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
840 return;
841 }
842 RC = RI.getPhysRegBaseClass(DestReg);
843 Size = RI.getRegSizeInBits(*RC);
844 SrcRC = RI.getPhysRegBaseClass(SrcReg);
845 SrcSize = RI.getRegSizeInBits(*SrcRC);
846 }
847 }
848
849 if (RC == &AMDGPU::VGPR_32RegClass) {
850 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
851 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
852 AMDGPU::AGPR_32RegClass.contains(SrcReg));
853 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
854 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
855 BuildMI(MBB, MI, DL, get(Opc), DestReg)
856 .addReg(SrcReg, getKillRegState(KillSrc));
857 return;
858 }
859
860 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
861 RC == &AMDGPU::SReg_32RegClass) {
862 if (SrcReg == AMDGPU::SCC) {
863 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
864 .addImm(1)
865 .addImm(0);
866 return;
867 }
868
869 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
870 if (DestReg == AMDGPU::VCC_LO) {
871 // FIXME: Hack until VReg_1 removed.
872 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
873 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
874 .addImm(0)
875 .addReg(SrcReg, getKillRegState(KillSrc));
876 return;
877 }
878
879 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
880 return;
881 }
882
883 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
884 .addReg(SrcReg, getKillRegState(KillSrc));
885 return;
886 }
887
888 if (RC == &AMDGPU::SReg_64RegClass) {
889 if (SrcReg == AMDGPU::SCC) {
890 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
891 .addImm(1)
892 .addImm(0);
893 return;
894 }
895
896 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
897 if (DestReg == AMDGPU::VCC) {
898 // FIXME: Hack until VReg_1 removed.
899 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
900 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
901 .addImm(0)
902 .addReg(SrcReg, getKillRegState(KillSrc));
903 return;
904 }
905
906 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
907 return;
908 }
909
910 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
911 .addReg(SrcReg, getKillRegState(KillSrc));
912 return;
913 }
914
915 if (DestReg == AMDGPU::SCC) {
916 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
917 // but SelectionDAG emits such copies for i1 sources.
918 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
919 // This copy can only be produced by patterns
920 // with explicit SCC, which are known to be enabled
921 // only for subtargets with S_CMP_LG_U64 present.
922 assert(ST.hasScalarCompareEq64());
923 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
924 .addReg(SrcReg, getKillRegState(KillSrc))
925 .addImm(0);
926 } else {
927 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
928 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
929 .addReg(SrcReg, getKillRegState(KillSrc))
930 .addImm(0);
931 }
932
933 return;
934 }
935
936 if (RC == &AMDGPU::AGPR_32RegClass) {
937 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
938 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
939 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
940 .addReg(SrcReg, getKillRegState(KillSrc));
941 return;
942 }
943
944 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
945 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
946 .addReg(SrcReg, getKillRegState(KillSrc));
947 return;
948 }
949
950 // FIXME: Pass should maintain scavenger to avoid scan through the block on
951 // every AGPR spill.
952 RegScavenger RS;
953 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
954 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
955 return;
956 }
957
958 if (Size == 16) {
959 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
960 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
961 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
962
963 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
964 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
965 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
966 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
967 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
968 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
969 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
970 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
971
972 if (IsSGPRDst) {
973 if (!IsSGPRSrc) {
974 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
975 return;
976 }
977
978 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
979 .addReg(NewSrcReg, getKillRegState(KillSrc));
980 return;
981 }
982
983 if (IsAGPRDst || IsAGPRSrc) {
984 if (!DstLow || !SrcLow) {
985 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
986 "Cannot use hi16 subreg with an AGPR!");
987 }
988
989 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
990 return;
991 }
992
993 if (ST.useRealTrue16Insts()) {
994 if (IsSGPRSrc) {
995 assert(SrcLow);
996 SrcReg = NewSrcReg;
997 }
998 // Use the smaller instruction encoding if possible.
999 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
1000 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1001 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1002 .addReg(SrcReg);
1003 } else {
1004 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1005 .addImm(0) // src0_modifiers
1006 .addReg(SrcReg)
1007 .addImm(0); // op_sel
1008 }
1009 return;
1010 }
1011
1012 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1013 if (!DstLow || !SrcLow) {
1014 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1015 "Cannot use hi16 subreg on VI!");
1016 }
1017
1018 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1019 .addReg(NewSrcReg, getKillRegState(KillSrc));
1020 return;
1021 }
1022
1023 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1024 .addImm(0) // src0_modifiers
1025 .addReg(NewSrcReg)
1026 .addImm(0) // clamp
1027 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1028 : AMDGPU::SDWA::SdwaSel::WORD_1)
1029 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1030 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1031 : AMDGPU::SDWA::SdwaSel::WORD_1)
1032 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1033 // First implicit operand is $exec.
1034 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1035 return;
1036 }
1037
1038 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1039 if (ST.hasMovB64()) {
1040 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1041 .addReg(SrcReg, getKillRegState(KillSrc));
1042 return;
1043 }
1044 if (ST.hasPkMovB32()) {
1045 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1046 .addImm(SISrcMods::OP_SEL_1)
1047 .addReg(SrcReg)
1048 .addImm(SISrcMods::OP_SEL_1)
1049 .addReg(SrcReg)
1050 .addImm(0) // op_sel_lo
1051 .addImm(0) // op_sel_hi
1052 .addImm(0) // neg_lo
1053 .addImm(0) // neg_hi
1054 .addImm(0) // clamp
1055 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1056 return;
1057 }
1058 }
1059
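 // Copy sub-registers in increasing order when the destination starts at or
 // below the source, so an overlapping copy never clobbers a source lane
 // before it has been read; otherwise copy in decreasing order.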
1060 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1061 if (RI.isSGPRClass(RC)) {
1062 if (!RI.isSGPRClass(SrcRC)) {
1063 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1064 return;
1065 }
1066 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1067 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1068 Forward);
1069 return;
1070 }
1071
1072 unsigned EltSize = 4;
1073 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1074 if (RI.isAGPRClass(RC)) {
1075 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1076 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1077 else if (RI.hasVGPRs(SrcRC) ||
1078 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1079 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1080 else
1081 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1082 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1083 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1084 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1085 (RI.isProperlyAlignedRC(*RC) &&
1086 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1087 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1088 if (ST.hasMovB64()) {
1089 Opcode = AMDGPU::V_MOV_B64_e32;
1090 EltSize = 8;
1091 } else if (ST.hasPkMovB32()) {
1092 Opcode = AMDGPU::V_PK_MOV_B32;
1093 EltSize = 8;
1094 }
1095 }
1096
1097 // For the cases where we need an intermediate instruction/temporary register
1098 // (destination is an AGPR), we need a scavenger.
1099 //
1100 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1101 // whole block for every handled copy.
1102 std::unique_ptr<RegScavenger> RS;
1103 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1104 RS = std::make_unique<RegScavenger>();
1105
1106 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1107
1108 // If there is an overlap, we can't kill the super-register on the last
1109 // instruction, since it will also kill the components made live by this def.
1110 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1111 const bool CanKillSuperReg = KillSrc && !Overlap;
1112
1113 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1114 unsigned SubIdx;
1115 if (Forward)
1116 SubIdx = SubIndices[Idx];
1117 else
1118 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1119 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1120 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1121 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1122
1123 bool IsFirstSubreg = Idx == 0;
1124 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1125
1126 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1127 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1128 Register ImpUseSuper = SrcReg;
1129 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1130 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1131 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1132 MachineInstrBuilder MIB =
1133 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1134 .addImm(SISrcMods::OP_SEL_1)
1135 .addReg(SrcSubReg)
1136 .addImm(SISrcMods::OP_SEL_1)
1137 .addReg(SrcSubReg)
1138 .addImm(0) // op_sel_lo
1139 .addImm(0) // op_sel_hi
1140 .addImm(0) // neg_lo
1141 .addImm(0) // neg_hi
1142 .addImm(0) // clamp
1143 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1144 if (IsFirstSubreg)
1145 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1146 } else {
1147 MachineInstrBuilder Builder =
1148 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1149 if (IsFirstSubreg)
1150 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1151
1152 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1153 }
1154 }
1155}
1156
1157int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1158 int NewOpc;
1159
1160 // Try to map original to commuted opcode
1161 NewOpc = AMDGPU::getCommuteRev(Opcode);
1162 if (NewOpc != -1)
1163 // Check if the commuted (REV) opcode exists on the target.
1164 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1165
1166 // Try to map commuted to original opcode
1167 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1168 if (NewOpc != -1)
1169 // Check if the original (non-REV) opcode exists on the target.
1170 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1171
1172 return Opcode;
1173}
1174
1175const TargetRegisterClass *
1176SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1177 return &AMDGPU::VGPR_32RegClass;
1178}
1179
1180void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1181 MachineBasicBlock::iterator I,
1182 const DebugLoc &DL, Register DstReg,
1183 ArrayRef<MachineOperand> Cond,
1184 Register TrueReg,
1185 Register FalseReg) const {
1186 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1187 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1189 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1190 "Not a VGPR32 reg");
1191
1192 if (Cond.size() == 1) {
1193 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1194 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1195 .add(Cond[0]);
1196 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1197 .addImm(0)
1198 .addReg(FalseReg)
1199 .addImm(0)
1200 .addReg(TrueReg)
1201 .addReg(SReg);
1202 } else if (Cond.size() == 2) {
1203 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1204 switch (Cond[0].getImm()) {
1205 case SIInstrInfo::SCC_TRUE: {
1206 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1207 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1208 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1209 .addImm(0)
1210 .addReg(FalseReg)
1211 .addImm(0)
1212 .addReg(TrueReg)
1213 .addReg(SReg);
1214 break;
1215 }
1216 case SIInstrInfo::SCC_FALSE: {
1217 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1218 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1219 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1220 .addImm(0)
1221 .addReg(FalseReg)
1222 .addImm(0)
1223 .addReg(TrueReg)
1224 .addReg(SReg);
1225 break;
1226 }
1227 case SIInstrInfo::VCCNZ: {
1228 MachineOperand RegOp = Cond[1];
1229 RegOp.setImplicit(false);
1230 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1231 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1232 .add(RegOp);
1233 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1234 .addImm(0)
1235 .addReg(FalseReg)
1236 .addImm(0)
1237 .addReg(TrueReg)
1238 .addReg(SReg);
1239 break;
1240 }
1241 case SIInstrInfo::VCCZ: {
1242 MachineOperand RegOp = Cond[1];
1243 RegOp.setImplicit(false);
1244 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1245 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1246 .add(RegOp);
1247 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1248 .addImm(0)
1249 .addReg(TrueReg)
1250 .addImm(0)
1251 .addReg(FalseReg)
1252 .addReg(SReg);
1253 break;
1254 }
1255 case SIInstrInfo::EXECNZ: {
1256 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1257 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1258 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1259 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1260 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1261 .addImm(0)
1262 .addReg(FalseReg)
1263 .addImm(0)
1264 .addReg(TrueReg)
1265 .addReg(SReg);
1266 break;
1267 }
1268 case SIInstrInfo::EXECZ: {
1269 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1270 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1271 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1272 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1273 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1274 .addImm(0)
1275 .addReg(FalseReg)
1276 .addImm(0)
1277 .addReg(TrueReg)
1278 .addReg(SReg);
1279 llvm_unreachable("Unhandled branch predicate EXECZ");
1280 break;
1281 }
1282 default:
1283 llvm_unreachable("invalid branch predicate");
1284 }
1285 } else {
1286 llvm_unreachable("Can only handle Cond size 1 or 2");
1287 }
1288}
1289
1290Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1291 MachineBasicBlock::iterator I,
1292 const DebugLoc &DL,
1293 Register SrcReg, int Value) const {
1294 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1295 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1296 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1297 .addImm(Value)
1298 .addReg(SrcReg);
1299
1300 return Reg;
1301}
1302
1303Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1304 MachineBasicBlock::iterator I,
1305 const DebugLoc &DL,
1306 Register SrcReg, int Value) const {
1307 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1308 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1309 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1310 .addImm(Value)
1311 .addReg(SrcReg);
1312
1313 return Reg;
1314}
1315
1316bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1317 const Register Reg,
1318 int64_t &ImmVal) const {
1319 switch (MI.getOpcode()) {
1320 case AMDGPU::V_MOV_B32_e32:
1321 case AMDGPU::S_MOV_B32:
1322 case AMDGPU::S_MOVK_I32:
1323 case AMDGPU::S_MOV_B64:
1324 case AMDGPU::V_MOV_B64_e32:
1325 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1326 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1327 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1328 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1329 case AMDGPU::V_MOV_B64_PSEUDO: {
1330 const MachineOperand &Src0 = MI.getOperand(1);
1331 if (Src0.isImm()) {
1332 ImmVal = Src0.getImm();
1333 return MI.getOperand(0).getReg() == Reg;
1334 }
1335
1336 return false;
1337 }
1338 case AMDGPU::S_BREV_B32:
1339 case AMDGPU::V_BFREV_B32_e32:
1340 case AMDGPU::V_BFREV_B32_e64: {
1341 const MachineOperand &Src0 = MI.getOperand(1);
1342 if (Src0.isImm()) {
1343 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1344 return MI.getOperand(0).getReg() == Reg;
1345 }
1346
1347 return false;
1348 }
1349 case AMDGPU::S_NOT_B32:
1350 case AMDGPU::V_NOT_B32_e32:
1351 case AMDGPU::V_NOT_B32_e64: {
1352 const MachineOperand &Src0 = MI.getOperand(1);
1353 if (Src0.isImm()) {
1354 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1355 return MI.getOperand(0).getReg() == Reg;
1356 }
1357
1358 return false;
1359 }
1360 default:
1361 return false;
1362 }
1363}
1364
1365unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1366
1367 if (RI.isAGPRClass(DstRC))
1368 return AMDGPU::COPY;
1369 if (RI.getRegSizeInBits(*DstRC) == 16) {
1370 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1371 // before RA.
1372 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1373 }
1374 if (RI.getRegSizeInBits(*DstRC) == 32)
1375 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1376 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1377 return AMDGPU::S_MOV_B64;
1378 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1379 return AMDGPU::V_MOV_B64_PSEUDO;
1380 return AMDGPU::COPY;
1381}
1382
1383const MCInstrDesc &
1384SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1385 bool IsIndirectSrc) const {
1386 if (IsIndirectSrc) {
1387 if (VecSize <= 32) // 4 bytes
1388 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1389 if (VecSize <= 64) // 8 bytes
1390 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1391 if (VecSize <= 96) // 12 bytes
1392 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1393 if (VecSize <= 128) // 16 bytes
1394 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1395 if (VecSize <= 160) // 20 bytes
1396 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1397 if (VecSize <= 192) // 24 bytes
1398 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
1399 if (VecSize <= 224) // 28 bytes
1400 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
1401 if (VecSize <= 256) // 32 bytes
1402 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1403 if (VecSize <= 288) // 36 bytes
1404 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1405 if (VecSize <= 320) // 40 bytes
1406 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1407 if (VecSize <= 352) // 44 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1409 if (VecSize <= 384) // 48 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1411 if (VecSize <= 512) // 64 bytes
1412 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1413 if (VecSize <= 1024) // 128 bytes
1414 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1415
1416 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1417 }
1418
1419 if (VecSize <= 32) // 4 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1421 if (VecSize <= 64) // 8 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1423 if (VecSize <= 96) // 12 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1425 if (VecSize <= 128) // 16 bytes
1426 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1427 if (VecSize <= 160) // 20 bytes
1428 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1429 if (VecSize <= 192) // 24 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
1431 if (VecSize <= 224) // 28 bytes
1432 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
1433 if (VecSize <= 256) // 32 bytes
1434 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1435 if (VecSize <= 288) // 36 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1437 if (VecSize <= 320) // 40 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1439 if (VecSize <= 352) // 44 bytes
1440 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1441 if (VecSize <= 384) // 48 bytes
1442 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1443 if (VecSize <= 512) // 64 bytes
1444 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1445 if (VecSize <= 1024) // 128 bytes
1446 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1447
1448 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1449}
1450
1451static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1452 if (VecSize <= 32) // 4 bytes
1453 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1454 if (VecSize <= 64) // 8 bytes
1455 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1456 if (VecSize <= 96) // 12 bytes
1457 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1458 if (VecSize <= 128) // 16 bytes
1459 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1460 if (VecSize <= 160) // 20 bytes
1461 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1462 if (VecSize <= 192) // 24 bytes
1463 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1464 if (VecSize <= 224) // 28 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1466 if (VecSize <= 256) // 32 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1468 if (VecSize <= 288) // 36 bytes
1469 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1470 if (VecSize <= 320) // 40 bytes
1471 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1472 if (VecSize <= 352) // 44 bytes
1473 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1474 if (VecSize <= 384) // 48 bytes
1475 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1476 if (VecSize <= 512) // 64 bytes
1477 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1478 if (VecSize <= 1024) // 128 bytes
1479 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1480
1481 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1482}
1483
1484static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1485 if (VecSize <= 32) // 4 bytes
1486 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1487 if (VecSize <= 64) // 8 bytes
1488 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1489 if (VecSize <= 96) // 12 bytes
1490 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1491 if (VecSize <= 128) // 16 bytes
1492 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1493 if (VecSize <= 160) // 20 bytes
1494 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1495 if (VecSize <= 192) // 24 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1497 if (VecSize <= 224) // 28 bytes
1498 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1499 if (VecSize <= 256) // 32 bytes
1500 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1501 if (VecSize <= 288) // 36 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1503 if (VecSize <= 320) // 40 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1505 if (VecSize <= 352) // 44 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1507 if (VecSize <= 384) // 48 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1509 if (VecSize <= 512) // 64 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1511 if (VecSize <= 1024) // 128 bytes
1512 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1513
1514 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1515}
1516
1517static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1518 if (VecSize <= 64) // 8 bytes
1519 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1520 if (VecSize <= 128) // 16 bytes
1521 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1522 if (VecSize <= 256) // 32 bytes
1523 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1524 if (VecSize <= 512) // 64 bytes
1525 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1526 if (VecSize <= 1024) // 128 bytes
1527 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1528
1529 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1530}
1531
1532const MCInstrDesc &
1533SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1534 bool IsSGPR) const {
1535 if (IsSGPR) {
1536 switch (EltSize) {
1537 case 32:
1538 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1539 case 64:
1540 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1541 default:
1542 llvm_unreachable("invalid reg indexing elt size");
1543 }
1544 }
1545
1546 assert(EltSize == 32 && "invalid reg indexing elt size");
1547 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1548}
1549
1550static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1551 switch (Size) {
1552 case 4:
1553 return AMDGPU::SI_SPILL_S32_SAVE;
1554 case 8:
1555 return AMDGPU::SI_SPILL_S64_SAVE;
1556 case 12:
1557 return AMDGPU::SI_SPILL_S96_SAVE;
1558 case 16:
1559 return AMDGPU::SI_SPILL_S128_SAVE;
1560 case 20:
1561 return AMDGPU::SI_SPILL_S160_SAVE;
1562 case 24:
1563 return AMDGPU::SI_SPILL_S192_SAVE;
1564 case 28:
1565 return AMDGPU::SI_SPILL_S224_SAVE;
1566 case 32:
1567 return AMDGPU::SI_SPILL_S256_SAVE;
1568 case 36:
1569 return AMDGPU::SI_SPILL_S288_SAVE;
1570 case 40:
1571 return AMDGPU::SI_SPILL_S320_SAVE;
1572 case 44:
1573 return AMDGPU::SI_SPILL_S352_SAVE;
1574 case 48:
1575 return AMDGPU::SI_SPILL_S384_SAVE;
1576 case 64:
1577 return AMDGPU::SI_SPILL_S512_SAVE;
1578 case 128:
1579 return AMDGPU::SI_SPILL_S1024_SAVE;
1580 default:
1581 llvm_unreachable("unknown register size");
1582 }
1583}
1584
1585static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1586 switch (Size) {
1587 case 2:
1588 return AMDGPU::SI_SPILL_V16_SAVE;
1589 case 4:
1590 return AMDGPU::SI_SPILL_V32_SAVE;
1591 case 8:
1592 return AMDGPU::SI_SPILL_V64_SAVE;
1593 case 12:
1594 return AMDGPU::SI_SPILL_V96_SAVE;
1595 case 16:
1596 return AMDGPU::SI_SPILL_V128_SAVE;
1597 case 20:
1598 return AMDGPU::SI_SPILL_V160_SAVE;
1599 case 24:
1600 return AMDGPU::SI_SPILL_V192_SAVE;
1601 case 28:
1602 return AMDGPU::SI_SPILL_V224_SAVE;
1603 case 32:
1604 return AMDGPU::SI_SPILL_V256_SAVE;
1605 case 36:
1606 return AMDGPU::SI_SPILL_V288_SAVE;
1607 case 40:
1608 return AMDGPU::SI_SPILL_V320_SAVE;
1609 case 44:
1610 return AMDGPU::SI_SPILL_V352_SAVE;
1611 case 48:
1612 return AMDGPU::SI_SPILL_V384_SAVE;
1613 case 64:
1614 return AMDGPU::SI_SPILL_V512_SAVE;
1615 case 128:
1616 return AMDGPU::SI_SPILL_V1024_SAVE;
1617 default:
1618 llvm_unreachable("unknown register size");
1619 }
1620}
1621
1622static unsigned getAVSpillSaveOpcode(unsigned Size) {
1623 switch (Size) {
1624 case 4:
1625 return AMDGPU::SI_SPILL_AV32_SAVE;
1626 case 8:
1627 return AMDGPU::SI_SPILL_AV64_SAVE;
1628 case 12:
1629 return AMDGPU::SI_SPILL_AV96_SAVE;
1630 case 16:
1631 return AMDGPU::SI_SPILL_AV128_SAVE;
1632 case 20:
1633 return AMDGPU::SI_SPILL_AV160_SAVE;
1634 case 24:
1635 return AMDGPU::SI_SPILL_AV192_SAVE;
1636 case 28:
1637 return AMDGPU::SI_SPILL_AV224_SAVE;
1638 case 32:
1639 return AMDGPU::SI_SPILL_AV256_SAVE;
1640 case 36:
1641 return AMDGPU::SI_SPILL_AV288_SAVE;
1642 case 40:
1643 return AMDGPU::SI_SPILL_AV320_SAVE;
1644 case 44:
1645 return AMDGPU::SI_SPILL_AV352_SAVE;
1646 case 48:
1647 return AMDGPU::SI_SPILL_AV384_SAVE;
1648 case 64:
1649 return AMDGPU::SI_SPILL_AV512_SAVE;
1650 case 128:
1651 return AMDGPU::SI_SPILL_AV1024_SAVE;
1652 default:
1653 llvm_unreachable("unknown register size");
1654 }
1655}
1656
1657static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1658 bool IsVectorSuperClass) {
1659 // Currently, there is only 32-bit WWM register spills needed.
1660 if (Size != 4)
1661 llvm_unreachable("unknown wwm register spill size");
1662
1663 if (IsVectorSuperClass)
1664 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1665
1666 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1667}
1668
1669unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1670 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1671 const SIMachineFunctionInfo &MFI) const {
1672 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1673
1674 // Choose the right opcode if spilling a WWM register.
1675 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1676 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1677
1678 // TODO: Check if AGPRs are available
1679 if (ST.hasMAIInsts())
1680 return getAVSpillSaveOpcode(Size);
1681
1682 return getVGPRSpillSaveOpcode(Size);
1683}
1684
1685void SIInstrInfo::storeRegToStackSlot(
1686 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1687 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1688 MachineInstr::MIFlag Flags) const {
1689 MachineFunction *MF = MBB.getParent();
1690 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1691 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1692 const DebugLoc &DL = MBB.findDebugLoc(MI);
1693
1694 MachinePointerInfo PtrInfo
1695 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1696 MachineMemOperand *MMO = MF->getMachineMemOperand(
1697 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1698 FrameInfo.getObjectAlign(FrameIndex));
1699 unsigned SpillSize = RI.getSpillSize(*RC);
1700
1701 MachineRegisterInfo &MRI = MF->getRegInfo();
1702 if (RI.isSGPRClass(RC)) {
1703 MFI->setHasSpilledSGPRs();
1704 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1705 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1706 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1707
1708 // We are only allowed to create one new instruction when spilling
1709    // registers, so we need to use a pseudo instruction for spilling SGPRs.
1710 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1711
1712    // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
1713 // to make sure we are using the correct register class.
1714 if (SrcReg.isVirtual() && SpillSize == 4) {
1715 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1716 }
1717
1718 BuildMI(MBB, MI, DL, OpDesc)
1719 .addReg(SrcReg, getKillRegState(isKill)) // data
1720 .addFrameIndex(FrameIndex) // addr
1721 .addMemOperand(MMO)
1722        .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1723
1724 if (RI.spillSGPRToVGPR())
1725 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1726 return;
1727 }
1728
1729 unsigned Opcode =
1730 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1731 MFI->setHasSpilledVGPRs();
1732
1733 BuildMI(MBB, MI, DL, get(Opcode))
1734 .addReg(SrcReg, getKillRegState(isKill)) // data
1735 .addFrameIndex(FrameIndex) // addr
1736 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1737 .addImm(0) // offset
1738 .addMemOperand(MMO);
1739}
1740
1741static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1742 switch (Size) {
1743 case 4:
1744 return AMDGPU::SI_SPILL_S32_RESTORE;
1745 case 8:
1746 return AMDGPU::SI_SPILL_S64_RESTORE;
1747 case 12:
1748 return AMDGPU::SI_SPILL_S96_RESTORE;
1749 case 16:
1750 return AMDGPU::SI_SPILL_S128_RESTORE;
1751 case 20:
1752 return AMDGPU::SI_SPILL_S160_RESTORE;
1753 case 24:
1754 return AMDGPU::SI_SPILL_S192_RESTORE;
1755 case 28:
1756 return AMDGPU::SI_SPILL_S224_RESTORE;
1757 case 32:
1758 return AMDGPU::SI_SPILL_S256_RESTORE;
1759 case 36:
1760 return AMDGPU::SI_SPILL_S288_RESTORE;
1761 case 40:
1762 return AMDGPU::SI_SPILL_S320_RESTORE;
1763 case 44:
1764 return AMDGPU::SI_SPILL_S352_RESTORE;
1765 case 48:
1766 return AMDGPU::SI_SPILL_S384_RESTORE;
1767 case 64:
1768 return AMDGPU::SI_SPILL_S512_RESTORE;
1769 case 128:
1770 return AMDGPU::SI_SPILL_S1024_RESTORE;
1771 default:
1772 llvm_unreachable("unknown register size");
1773 }
1774}
1775
1776static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1777 switch (Size) {
1778 case 2:
1779 return AMDGPU::SI_SPILL_V16_RESTORE;
1780 case 4:
1781 return AMDGPU::SI_SPILL_V32_RESTORE;
1782 case 8:
1783 return AMDGPU::SI_SPILL_V64_RESTORE;
1784 case 12:
1785 return AMDGPU::SI_SPILL_V96_RESTORE;
1786 case 16:
1787 return AMDGPU::SI_SPILL_V128_RESTORE;
1788 case 20:
1789 return AMDGPU::SI_SPILL_V160_RESTORE;
1790 case 24:
1791 return AMDGPU::SI_SPILL_V192_RESTORE;
1792 case 28:
1793 return AMDGPU::SI_SPILL_V224_RESTORE;
1794 case 32:
1795 return AMDGPU::SI_SPILL_V256_RESTORE;
1796 case 36:
1797 return AMDGPU::SI_SPILL_V288_RESTORE;
1798 case 40:
1799 return AMDGPU::SI_SPILL_V320_RESTORE;
1800 case 44:
1801 return AMDGPU::SI_SPILL_V352_RESTORE;
1802 case 48:
1803 return AMDGPU::SI_SPILL_V384_RESTORE;
1804 case 64:
1805 return AMDGPU::SI_SPILL_V512_RESTORE;
1806 case 128:
1807 return AMDGPU::SI_SPILL_V1024_RESTORE;
1808 default:
1809 llvm_unreachable("unknown register size");
1810 }
1811}
1812
1813static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1814 switch (Size) {
1815 case 4:
1816 return AMDGPU::SI_SPILL_AV32_RESTORE;
1817 case 8:
1818 return AMDGPU::SI_SPILL_AV64_RESTORE;
1819 case 12:
1820 return AMDGPU::SI_SPILL_AV96_RESTORE;
1821 case 16:
1822 return AMDGPU::SI_SPILL_AV128_RESTORE;
1823 case 20:
1824 return AMDGPU::SI_SPILL_AV160_RESTORE;
1825 case 24:
1826 return AMDGPU::SI_SPILL_AV192_RESTORE;
1827 case 28:
1828 return AMDGPU::SI_SPILL_AV224_RESTORE;
1829 case 32:
1830 return AMDGPU::SI_SPILL_AV256_RESTORE;
1831 case 36:
1832 return AMDGPU::SI_SPILL_AV288_RESTORE;
1833 case 40:
1834 return AMDGPU::SI_SPILL_AV320_RESTORE;
1835 case 44:
1836 return AMDGPU::SI_SPILL_AV352_RESTORE;
1837 case 48:
1838 return AMDGPU::SI_SPILL_AV384_RESTORE;
1839 case 64:
1840 return AMDGPU::SI_SPILL_AV512_RESTORE;
1841 case 128:
1842 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1843 default:
1844 llvm_unreachable("unknown register size");
1845 }
1846}
1847
1848static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1849 bool IsVectorSuperClass) {
1850 // Currently, only 32-bit WWM register spills are needed.
1851 if (Size != 4)
1852 llvm_unreachable("unknown wwm register spill size");
1853
1854 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1855 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1856
1857 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1858}
1859
1860unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
1861    Register Reg, const TargetRegisterClass *RC, unsigned Size,
1862 const SIMachineFunctionInfo &MFI) const {
1863 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1864
1865 // Choose the right opcode if restoring a WWM register.
1866  if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1867    return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1868
1869 // TODO: Check if AGPRs are available
1870 if (ST.hasMAIInsts())
1871    return getAVSpillRestoreOpcode(Size);
1872
1873 assert(!RI.isAGPRClass(RC));
1874  return getVGPRSpillRestoreOpcode(Size);
1875}
1876
1877void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1878                                       MachineBasicBlock::iterator MI,
1879                                       Register DestReg, int FrameIndex,
1880 const TargetRegisterClass *RC,
1881 Register VReg,
1882 MachineInstr::MIFlag Flags) const {
1883 MachineFunction *MF = MBB.getParent();
1884  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1885  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1886 const DebugLoc &DL = MBB.findDebugLoc(MI);
1887 unsigned SpillSize = RI.getSpillSize(*RC);
1888
1889 MachinePointerInfo PtrInfo
1890 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1891
1892  MachineMemOperand *MMO = MF->getMachineMemOperand(
1893      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1894 FrameInfo.getObjectAlign(FrameIndex));
1895
1896 if (RI.isSGPRClass(RC)) {
1897 MFI->setHasSpilledSGPRs();
1898 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1899 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1900 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1901
1902 // FIXME: Maybe this should not include a memoperand because it will be
1903 // lowered to non-memory instructions.
1904 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1905 if (DestReg.isVirtual() && SpillSize == 4) {
1906      MachineRegisterInfo &MRI = MF->getRegInfo();
1907      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1908 }
1909
1910 if (RI.spillSGPRToVGPR())
1911 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1912 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1913 .addFrameIndex(FrameIndex) // addr
1914 .addMemOperand(MMO)
1915        .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1916
1917 return;
1918 }
1919
1920 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1921 SpillSize, *MFI);
1922 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1923 .addFrameIndex(FrameIndex) // vaddr
1924 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1925 .addImm(0) // offset
1926 .addMemOperand(MMO);
1927}
1928
1933
1934void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
1935                              MachineBasicBlock::iterator MI,
1936                              unsigned Quantity) const {
1937 DebugLoc DL = MBB.findDebugLoc(MI);
1938 unsigned MaxSNopCount = 1u << ST.getSNopBits();
1939 while (Quantity > 0) {
1940 unsigned Arg = std::min(Quantity, MaxSNopCount);
1941 Quantity -= Arg;
1942 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1943 }
1944}
1945
1946void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1947  auto *MF = MBB.getParent();
1948 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1949
1950 assert(Info->isEntryFunction());
1951
1952 if (MBB.succ_empty()) {
1953 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1954 if (HasNoTerminator) {
1955 if (Info->returnsVoid()) {
1956 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1957 } else {
1958 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1959 }
1960 }
1961 }
1962}
1963
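// The simulated trap expansion below builds, roughly:
//   s_trap 2                        ; real trap; a nop under the PRIV=1 workaround
//   s_sendmsg_rtn_b32 <doorbell>    ; query the queue doorbell id
//   mask the id, set the queue-wave-abort bit, and s_sendmsg it via m0
//   s_branch halt_loop
// halt_loop:
//   s_sethalt 5
//   s_branch halt_loop
// The original m0 is preserved in ttmp2 around the sendmsg sequence. The
// returned block is the continuation block (or the halt loop when the trap
// ends the block), since the CFG has changed.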
1964MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
1965                                                    MachineBasicBlock &MBB,
1966                                                    MachineInstr &MI,
1967                                                    const DebugLoc &DL) const {
1968 MachineFunction *MF = MBB.getParent();
1969 constexpr unsigned DoorbellIDMask = 0x3ff;
1970 constexpr unsigned ECQueueWaveAbort = 0x400;
1971
1972 MachineBasicBlock *TrapBB = &MBB;
1973 MachineBasicBlock *ContBB = &MBB;
1974 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1975
1976 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1977 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1978 TrapBB = MF->CreateMachineBasicBlock();
1979 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1980 MF->push_back(TrapBB);
1981 MBB.addSuccessor(TrapBB);
1982 } else {
1983 // Since we're adding HaltLoopBB and modifying the CFG, we must return a
1984 // different block to signal the change.
1985 ContBB = HaltLoopBB;
1986 }
1987
1988  // Start with an `s_trap 2`; if we're in PRIV=1 and need the workaround, this
1989  // will be a nop.
1990 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
1991 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
1992 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1993 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
1994 DoorbellReg)
1996 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
1997 .addUse(AMDGPU::M0);
1998 Register DoorbellRegMasked =
1999 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2000 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2001 .addUse(DoorbellReg)
2002 .addImm(DoorbellIDMask);
2003 Register SetWaveAbortBit =
2004 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2005 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2006 .addUse(DoorbellRegMasked)
2007 .addImm(ECQueueWaveAbort);
2008 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2009 .addUse(SetWaveAbortBit);
2010 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2012 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2013 .addUse(AMDGPU::TTMP2);
2014 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2015 TrapBB->addSuccessor(HaltLoopBB);
2016
2017 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2018 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2019 .addMBB(HaltLoopBB);
2020 MF->push_back(HaltLoopBB);
2021 HaltLoopBB->addSuccessor(HaltLoopBB);
2022
2023 return ContBB;
2024}
2025
2026unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2027  switch (MI.getOpcode()) {
2028 default:
2029 if (MI.isMetaInstruction())
2030 return 0;
2031 return 1; // FIXME: Do wait states equal cycles?
2032
2033 case AMDGPU::S_NOP:
2034 return MI.getOperand(0).getImm() + 1;
2035 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2036    // hazard, even if one exists, won't really be visible. Should we handle it?
2037 }
2038}
2039
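// expandPostRAPseudo lowers the target pseudos that survive register
// allocation. Most *_term opcodes simply have their descriptor swapped for
// the real S_MOV/S_AND/S_OR/... instruction; wider moves are split when the
// subtarget lacks a native encoding. For example, without v_mov_b64 and with
// a non-inline 64-bit immediate, V_MOV_B64_PSEUDO becomes two V_MOV_B32_e32
// writes to sub0 and sub1 (or a single V_PK_MOV_B32 when both halves are the
// same inline constant and the subtarget supports it).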
2040bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2041  MachineBasicBlock &MBB = *MI.getParent();
2042 DebugLoc DL = MBB.findDebugLoc(MI);
2044 switch (MI.getOpcode()) {
2045 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2046 case AMDGPU::S_MOV_B64_term:
2047 // This is only a terminator to get the correct spill code placement during
2048 // register allocation.
2049 MI.setDesc(get(AMDGPU::S_MOV_B64));
2050 break;
2051
2052 case AMDGPU::S_MOV_B32_term:
2053 // This is only a terminator to get the correct spill code placement during
2054 // register allocation.
2055 MI.setDesc(get(AMDGPU::S_MOV_B32));
2056 break;
2057
2058 case AMDGPU::S_XOR_B64_term:
2059 // This is only a terminator to get the correct spill code placement during
2060 // register allocation.
2061 MI.setDesc(get(AMDGPU::S_XOR_B64));
2062 break;
2063
2064 case AMDGPU::S_XOR_B32_term:
2065 // This is only a terminator to get the correct spill code placement during
2066 // register allocation.
2067 MI.setDesc(get(AMDGPU::S_XOR_B32));
2068 break;
2069 case AMDGPU::S_OR_B64_term:
2070 // This is only a terminator to get the correct spill code placement during
2071 // register allocation.
2072 MI.setDesc(get(AMDGPU::S_OR_B64));
2073 break;
2074 case AMDGPU::S_OR_B32_term:
2075 // This is only a terminator to get the correct spill code placement during
2076 // register allocation.
2077 MI.setDesc(get(AMDGPU::S_OR_B32));
2078 break;
2079
2080 case AMDGPU::S_ANDN2_B64_term:
2081 // This is only a terminator to get the correct spill code placement during
2082 // register allocation.
2083 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2084 break;
2085
2086 case AMDGPU::S_ANDN2_B32_term:
2087 // This is only a terminator to get the correct spill code placement during
2088 // register allocation.
2089 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2090 break;
2091
2092 case AMDGPU::S_AND_B64_term:
2093 // This is only a terminator to get the correct spill code placement during
2094 // register allocation.
2095 MI.setDesc(get(AMDGPU::S_AND_B64));
2096 break;
2097
2098 case AMDGPU::S_AND_B32_term:
2099 // This is only a terminator to get the correct spill code placement during
2100 // register allocation.
2101 MI.setDesc(get(AMDGPU::S_AND_B32));
2102 break;
2103
2104 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2105 // This is only a terminator to get the correct spill code placement during
2106 // register allocation.
2107 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2108 break;
2109
2110 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2111 // This is only a terminator to get the correct spill code placement during
2112 // register allocation.
2113 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2114 break;
2115
2116 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2117 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2118 break;
2119
2120 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2121 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2122 break;
2123 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2124 Register Dst = MI.getOperand(0).getReg();
2125 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2126 MI.setDesc(
2127 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2128 break;
2129 }
2130 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2131 Register Dst = MI.getOperand(0).getReg();
2132 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2133 int64_t Imm = MI.getOperand(1).getImm();
2134
2135 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2136 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2137 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2140 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2141 .addImm(SignExtend64<32>(Imm >> 32))
2143 MI.eraseFromParent();
2144 break;
2145 }
2146
2147 [[fallthrough]];
2148 }
2149 case AMDGPU::V_MOV_B64_PSEUDO: {
2150 Register Dst = MI.getOperand(0).getReg();
2151 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2152 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2153
2154 const MachineOperand &SrcOp = MI.getOperand(1);
2155 // FIXME: Will this work for 64-bit floating point immediates?
2156 assert(!SrcOp.isFPImm());
2157 if (ST.hasMovB64()) {
2158 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2159 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2160 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2161 break;
2162 }
2163 if (SrcOp.isImm()) {
2164 APInt Imm(64, SrcOp.getImm());
2165 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2166 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2167 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2168 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2170 .addImm(Lo.getSExtValue())
2172 .addImm(Lo.getSExtValue())
2173 .addImm(0) // op_sel_lo
2174 .addImm(0) // op_sel_hi
2175 .addImm(0) // neg_lo
2176 .addImm(0) // neg_hi
2177 .addImm(0); // clamp
2178 } else {
2179 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2180 .addImm(Lo.getSExtValue())
2182 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2183 .addImm(Hi.getSExtValue())
2185 }
2186 } else {
2187 assert(SrcOp.isReg());
2188 if (ST.hasPkMovB32() &&
2189 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2190 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2191 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2192 .addReg(SrcOp.getReg())
2194 .addReg(SrcOp.getReg())
2195 .addImm(0) // op_sel_lo
2196 .addImm(0) // op_sel_hi
2197 .addImm(0) // neg_lo
2198 .addImm(0) // neg_hi
2199 .addImm(0); // clamp
2200 } else {
2201 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2202 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2204 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2205 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2207 }
2208 }
2209 MI.eraseFromParent();
2210 break;
2211 }
2212 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2214 break;
2215 }
2216 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2217 const MachineOperand &SrcOp = MI.getOperand(1);
2218 assert(!SrcOp.isFPImm());
2219
2220 if (ST.has64BitLiterals()) {
2221 MI.setDesc(get(AMDGPU::S_MOV_B64));
2222 break;
2223 }
2224
2225 APInt Imm(64, SrcOp.getImm());
2226 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2227 MI.setDesc(get(AMDGPU::S_MOV_B64));
2228 break;
2229 }
2230
2231 Register Dst = MI.getOperand(0).getReg();
2232 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2233 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2234
2235 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2236 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2237 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2238 .addImm(Lo.getSExtValue())
2240 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2241 .addImm(Hi.getSExtValue())
2243 MI.eraseFromParent();
2244 break;
2245 }
2246 case AMDGPU::V_SET_INACTIVE_B32: {
2247 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2248 Register DstReg = MI.getOperand(0).getReg();
2249 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2250 .add(MI.getOperand(3))
2251 .add(MI.getOperand(4))
2252 .add(MI.getOperand(1))
2253 .add(MI.getOperand(2))
2254 .add(MI.getOperand(5));
2255 MI.eraseFromParent();
2256 break;
2257 }
2258 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2259 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2260 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2261 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2262 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2263 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2264 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2265 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2266 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2267 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2268 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2269 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2270 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2271 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2272 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2273 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2274 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2275 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2276 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2277 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2278 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2279 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2280 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2281 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2282 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2283 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2284 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2285 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2286 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2287 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2288 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2289 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2290 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2291 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2292
2293 unsigned Opc;
2294 if (RI.hasVGPRs(EltRC)) {
2295 Opc = AMDGPU::V_MOVRELD_B32_e32;
2296 } else {
2297 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2298 : AMDGPU::S_MOVRELD_B32;
2299 }
2300
2301 const MCInstrDesc &OpDesc = get(Opc);
2302 Register VecReg = MI.getOperand(0).getReg();
2303 bool IsUndef = MI.getOperand(1).isUndef();
2304 unsigned SubReg = MI.getOperand(3).getImm();
2305 assert(VecReg == MI.getOperand(1).getReg());
2306
2308 BuildMI(MBB, MI, DL, OpDesc)
2309 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2310 .add(MI.getOperand(2))
2312 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2313
2314 const int ImpDefIdx =
2315 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2316 const int ImpUseIdx = ImpDefIdx + 1;
2317 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2318 MI.eraseFromParent();
2319 break;
2320 }
2321 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2322 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2323 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2324 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2325 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2326 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
2327 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
2328 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2329 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2330 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2331 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2332 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2333 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2334 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2335 assert(ST.useVGPRIndexMode());
2336 Register VecReg = MI.getOperand(0).getReg();
2337 bool IsUndef = MI.getOperand(1).isUndef();
2338 MachineOperand &Idx = MI.getOperand(3);
2339 Register SubReg = MI.getOperand(4).getImm();
2340
2341 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2342 .add(Idx)
2344 SetOn->getOperand(3).setIsUndef();
2345
2346 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2348 BuildMI(MBB, MI, DL, OpDesc)
2349 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2350 .add(MI.getOperand(2))
2352 .addReg(VecReg,
2353 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2354
2355 const int ImpDefIdx =
2356 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2357 const int ImpUseIdx = ImpDefIdx + 1;
2358 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2359
2360 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2361
2362 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2363
2364 MI.eraseFromParent();
2365 break;
2366 }
2367 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2368 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2369 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2370 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2371 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2372 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
2373 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
2374 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2375 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2376 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2377 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2378 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2379 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2380 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2381 assert(ST.useVGPRIndexMode());
2382 Register Dst = MI.getOperand(0).getReg();
2383 Register VecReg = MI.getOperand(1).getReg();
2384 bool IsUndef = MI.getOperand(1).isUndef();
2385 Register Idx = MI.getOperand(2).getReg();
2386 Register SubReg = MI.getOperand(3).getImm();
2387
2388 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2389 .addReg(Idx)
2391 SetOn->getOperand(3).setIsUndef();
2392
2393 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2394 .addDef(Dst)
2395 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2396 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2397
2398 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2399
2400 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2401
2402 MI.eraseFromParent();
2403 break;
2404 }
2405 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2406 MachineFunction &MF = *MBB.getParent();
2407 Register Reg = MI.getOperand(0).getReg();
2408 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2409 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2410 MachineOperand OpLo = MI.getOperand(1);
2411 MachineOperand OpHi = MI.getOperand(2);
2412
2413 // Create a bundle so these instructions won't be re-ordered by the
2414 // post-RA scheduler.
2415 MIBundleBuilder Bundler(MBB, MI);
2416 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2417
2418 // What we want here is an offset from the value returned by s_getpc (which
2419 // is the address of the s_add_u32 instruction) to the global variable, but
2420 // since the encoding of $symbol starts 4 bytes after the start of the
2421 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2422 // small. This requires us to add 4 to the global variable offset in order
2423 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2424 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2425 // instruction.
2426
2427 int64_t Adjust = 0;
2428 if (ST.hasGetPCZeroExtension()) {
2429 // Fix up hardware that does not sign-extend the 48-bit PC value by
2430 // inserting: s_sext_i32_i16 reghi, reghi
2431 Bundler.append(
2432 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2433 Adjust += 4;
2434 }
2435
2436 if (OpLo.isGlobal())
2437 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2438 Bundler.append(
2439 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2440
2441 if (OpHi.isGlobal())
2442 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2443 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2444 .addReg(RegHi)
2445 .add(OpHi));
2446
2447 finalizeBundle(MBB, Bundler.begin());
2448
2449 MI.eraseFromParent();
2450 break;
2451 }
2452 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2453 MachineFunction &MF = *MBB.getParent();
2454 Register Reg = MI.getOperand(0).getReg();
2455 MachineOperand Op = MI.getOperand(1);
2456
2457 // Create a bundle so these instructions won't be re-ordered by the
2458 // post-RA scheduler.
2459 MIBundleBuilder Bundler(MBB, MI);
2460 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2461 if (Op.isGlobal())
2462 Op.setOffset(Op.getOffset() + 4);
2463 Bundler.append(
2464 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2465
2466 finalizeBundle(MBB, Bundler.begin());
2467
2468 MI.eraseFromParent();
2469 break;
2470 }
2471 case AMDGPU::ENTER_STRICT_WWM: {
2472 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2473 // Whole Wave Mode is entered.
2474 MI.setDesc(get(LMC.OrSaveExecOpc));
2475 break;
2476 }
2477 case AMDGPU::ENTER_STRICT_WQM: {
2478 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2479 // STRICT_WQM is entered.
2480 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2481 .addReg(LMC.ExecReg);
2482 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2483
2484 MI.eraseFromParent();
2485 break;
2486 }
2487 case AMDGPU::EXIT_STRICT_WWM:
2488 case AMDGPU::EXIT_STRICT_WQM: {
2489 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2490    // WWM/STRICT_WQM is exited.
2491 MI.setDesc(get(LMC.MovOpc));
2492 break;
2493 }
2494 case AMDGPU::SI_RETURN: {
2495 const MachineFunction *MF = MBB.getParent();
2496 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2497 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2498 // Hiding the return address use with SI_RETURN may lead to extra kills in
2499 // the function and missing live-ins. We are fine in practice because callee
2500 // saved register handling ensures the register value is restored before
2501 // RET, but we need the undef flag here to appease the MachineVerifier
2502 // liveness checks.
2504 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2505 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2506
2507 MIB.copyImplicitOps(MI);
2508 MI.eraseFromParent();
2509 break;
2510 }
2511
2512 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2513 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2514 MI.setDesc(get(AMDGPU::S_MUL_U64));
2515 break;
2516
2517 case AMDGPU::S_GETPC_B64_pseudo:
2518 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2519 if (ST.hasGetPCZeroExtension()) {
2520 Register Dst = MI.getOperand(0).getReg();
2521 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2522 // Fix up hardware that does not sign-extend the 48-bit PC value by
2523 // inserting: s_sext_i32_i16 dsthi, dsthi
2524 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2525 DstHi)
2526 .addReg(DstHi);
2527 }
2528 break;
2529
2530 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2531 assert(ST.hasBF16PackedInsts());
2532 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2533 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2534 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2535 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2536 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2537 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2538 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2539 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2540 break;
2541 }
2542
2543 return true;
2544}
2545
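// reMaterialize normally defers to TargetInstrInfo, but for wide scalar loads
// (S_LOAD_DWORDX8/X16) whose single user reads only a 128- or 256-bit
// subregister, it instead rematerializes a narrower S_LOAD_DWORDX4/X8 with
// the immediate offset advanced accordingly, rather than reloading the full
// result.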
2546void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2547                                MachineBasicBlock::iterator I, Register DestReg,
2548                                unsigned SubIdx,
2549 const MachineInstr &Orig) const {
2550
2551  // Try shrinking the instruction to remat only the part needed for the current
2552 // context.
2553 // TODO: Handle more cases.
2554 unsigned Opcode = Orig.getOpcode();
2555 switch (Opcode) {
2556 case AMDGPU::S_LOAD_DWORDX16_IMM:
2557 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2558 if (SubIdx != 0)
2559 break;
2560
2561 if (I == MBB.end())
2562 break;
2563
2564 if (I->isBundled())
2565 break;
2566
2567 // Look for a single use of the register that is also a subreg.
2568 Register RegToFind = Orig.getOperand(0).getReg();
2569 MachineOperand *UseMO = nullptr;
2570 for (auto &CandMO : I->operands()) {
2571 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2572 continue;
2573 if (UseMO) {
2574 UseMO = nullptr;
2575 break;
2576 }
2577 UseMO = &CandMO;
2578 }
2579 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2580 break;
2581
2582 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2583 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2584
2585 MachineFunction *MF = MBB.getParent();
2587 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2588
2589 unsigned NewOpcode = -1;
2590 if (SubregSize == 256)
2591 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2592 else if (SubregSize == 128)
2593 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2594 else
2595 break;
2596
2597 const MCInstrDesc &TID = get(NewOpcode);
2598 const TargetRegisterClass *NewRC =
2599 RI.getAllocatableClass(getRegClass(TID, 0));
2600 MRI.setRegClass(DestReg, NewRC);
2601
2602 UseMO->setReg(DestReg);
2603 UseMO->setSubReg(AMDGPU::NoSubRegister);
2604
2605 // Use a smaller load with the desired size, possibly with updated offset.
2606 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2607 MI->setDesc(TID);
2608 MI->getOperand(0).setReg(DestReg);
2609 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2610 if (Offset) {
2611 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2612 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2613 OffsetMO->setImm(FinalOffset);
2614 }
2616 for (const MachineMemOperand *MemOp : Orig.memoperands())
2617 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2618 SubregSize / 8));
2619 MI->setMemRefs(*MF, NewMMOs);
2620
2621 MBB.insert(I, MI);
2622 return;
2623 }
2624
2625 default:
2626 break;
2627 }
2628
2629 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig);
2630}
2631
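// expandMovDPP64 keeps V_MOV_B64_DPP_PSEUDO as a single V_MOV_B64_dpp when
// the subtarget supports 64-bit DPP for the given control value; otherwise it
// is split into two V_MOV_B32_dpp instructions on sub0/sub1 whose results are
// stitched back together with a REG_SEQUENCE (for virtual destinations).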
2632std::pair<MachineInstr*, MachineInstr*>
2633SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2634  assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2635
2636 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2638 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2639 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2640 return std::pair(&MI, nullptr);
2641 }
2642
2643 MachineBasicBlock &MBB = *MI.getParent();
2644 DebugLoc DL = MBB.findDebugLoc(MI);
2645 MachineFunction *MF = MBB.getParent();
2647 Register Dst = MI.getOperand(0).getReg();
2648 unsigned Part = 0;
2649 MachineInstr *Split[2];
2650
2651 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2652 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2653 if (Dst.isPhysical()) {
2654 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2655 } else {
2656 assert(MRI.isSSA());
2657 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2658 MovDPP.addDef(Tmp);
2659 }
2660
2661 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2662 const MachineOperand &SrcOp = MI.getOperand(I);
2663 assert(!SrcOp.isFPImm());
2664 if (SrcOp.isImm()) {
2665 APInt Imm(64, SrcOp.getImm());
2666 Imm.ashrInPlace(Part * 32);
2667 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2668 } else {
2669 assert(SrcOp.isReg());
2670 Register Src = SrcOp.getReg();
2671 if (Src.isPhysical())
2672 MovDPP.addReg(RI.getSubReg(Src, Sub));
2673 else
2674 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2675 }
2676 }
2677
2678 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2679 MovDPP.addImm(MO.getImm());
2680
2681 Split[Part] = MovDPP;
2682 ++Part;
2683 }
2684
2685 if (Dst.isVirtual())
2686 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2687 .addReg(Split[0]->getOperand(0).getReg())
2688 .addImm(AMDGPU::sub0)
2689 .addReg(Split[1]->getOperand(0).getReg())
2690 .addImm(AMDGPU::sub1);
2691
2692 MI.eraseFromParent();
2693 return std::pair(Split[0], Split[1]);
2694}
2695
2696std::optional<DestSourcePair>
2698 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2699 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2700
2701 return std::nullopt;
2702}
2703
2704bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
2705                                      AMDGPU::OpName Src0OpName,
2706 MachineOperand &Src1,
2707 AMDGPU::OpName Src1OpName) const {
2708 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2709 if (!Src0Mods)
2710 return false;
2711
2712 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2713 assert(Src1Mods &&
2714 "All commutable instructions have both src0 and src1 modifiers");
2715
2716 int Src0ModsVal = Src0Mods->getImm();
2717 int Src1ModsVal = Src1Mods->getImm();
2718
2719 Src1Mods->setImm(Src0ModsVal);
2720 Src0Mods->setImm(Src1ModsVal);
2721 return true;
2722}
2723
2724static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2725                                             MachineOperand &RegOp,
2726 MachineOperand &NonRegOp) {
2727 Register Reg = RegOp.getReg();
2728 unsigned SubReg = RegOp.getSubReg();
2729 bool IsKill = RegOp.isKill();
2730 bool IsDead = RegOp.isDead();
2731 bool IsUndef = RegOp.isUndef();
2732 bool IsDebug = RegOp.isDebug();
2733
2734 if (NonRegOp.isImm())
2735 RegOp.ChangeToImmediate(NonRegOp.getImm());
2736 else if (NonRegOp.isFI())
2737 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2738 else if (NonRegOp.isGlobal()) {
2739 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2740 NonRegOp.getTargetFlags());
2741 } else
2742 return nullptr;
2743
2744 // Make sure we don't reinterpret a subreg index in the target flags.
2745 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2746
2747 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2748 NonRegOp.setSubReg(SubReg);
2749
2750 return &MI;
2751}
2752
2753static MachineInstr *swapImmOperands(MachineInstr &MI,
2754                                     MachineOperand &NonRegOp1,
2755 MachineOperand &NonRegOp2) {
2756 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2757 int64_t NonRegVal = NonRegOp1.getImm();
2758
2759 NonRegOp1.setImm(NonRegOp2.getImm());
2760 NonRegOp2.setImm(NonRegVal);
2761 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2762 NonRegOp2.setTargetFlags(TargetFlags);
2763 return &MI;
2764}
2765
2766bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2767 unsigned OpIdx1) const {
2768 const MCInstrDesc &InstDesc = MI.getDesc();
2769 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2770 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2771
2772 unsigned Opc = MI.getOpcode();
2773 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2774
2775 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2776 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2777
2778  // Swapping must not breach the constant bus or literal limits.
2779  // It may move a literal to a position other than src0, which is not allowed
2780  // pre-gfx10. However, most test cases need literals in Src0 for VOP.
2781  // FIXME: After gfx9, a literal can be in a position other than Src0.
2782 if (isVALU(MI)) {
2783 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2784 !isInlineConstant(MO0, OpInfo1))
2785 return false;
2786 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2787 !isInlineConstant(MO1, OpInfo0))
2788 return false;
2789 }
2790
2791 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2792 if (OpInfo1.RegClass == -1)
2793 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2794 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2795 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2796 }
2797 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2798 if (OpInfo0.RegClass == -1)
2799 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2800 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2801 isLegalRegOperand(MI, OpIdx0, MO1);
2802 }
2803
2804 // No need to check 64-bit literals since swapping does not bring new
2805  // 64-bit literals into the current instruction to fold to 32-bit.
2806
2807 return isImmOperandLegal(MI, OpIdx1, MO0);
2808}
2809
2810MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2811                                                  unsigned Src0Idx,
2812 unsigned Src1Idx) const {
2813 assert(!NewMI && "this should never be used");
2814
2815 unsigned Opc = MI.getOpcode();
2816 int CommutedOpcode = commuteOpcode(Opc);
2817 if (CommutedOpcode == -1)
2818 return nullptr;
2819
2820 if (Src0Idx > Src1Idx)
2821 std::swap(Src0Idx, Src1Idx);
2822
2823 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2824 static_cast<int>(Src0Idx) &&
2825 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2826 static_cast<int>(Src1Idx) &&
2827 "inconsistency with findCommutedOpIndices");
2828
2829 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2830 return nullptr;
2831
2832 MachineInstr *CommutedMI = nullptr;
2833 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2834 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2835 if (Src0.isReg() && Src1.isReg()) {
2836 // Be sure to copy the source modifiers to the right place.
2837 CommutedMI =
2838 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2839 } else if (Src0.isReg() && !Src1.isReg()) {
2840 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2841 } else if (!Src0.isReg() && Src1.isReg()) {
2842 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2843 } else if (Src0.isImm() && Src1.isImm()) {
2844 CommutedMI = swapImmOperands(MI, Src0, Src1);
2845 } else {
2846    // FIXME: Found two non-register operands to commute. This does happen.
2847 return nullptr;
2848 }
2849
2850 if (CommutedMI) {
2851 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2852 Src1, AMDGPU::OpName::src1_modifiers);
2853
2854 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2855 AMDGPU::OpName::src1_sel);
2856
2857 CommutedMI->setDesc(get(CommutedOpcode));
2858 }
2859
2860 return CommutedMI;
2861}
2862
2863// This needs to be implemented because the source modifiers may be inserted
2864// between the true commutable operands, and the base
2865// TargetInstrInfo::commuteInstruction uses it.
2866bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2867                                        unsigned &SrcOpIdx0,
2868 unsigned &SrcOpIdx1) const {
2869 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2870}
2871
2872bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2873                                        unsigned &SrcOpIdx0,
2874 unsigned &SrcOpIdx1) const {
2875 if (!Desc.isCommutable())
2876 return false;
2877
2878 unsigned Opc = Desc.getOpcode();
2879 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2880 if (Src0Idx == -1)
2881 return false;
2882
2883 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2884 if (Src1Idx == -1)
2885 return false;
2886
2887 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2888}
2889
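// Branch offsets are encoded in dwords relative to the instruction after the
// branch (PC += signext(SIMM16 * 4) + 4). As a rough worked example, with the
// full 16-bit SIMM16 field a byte offset B is in range when B/4 - 1 fits in a
// signed 16-bit value, i.e. roughly within +/-128 KiB.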
2890bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2891                                        int64_t BrOffset) const {
2892 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2893 // because its dest block is unanalyzable.
2894 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2895
2896 // Convert to dwords.
2897 BrOffset /= 4;
2898
2899 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2900 // from the next instruction.
2901 BrOffset -= 1;
2902
2903 return isIntN(BranchOffsetBits, BrOffset);
2904}
2905
2908 return MI.getOperand(0).getMBB();
2909}
2910
2912 for (const MachineInstr &MI : MBB->terminators()) {
2913 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2914 MI.getOpcode() == AMDGPU::SI_LOOP)
2915 return true;
2916 }
2917 return false;
2918}
2919
2921 MachineBasicBlock &DestBB,
2922 MachineBasicBlock &RestoreBB,
2923 const DebugLoc &DL, int64_t BrOffset,
2924 RegScavenger *RS) const {
2925 assert(MBB.empty() &&
2926 "new block should be inserted for expanding unconditional branch");
2927 assert(MBB.pred_size() == 1);
2928 assert(RestoreBB.empty() &&
2929 "restore block should be inserted for restoring clobbered registers");
2930
2931 MachineFunction *MF = MBB.getParent();
2934 auto I = MBB.end();
2935 auto &MCCtx = MF->getContext();
2936
2937 if (ST.hasAddPC64Inst()) {
2938 MCSymbol *Offset =
2939 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
2940 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
2942 MCSymbol *PostAddPCLabel =
2943 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
2944 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
2945 auto *OffsetExpr = MCBinaryExpr::createSub(
2946 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2947 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
2948 Offset->setVariableValue(OffsetExpr);
2949 return;
2950 }
2951
2952 assert(RS && "RegScavenger required for long branching");
2953
2954 // FIXME: Virtual register workaround for RegScavenger not working with empty
2955 // blocks.
2956 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2957
2958 // Note: as this is used after hazard recognizer we need to apply some hazard
2959 // workarounds directly.
2960 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2961 ST.hasVALUReadSGPRHazard();
2962 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2963 if (FlushSGPRWrites)
2964 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2966 };
2967
2968 // We need to compute the offset relative to the instruction immediately after
2969 // s_getpc_b64. Insert pc arithmetic code before last terminator.
2970 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2971 ApplyHazardWorkarounds();
2972
2973 MCSymbol *PostGetPCLabel =
2974 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2975 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2976
2977 MCSymbol *OffsetLo =
2978 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2979 MCSymbol *OffsetHi =
2980 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2981 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2982 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2983 .addReg(PCReg, 0, AMDGPU::sub0)
2984 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2985 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2986 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2987 .addReg(PCReg, 0, AMDGPU::sub1)
2988 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2989 ApplyHazardWorkarounds();
2990
2991 // Insert the indirect branch after the other terminator.
2992 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2993 .addReg(PCReg);
2994
2995 // If a spill is needed for the pc register pair, we need to insert a spill
2996 // restore block right before the destination block, and insert a short branch
2997 // into the old destination block's fallthrough predecessor.
2998 // e.g.:
2999 //
3000 // s_cbranch_scc0 skip_long_branch:
3001 //
3002 // long_branch_bb:
3003 // spill s[8:9]
3004 // s_getpc_b64 s[8:9]
3005 // s_add_u32 s8, s8, restore_bb
3006 // s_addc_u32 s9, s9, 0
3007 // s_setpc_b64 s[8:9]
3008 //
3009 // skip_long_branch:
3010 // foo;
3011 //
3012 // .....
3013 //
3014 // dest_bb_fallthrough_predecessor:
3015 // bar;
3016 // s_branch dest_bb
3017 //
3018 // restore_bb:
3019 // restore s[8:9]
3020 // fallthrough dest_bb
3021  //
3022 // dest_bb:
3023 // buzz;
3024
3025 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3026 Register Scav;
3027
3028 // If we've previously reserved a register for long branches
3029 // avoid running the scavenger and just use those registers
3030 if (LongBranchReservedReg) {
3031 RS->enterBasicBlock(MBB);
3032 Scav = LongBranchReservedReg;
3033 } else {
3034 RS->enterBasicBlockEnd(MBB);
3035 Scav = RS->scavengeRegisterBackwards(
3036 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3037 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3038 }
3039 if (Scav) {
3040 RS->setRegUsed(Scav);
3041 MRI.replaceRegWith(PCReg, Scav);
3042 MRI.clearVirtRegs();
3043 } else {
3044 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
3045 // SGPR spill.
3046 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3047 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3048 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3049 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3050 MRI.clearVirtRegs();
3051 }
3052
3053 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3054  // Now that the final destination is known, the distance can be defined.
3056 MCSymbolRefExpr::create(DestLabel, MCCtx),
3057 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3058 // Add offset assignments.
3059 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3060 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3061 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3062 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3063}
3064
3065unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3066 switch (Cond) {
3067 case SIInstrInfo::SCC_TRUE:
3068 return AMDGPU::S_CBRANCH_SCC1;
3069 case SIInstrInfo::SCC_FALSE:
3070 return AMDGPU::S_CBRANCH_SCC0;
3071 case SIInstrInfo::VCCNZ:
3072 return AMDGPU::S_CBRANCH_VCCNZ;
3073 case SIInstrInfo::VCCZ:
3074 return AMDGPU::S_CBRANCH_VCCZ;
3075 case SIInstrInfo::EXECNZ:
3076 return AMDGPU::S_CBRANCH_EXECNZ;
3077 case SIInstrInfo::EXECZ:
3078 return AMDGPU::S_CBRANCH_EXECZ;
3079 default:
3080 llvm_unreachable("invalid branch predicate");
3081 }
3082}
3083
3084SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3085 switch (Opcode) {
3086 case AMDGPU::S_CBRANCH_SCC0:
3087 return SCC_FALSE;
3088 case AMDGPU::S_CBRANCH_SCC1:
3089 return SCC_TRUE;
3090 case AMDGPU::S_CBRANCH_VCCNZ:
3091 return VCCNZ;
3092 case AMDGPU::S_CBRANCH_VCCZ:
3093 return VCCZ;
3094 case AMDGPU::S_CBRANCH_EXECNZ:
3095 return EXECNZ;
3096 case AMDGPU::S_CBRANCH_EXECZ:
3097 return EXECZ;
3098 default:
3099 return INVALID_BR;
3100 }
3101}
3102
3103bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3104                                    MachineBasicBlock::iterator I,
3105                                    MachineBasicBlock *&TBB,
3106                                    MachineBasicBlock *&FBB,
3107                                    SmallVectorImpl<MachineOperand> &Cond,
3108                                    bool AllowModify) const {
3109 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3110 // Unconditional Branch
3111 TBB = I->getOperand(0).getMBB();
3112 return false;
3113 }
3114
3115 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3116 if (Pred == INVALID_BR)
3117 return true;
3118
3119 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3120 Cond.push_back(MachineOperand::CreateImm(Pred));
3121 Cond.push_back(I->getOperand(1)); // Save the branch register.
3122
3123 ++I;
3124
3125 if (I == MBB.end()) {
3126 // Conditional branch followed by fall-through.
3127 TBB = CondBB;
3128 return false;
3129 }
3130
3131 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3132 TBB = CondBB;
3133 FBB = I->getOperand(0).getMBB();
3134 return false;
3135 }
3136
3137 return true;
3138}
3139
3140bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3141                                MachineBasicBlock *&FBB,
3142                                SmallVectorImpl<MachineOperand> &Cond,
3143                                bool AllowModify) const {
3144 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3145 auto E = MBB.end();
3146 if (I == E)
3147 return false;
3148
3149 // Skip over the instructions that are artificially terminators for special
3150 // exec management.
3151 while (I != E && !I->isBranch() && !I->isReturn()) {
3152 switch (I->getOpcode()) {
3153 case AMDGPU::S_MOV_B64_term:
3154 case AMDGPU::S_XOR_B64_term:
3155 case AMDGPU::S_OR_B64_term:
3156 case AMDGPU::S_ANDN2_B64_term:
3157 case AMDGPU::S_AND_B64_term:
3158 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3159 case AMDGPU::S_MOV_B32_term:
3160 case AMDGPU::S_XOR_B32_term:
3161 case AMDGPU::S_OR_B32_term:
3162 case AMDGPU::S_ANDN2_B32_term:
3163 case AMDGPU::S_AND_B32_term:
3164 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3165 break;
3166 case AMDGPU::SI_IF:
3167 case AMDGPU::SI_ELSE:
3168 case AMDGPU::SI_KILL_I1_TERMINATOR:
3169 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3170 // FIXME: It's messy that these need to be considered here at all.
3171 return true;
3172 default:
3173 llvm_unreachable("unexpected non-branch terminator inst");
3174 }
3175
3176 ++I;
3177 }
3178
3179 if (I == E)
3180 return false;
3181
3182 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3183}
3184
3186 int *BytesRemoved) const {
3187 unsigned Count = 0;
3188 unsigned RemovedSize = 0;
3189 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3190 // Skip over artificial terminators when removing instructions.
3191 if (MI.isBranch() || MI.isReturn()) {
3192 RemovedSize += getInstSizeInBytes(MI);
3193 MI.eraseFromParent();
3194 ++Count;
3195 }
3196 }
3197
3198 if (BytesRemoved)
3199 *BytesRemoved = RemovedSize;
3200
3201 return Count;
3202}
3203
3204// Copy the flags onto the implicit condition register operand.
3206 const MachineOperand &OrigCond) {
3207 CondReg.setIsUndef(OrigCond.isUndef());
3208 CondReg.setIsKill(OrigCond.isKill());
3209}
3210
3213 MachineBasicBlock *FBB,
3215 const DebugLoc &DL,
3216 int *BytesAdded) const {
3217 if (!FBB && Cond.empty()) {
3218 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3219 .addMBB(TBB);
3220 if (BytesAdded)
3221 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3222 return 1;
3223 }
3224
3225 assert(TBB && Cond[0].isImm());
3226
3227 unsigned Opcode
3228 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3229
3230 if (!FBB) {
3231 MachineInstr *CondBr =
3232 BuildMI(&MBB, DL, get(Opcode))
3233 .addMBB(TBB);
3234
3235 // Copy the flags onto the implicit condition register operand.
3236 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3237 fixImplicitOperands(*CondBr);
3238
3239 if (BytesAdded)
3240 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3241 return 1;
3242 }
3243
3244 assert(TBB && FBB);
3245
3246 MachineInstr *CondBr =
3247 BuildMI(&MBB, DL, get(Opcode))
3248 .addMBB(TBB);
3249 fixImplicitOperands(*CondBr);
3250 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3251 .addMBB(FBB);
3252
3253 MachineOperand &CondReg = CondBr->getOperand(1);
3254 CondReg.setIsUndef(Cond[1].isUndef());
3255 CondReg.setIsKill(Cond[1].isKill());
3256
3257 if (BytesAdded)
3258 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3259
3260 return 2;
3261}
3262
3265 if (Cond.size() != 2) {
3266 return true;
3267 }
3268
3269 if (Cond[0].isImm()) {
3270 Cond[0].setImm(-Cond[0].getImm());
3271 return false;
3272 }
3273
3274 return true;
3275}
3276
3279 Register DstReg, Register TrueReg,
3280 Register FalseReg, int &CondCycles,
3281 int &TrueCycles, int &FalseCycles) const {
3282 switch (Cond[0].getImm()) {
3283 case VCCNZ:
3284 case VCCZ: {
3285 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3286 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3287 if (MRI.getRegClass(FalseReg) != RC)
3288 return false;
3289
3290 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3291 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3292
3293 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3294 return RI.hasVGPRs(RC) && NumInsts <= 6;
3295 }
3296 case SCC_TRUE:
3297 case SCC_FALSE: {
3298 // FIXME: We could insert for VGPRs if we could replace the original compare
3299 // with a vector one.
3300 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3301 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3302 if (MRI.getRegClass(FalseReg) != RC)
3303 return false;
3304
3305 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3306
3307    // Sizes that are a multiple of 64 bits can use s_cselect_b64.
3308 if (NumInsts % 2 == 0)
3309 NumInsts /= 2;
3310
3311 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3312 return RI.isSGPRClass(RC);
3313 }
3314 default:
3315 return false;
3316 }
3317}
3318
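// insertSelect emits a single s_cselect/v_cndmask for 32-bit (and SCC-based
// 64-bit) selects. Wider values are decomposed per 32-bit element for
// VCC-based selects, or per 64-bit pair for SCC-based ones, and the pieces
// are reassembled with a REG_SEQUENCE. Note the v_cndmask operand order: the
// false value comes first.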
3319void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3320                               MachineBasicBlock::iterator I, const DebugLoc &DL,
3321                               Register DstReg, ArrayRef<MachineOperand> Cond,
3322                               Register TrueReg, Register FalseReg) const {
3323 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3324 if (Pred == VCCZ || Pred == SCC_FALSE) {
3325 Pred = static_cast<BranchPredicate>(-Pred);
3326 std::swap(TrueReg, FalseReg);
3327 }
3328
3329 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3330 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3331 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3332
3333 if (DstSize == 32) {
3335 if (Pred == SCC_TRUE) {
3336 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3337 .addReg(TrueReg)
3338 .addReg(FalseReg);
3339 } else {
3340 // Instruction's operands are backwards from what is expected.
3341 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3342 .addReg(FalseReg)
3343 .addReg(TrueReg);
3344 }
3345
3346 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3347 return;
3348 }
3349
3350 if (DstSize == 64 && Pred == SCC_TRUE) {
3352 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3353 .addReg(TrueReg)
3354 .addReg(FalseReg);
3355
3356 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3357 return;
3358 }
3359
3360 static const int16_t Sub0_15[] = {
3361 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3362 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3363 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3364 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3365 };
3366
3367 static const int16_t Sub0_15_64[] = {
3368 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3369 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3370 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3371 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3372 };
3373
3374 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3375 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3376 const int16_t *SubIndices = Sub0_15;
3377 int NElts = DstSize / 32;
3378
3379 // 64-bit select is only available for SALU.
3380 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3381 if (Pred == SCC_TRUE) {
3382 if (NElts % 2) {
3383 SelOp = AMDGPU::S_CSELECT_B32;
3384 EltRC = &AMDGPU::SGPR_32RegClass;
3385 } else {
3386 SelOp = AMDGPU::S_CSELECT_B64;
3387 EltRC = &AMDGPU::SGPR_64RegClass;
3388 SubIndices = Sub0_15_64;
3389 NElts /= 2;
3390 }
3391 }
3392
3394 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3395
3396 I = MIB->getIterator();
3397
3399 for (int Idx = 0; Idx != NElts; ++Idx) {
3400 Register DstElt = MRI.createVirtualRegister(EltRC);
3401 Regs.push_back(DstElt);
3402
3403 unsigned SubIdx = SubIndices[Idx];
3404
3406 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3407 Select =
3408 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3409 .addReg(FalseReg, 0, SubIdx)
3410 .addReg(TrueReg, 0, SubIdx);
3411 } else {
3412 Select =
3413 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3414 .addReg(TrueReg, 0, SubIdx)
3415 .addReg(FalseReg, 0, SubIdx);
3416 }
3417
3418 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3420
3421 MIB.addReg(DstElt)
3422 .addImm(SubIdx);
3423 }
3424}
3425
3427 switch (MI.getOpcode()) {
3428 case AMDGPU::V_MOV_B16_t16_e32:
3429 case AMDGPU::V_MOV_B16_t16_e64:
3430 case AMDGPU::V_MOV_B32_e32:
3431 case AMDGPU::V_MOV_B32_e64:
3432 case AMDGPU::V_MOV_B64_PSEUDO:
3433 case AMDGPU::V_MOV_B64_e32:
3434 case AMDGPU::V_MOV_B64_e64:
3435 case AMDGPU::S_MOV_B32:
3436 case AMDGPU::S_MOV_B64:
3437 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3438 case AMDGPU::COPY:
3439 case AMDGPU::WWM_COPY:
3440 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3441 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3442 case AMDGPU::V_ACCVGPR_MOV_B32:
3443 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3444 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3445 return true;
3446 default:
3447 return false;
3448 }
3449}
3450
3452 switch (MI.getOpcode()) {
3453 case AMDGPU::V_MOV_B16_t16_e32:
3454 case AMDGPU::V_MOV_B16_t16_e64:
3455 return 2;
3456 case AMDGPU::V_MOV_B32_e32:
3457 case AMDGPU::V_MOV_B32_e64:
3458 case AMDGPU::V_MOV_B64_PSEUDO:
3459 case AMDGPU::V_MOV_B64_e32:
3460 case AMDGPU::V_MOV_B64_e64:
3461 case AMDGPU::S_MOV_B32:
3462 case AMDGPU::S_MOV_B64:
3463 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3464 case AMDGPU::COPY:
3465 case AMDGPU::WWM_COPY:
3466 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3467 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3468 case AMDGPU::V_ACCVGPR_MOV_B32:
3469 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3470 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3471 return 1;
3472 default:
3473 llvm_unreachable("MI is not a foldable copy");
3474 }
3475}
3476
3477static constexpr AMDGPU::OpName ModifierOpNames[] = {
3478 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3479 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3480 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3481
3482void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3483 unsigned Opc = MI.getOpcode();
3484 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3485 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3486 if (Idx >= 0)
3487 MI.removeOperand(Idx);
3488 }
3489}
3490
3492 const MCInstrDesc &NewDesc) const {
3493 MI.setDesc(NewDesc);
3494
3495 // Remove any leftover implicit operands from mutating the instruction. e.g.
3496 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
3497 // anymore.
3498 const MCInstrDesc &Desc = MI.getDesc();
3499 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3500 Desc.implicit_defs().size();
3501
3502 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3503 MI.removeOperand(I);
3504}
3505
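// Interpret \p Imm as the value of a wide register and return the portion
// selected by \p SubRegIndex, sign-extended to 64 bits; e.g. for
// SubRegIndex == AMDGPU::sub1 this returns the sign-extended high 32 bits.
// Unhandled subregister indexes yield std::nullopt.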
3506std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3507 unsigned SubRegIndex) {
3508 switch (SubRegIndex) {
3509 case AMDGPU::NoSubRegister:
3510 return Imm;
3511 case AMDGPU::sub0:
3512 return SignExtend64<32>(Imm);
3513 case AMDGPU::sub1:
3514 return SignExtend64<32>(Imm >> 32);
3515 case AMDGPU::lo16:
3516 return SignExtend64<16>(Imm);
3517 case AMDGPU::hi16:
3518 return SignExtend64<16>(Imm >> 16);
3519 case AMDGPU::sub1_lo16:
3520 return SignExtend64<16>(Imm >> 32);
3521 case AMDGPU::sub1_hi16:
3522 return SignExtend64<16>(Imm >> 48);
3523 default:
3524 return std::nullopt;
3525 }
3526
3527 llvm_unreachable("covered subregister switch");
3528}
3529
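// Map a MAC/MAD/FMA-family opcode to its "AK" form, where the addend is a
// literal constant: v_fmaak_f32 v0, v1, v2, K computes v0 = v1 * v2 + K.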
3530static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3531 switch (Opc) {
3532 case AMDGPU::V_MAC_F16_e32:
3533 case AMDGPU::V_MAC_F16_e64:
3534 case AMDGPU::V_MAD_F16_e64:
3535 return AMDGPU::V_MADAK_F16;
3536 case AMDGPU::V_MAC_F32_e32:
3537 case AMDGPU::V_MAC_F32_e64:
3538 case AMDGPU::V_MAD_F32_e64:
3539 return AMDGPU::V_MADAK_F32;
3540 case AMDGPU::V_FMAC_F32_e32:
3541 case AMDGPU::V_FMAC_F32_e64:
3542 case AMDGPU::V_FMA_F32_e64:
3543 return AMDGPU::V_FMAAK_F32;
3544 case AMDGPU::V_FMAC_F16_e32:
3545 case AMDGPU::V_FMAC_F16_e64:
3546 case AMDGPU::V_FMAC_F16_t16_e64:
3547 case AMDGPU::V_FMAC_F16_fake16_e64:
3548 case AMDGPU::V_FMA_F16_e64:
3549 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3550 ? AMDGPU::V_FMAAK_F16_t16
3551 : AMDGPU::V_FMAAK_F16_fake16
3552 : AMDGPU::V_FMAAK_F16;
3553 case AMDGPU::V_FMAC_F64_e32:
3554 case AMDGPU::V_FMAC_F64_e64:
3555 case AMDGPU::V_FMA_F64_e64:
3556 return AMDGPU::V_FMAAK_F64;
3557 default:
3558 llvm_unreachable("invalid instruction");
3559 }
3560}
3561
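// Map a MAC/MAD/FMA-family opcode to its "MK" form, where the multiplier is
// a literal constant: v_fmamk_f32 v0, v1, K, v3 computes v0 = v1 * K + v3.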
3562static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3563 switch (Opc) {
3564 case AMDGPU::V_MAC_F16_e32:
3565 case AMDGPU::V_MAC_F16_e64:
3566 case AMDGPU::V_MAD_F16_e64:
3567 return AMDGPU::V_MADMK_F16;
3568 case AMDGPU::V_MAC_F32_e32:
3569 case AMDGPU::V_MAC_F32_e64:
3570 case AMDGPU::V_MAD_F32_e64:
3571 return AMDGPU::V_MADMK_F32;
3572 case AMDGPU::V_FMAC_F32_e32:
3573 case AMDGPU::V_FMAC_F32_e64:
3574 case AMDGPU::V_FMA_F32_e64:
3575 return AMDGPU::V_FMAMK_F32;
3576 case AMDGPU::V_FMAC_F16_e32:
3577 case AMDGPU::V_FMAC_F16_e64:
3578 case AMDGPU::V_FMAC_F16_t16_e64:
3579 case AMDGPU::V_FMAC_F16_fake16_e64:
3580 case AMDGPU::V_FMA_F16_e64:
3581 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3582 ? AMDGPU::V_FMAMK_F16_t16
3583 : AMDGPU::V_FMAMK_F16_fake16
3584 : AMDGPU::V_FMAMK_F16;
3585 case AMDGPU::V_FMAC_F64_e32:
3586 case AMDGPU::V_FMAC_F64_e64:
3587 case AMDGPU::V_FMA_F64_e64:
3588 return AMDGPU::V_FMAMK_F64;
3589 default:
3590 llvm_unreachable("invalid instruction");
3591 }
3592}
3593
3594bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3595 Register Reg, MachineRegisterInfo *MRI) const {
3596 int64_t Imm;
3597 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3598 return false;
3599
3600 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3601
3602 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3603
3604 unsigned Opc = UseMI.getOpcode();
3605 if (Opc == AMDGPU::COPY) {
3606 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3607
3608 Register DstReg = UseMI.getOperand(0).getReg();
3609 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3610
3611 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3612
3613 if (HasMultipleUses) {
3614 // TODO: This should fold in more cases with multiple uses, but we need to
3615 // consider more carefully what those uses are.
3616 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3617
3618 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3619 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3620 return false;
3621
3622 // Most of the time folding a 32-bit inline constant is free (though this
3623 // might not be true if we can't later fold it into a real user).
3624 //
3625 // FIXME: This isInlineConstant check is imprecise if
3626 // getConstValDefinedInReg handled the tricky non-mov cases.
3627 if (ImmDefSize == 32 &&
3628 isInlineConstant(Imm, AMDGPU::OPERAND_REG_IMM_INT32))
3629 return false;
3630 }
3631
3632 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3633 RI.getSubRegIdxSize(UseSubReg) == 16;
3634
3635 if (Is16Bit) {
3636 if (RI.hasVGPRs(DstRC))
3637 return false; // Do not clobber vgpr_hi16
3638
3639 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3640 return false;
3641 }
3642
3643 MachineFunction *MF = UseMI.getMF();
3644
3645 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3646 MCRegister MovDstPhysReg =
3647 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3648
3649 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3650
3651 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3652 for (unsigned MovOp :
3653 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3654 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3655 const MCInstrDesc &MovDesc = get(MovOp);
3656
3657 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
3658 if (Is16Bit) {
3659 // We just need to find a correctly sized register class, so the
3660 // subregister index compatibility doesn't matter since we're statically
3661 // extracting the immediate value.
3662 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3663 if (!MovDstRC)
3664 continue;
3665
3666 if (MovDstPhysReg) {
3667 // FIXME: We probably should not do this. If there is a live value in
3668 // the high half of the register, it will be corrupted.
3669 MovDstPhysReg =
3670 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3671 if (!MovDstPhysReg)
3672 continue;
3673 }
3674 }
3675
3676 // Result class isn't the right size, try the next instruction.
3677 if (MovDstPhysReg) {
3678 if (!MovDstRC->contains(MovDstPhysReg))
3679 return false;
3680 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3681 // TODO: This will be overly conservative in the case of 16-bit virtual
3682 // SGPRs. We could hack up the virtual register uses to use a compatible
3683 // 32-bit class.
3684 continue;
3685 }
3686
3687 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3688
3689 // Ensure the interpreted immediate value is a valid operand in the new
3690 // mov.
3691 //
3692 // FIXME: isImmOperandLegal should have a form that doesn't require an
3693 // existing MachineInstr or MachineOperand.
3694 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3695 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3696 break;
3697
3698 NewOpc = MovOp;
3699 break;
3700 }
3701
3702 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3703 return false;
3704
3705 if (Is16Bit) {
3706 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3707 if (MovDstPhysReg)
3708 UseMI.getOperand(0).setReg(MovDstPhysReg);
3709 assert(UseMI.getOperand(1).getReg().isVirtual());
3710 }
3711
3712 const MCInstrDesc &NewMCID = get(NewOpc);
3713 UseMI.setDesc(NewMCID);
3714 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3715 UseMI.addImplicitDefUseOperands(*MF);
3716 return true;
3717 }
3718
3719 if (HasMultipleUses)
3720 return false;
3721
3722 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3723 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3724 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3725 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3726 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3727 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3728 Opc == AMDGPU::V_FMAC_F64_e64) {
3729 // Don't fold if we are using source or output modifiers. The new VOP2
3730 // instructions don't have them.
3731 if (hasAnyModifiersSet(UseMI))
3732 return false;
3733
3734 // If this is a free constant, there's no reason to do this.
3735 // TODO: We could fold this here instead of letting SIFoldOperands do it
3736 // later.
3737 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3738
3739 // Any src operand can be used for the legality check.
3740 if (isInlineConstant(UseMI, Src0Idx, Imm))
3741 return false;
3742
3743 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3744
3745 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3746 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3747
3748 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3749 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3750 (Src1->isReg() && Src1->getReg() == Reg)) {
3751 MachineOperand *RegSrc =
3752 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3753 if (!RegSrc->isReg())
3754 return false;
3755 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3756 ST.getConstantBusLimit(Opc) < 2)
3757 return false;
3758
3759 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3760 return false;
3761
3762 // If src2 is also a literal constant then we have to choose which one to
3763 // fold. In general it is better to choose madak so that the other literal
3764 // can be materialized in an sgpr instead of a vgpr:
3765 // s_mov_b32 s0, literal
3766 // v_madak_f32 v0, s0, v0, literal
3767 // Instead of:
3768 // v_mov_b32 v1, literal
3769 // v_madmk_f32 v0, v0, literal, v1
3770 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3771 if (Def && Def->isMoveImmediate() &&
3772 !isInlineConstant(Def->getOperand(1)))
3773 return false;
3774
3775 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3776 if (pseudoToMCOpcode(NewOpc) == -1)
3777 return false;
3778
3779 // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3780 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3781 // restricting their register classes. For now just bail out.
3782 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3783 NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3784 return false;
3785
3786 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3787 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3788
3789 // FIXME: This would be a lot easier if we could return a new instruction
3790 // instead of having to modify in place.
3791
3792 Register SrcReg = RegSrc->getReg();
3793 unsigned SrcSubReg = RegSrc->getSubReg();
3794 Src0->setReg(SrcReg);
3795 Src0->setSubReg(SrcSubReg);
3796 Src0->setIsKill(RegSrc->isKill());
3797
3798 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3799 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3800 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3801 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3802 UseMI.untieRegOperand(
3803 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3804
3805 Src1->ChangeToImmediate(*SubRegImm);
3806
3807 removeModOperands(UseMI);
3808 UseMI.setDesc(get(NewOpc));
3809
3810 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3811 if (DeleteDef)
3812 DefMI.eraseFromParent();
3813
3814 return true;
3815 }
3816
3817 // Added part is the constant: Use v_madak_{f16, f32}.
3818 if (Src2->isReg() && Src2->getReg() == Reg) {
3819 if (ST.getConstantBusLimit(Opc) < 2) {
3820 // Not allowed to use constant bus for another operand.
3821 // We can however allow an inline immediate as src0.
3822 bool Src0Inlined = false;
3823 if (Src0->isReg()) {
3824 // Try to inline the constant if possible.
3825 // If the def is a move-immediate and this is its only use, we save a
3826 // VGPR here.
3827 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3828 if (Def && Def->isMoveImmediate() &&
3829 isInlineConstant(Def->getOperand(1)) &&
3830 MRI->hasOneNonDBGUse(Src0->getReg())) {
3831 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3832 Src0Inlined = true;
3833 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3834 RI.isSGPRReg(*MRI, Src0->getReg())) {
3835 return false;
3836 }
3837 // VGPR is okay as Src0 - fallthrough
3838 }
3839
3840 if (Src1->isReg() && !Src0Inlined) {
3841 // We still have one slot for an inlinable constant - try to fill it
3842 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3843 if (Def && Def->isMoveImmediate() &&
3844 isInlineConstant(Def->getOperand(1)) &&
3845 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3846 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3847 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3848 return false;
3849 // VGPR is okay as Src1 - fallthrough
3850 }
3851 }
3852
3853 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3854 if (pseudoToMCOpcode(NewOpc) == -1)
3855 return false;
3856
3857 // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3858 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3859 // restricting their register classes. For now just bail out.
3860 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3861 NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3862 return false;
3863
3864 // FIXME: This would be a lot easier if we could return a new instruction
3865 // instead of having to modify in place.
3866
3867 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3868 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3869 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3870 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3871 UseMI.untieRegOperand(
3872 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3873
3874 const std::optional<int64_t> SubRegImm =
3875 extractSubregFromImm(Imm, Src2->getSubReg());
3876
3877 // ChangeToImmediate adds Src2 back to the instruction.
3878 Src2->ChangeToImmediate(*SubRegImm);
3879
3880 // These come before src2.
3881 removeModOperands(UseMI);
3882 UseMI.setDesc(get(NewOpc));
3883 // It might happen that UseMI was commuted and we now have an SGPR as
3884 // SRC1. If so, an inline constant together with an SGPR would be
3885 // illegal, so legalize the operands.
3886 legalizeOperands(UseMI);
3887
3888 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3889 if (DeleteDef)
3890 DefMI.eraseFromParent();
3891
3892 return true;
3893 }
3894 }
3895
3896 return false;
3897}
3898
3899static bool
3900memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3901 ArrayRef<const MachineOperand *> BaseOps2) {
3902 if (BaseOps1.size() != BaseOps2.size())
3903 return false;
3904 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3905 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3906 return false;
3907 }
3908 return true;
3909}
3910
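// Return true if two memory accesses with known widths and offsets from the
// same base cannot overlap, i.e. the lower access ends at or before the
// higher one begins (e.g. 4 bytes at offset 0 vs. 4 bytes at offset 4).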
3911static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3912 LocationSize WidthB, int OffsetB) {
3913 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3914 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3915 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3916 return LowWidth.hasValue() &&
3917 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3918}
3919
3920bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3921 const MachineInstr &MIb) const {
3922 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3923 int64_t Offset0, Offset1;
3924 LocationSize Dummy0 = LocationSize::precise(0);
3925 LocationSize Dummy1 = LocationSize::precise(0);
3926 bool Offset0IsScalable, Offset1IsScalable;
3927 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3928 Dummy0, &RI) ||
3929 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3930 Dummy1, &RI))
3931 return false;
3932
3933 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3934 return false;
3935
3936 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3937 // FIXME: Handle ds_read2 / ds_write2.
3938 return false;
3939 }
3940 LocationSize Width0 = MIa.memoperands().front()->getSize();
3941 LocationSize Width1 = MIb.memoperands().front()->getSize();
3942 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3943}
3944
3945bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3946 const MachineInstr &MIb) const {
3947 assert(MIa.mayLoadOrStore() &&
3948 "MIa must load from or modify a memory location");
3949 assert(MIb.mayLoadOrStore() &&
3950 "MIb must load from or modify a memory location");
3951
3952 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3953 return false;
3954
3955 // XXX - Can we relax this between address spaces?
3956 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3957 return false;
3958
3959 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3960 return false;
3961
3962 if (MIa.isBundle() || MIb.isBundle())
3963 return false;
3964
3965 // TODO: Should we check the address space from the MachineMemOperand? That
3966 // would allow us to distinguish objects we know don't alias based on the
3967 // underlying address space, even if it was lowered to a different one,
3968 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3969 // buffer.
3970 if (isDS(MIa)) {
3971 if (isDS(MIb))
3972 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3973
3974 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3975 }
3976
3977 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3978 if (isMUBUF(MIb) || isMTBUF(MIb))
3979 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3980
3981 if (isFLAT(MIb))
3982 return isFLATScratch(MIb);
3983
3984 return !isSMRD(MIb);
3985 }
3986
3987 if (isSMRD(MIa)) {
3988 if (isSMRD(MIb))
3989 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3990
3991 if (isFLAT(MIb))
3992 return isFLATScratch(MIb);
3993
3994 return !isMUBUF(MIb) && !isMTBUF(MIb);
3995 }
3996
3997 if (isFLAT(MIa)) {
3998 if (isFLAT(MIb)) {
3999 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
4000 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
4001 return true;
4002
4003 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4004 }
4005
4006 return false;
4007 }
4008
4009 return false;
4010}
4011
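// If \p Reg is defined by a foldable immediate-materializing instruction,
// return true and report the immediate in \p Imm (and the defining
// instruction in \p DefMI if requested).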
4012static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
4013 int64_t &Imm, MachineInstr **DefMI = nullptr) {
4014 if (Reg.isPhysical())
4015 return false;
4016 auto *Def = MRI.getUniqueVRegDef(Reg);
4017 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
4018 Imm = Def->getOperand(1).getImm();
4019 if (DefMI)
4020 *DefMI = Def;
4021 return true;
4022 }
4023 return false;
4024}
4025
4026static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4027 MachineInstr **DefMI = nullptr) {
4028 if (!MO->isReg())
4029 return false;
4030 const MachineFunction *MF = MO->getParent()->getMF();
4031 const MachineRegisterInfo &MRI = MF->getRegInfo();
4032 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
4033}
4034
4035static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
4036 MachineInstr &NewMI) {
4037 if (LV) {
4038 unsigned NumOps = MI.getNumOperands();
4039 for (unsigned I = 1; I < NumOps; ++I) {
4040 MachineOperand &Op = MI.getOperand(I);
4041 if (Op.isReg() && Op.isKill())
4042 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4043 }
4044 }
4045}
4046
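// Map a two-address MAC/FMAC opcode to the corresponding three-address
// MAD/FMA opcode used by the three-address conversion below.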
4047static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4048 switch (Opc) {
4049 case AMDGPU::V_MAC_F16_e32:
4050 case AMDGPU::V_MAC_F16_e64:
4051 return AMDGPU::V_MAD_F16_e64;
4052 case AMDGPU::V_MAC_F32_e32:
4053 case AMDGPU::V_MAC_F32_e64:
4054 return AMDGPU::V_MAD_F32_e64;
4055 case AMDGPU::V_MAC_LEGACY_F32_e32:
4056 case AMDGPU::V_MAC_LEGACY_F32_e64:
4057 return AMDGPU::V_MAD_LEGACY_F32_e64;
4058 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4059 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4060 return AMDGPU::V_FMA_LEGACY_F32_e64;
4061 case AMDGPU::V_FMAC_F16_e32:
4062 case AMDGPU::V_FMAC_F16_e64:
4063 case AMDGPU::V_FMAC_F16_t16_e64:
4064 case AMDGPU::V_FMAC_F16_fake16_e64:
4065 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4066 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4067 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4068 : AMDGPU::V_FMA_F16_gfx9_e64;
4069 case AMDGPU::V_FMAC_F32_e32:
4070 case AMDGPU::V_FMAC_F32_e64:
4071 return AMDGPU::V_FMA_F32_e64;
4072 case AMDGPU::V_FMAC_F64_e32:
4073 case AMDGPU::V_FMAC_F64_e64:
4074 return AMDGPU::V_FMA_F64_e64;
4075 default:
4076 llvm_unreachable("invalid instruction");
4077 }
4078}
4079
4080/// Helper struct for the implementation of 3-address conversion to communicate
4081/// updates made to instruction operands.
4082struct ThreeAddressUpdates {
4083 /// Other instruction whose def is no longer used by the converted
4084 /// instruction.
4085 MachineInstr *RemoveMIUse = nullptr;
4086};
4087
4088MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4089 LiveVariables *LV,
4090 LiveIntervals *LIS) const {
4091 MachineBasicBlock &MBB = *MI.getParent();
4092 MachineInstr *CandidateMI = &MI;
4093
4094 if (MI.isBundle()) {
4095 // This is a temporary placeholder for bundle handling that enables us to
4096 // exercise the relevant code paths in the two-address instruction pass.
4097 if (MI.getBundleSize() != 1)
4098 return nullptr;
4099 CandidateMI = MI.getNextNode();
4100 }
4101
4102 ThreeAddressUpdates U;
4103 MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
4104 if (!NewMI)
4105 return nullptr;
4106
4107 if (MI.isBundle()) {
4108 CandidateMI->eraseFromBundle();
4109
4110 for (MachineOperand &MO : MI.all_defs()) {
4111 if (MO.isTied())
4112 MI.untieRegOperand(MO.getOperandNo());
4113 }
4114 } else {
4115 updateLiveVariables(LV, MI, *NewMI);
4116 if (LIS) {
4117 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
4118 // SlotIndex of defs needs to be updated when converting to early-clobber
4119 MachineOperand &Def = NewMI->getOperand(0);
4120 if (Def.isEarlyClobber() && Def.isReg() &&
4121 LIS->hasInterval(Def.getReg())) {
4122 SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
4123 SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
4124 auto &LI = LIS->getInterval(Def.getReg());
4125 auto UpdateDefIndex = [&](LiveRange &LR) {
4126 auto *S = LR.find(OldIndex);
4127 if (S != LR.end() && S->start == OldIndex) {
4128 assert(S->valno && S->valno->def == OldIndex);
4129 S->start = NewIndex;
4130 S->valno->def = NewIndex;
4131 }
4132 };
4133 UpdateDefIndex(LI);
4134 for (auto &SR : LI.subranges())
4135 UpdateDefIndex(SR);
4136 }
4137 }
4138 }
4139
4140 if (U.RemoveMIUse) {
4141 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4142 // The only user is the instruction which will be killed.
4143 Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
4144
4145 if (MRI.hasOneNonDBGUse(DefReg)) {
4146 // We cannot just remove the DefMI here; the calling pass will crash.
4147 U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
4148 U.RemoveMIUse->getOperand(0).setIsDead(true);
4149 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4150 U.RemoveMIUse->removeOperand(I);
4151 if (LV)
4152 LV->getVarInfo(DefReg).AliveBlocks.clear();
4153 }
4154
4155 if (MI.isBundle()) {
4156 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4157 if (!VRI.Reads && !VRI.Writes) {
4158 for (MachineOperand &MO : MI.all_uses()) {
4159 if (MO.isReg() && MO.getReg() == DefReg) {
4160 assert(MO.getSubReg() == 0 &&
4161 "tied sub-registers in bundles currently not supported");
4162 MI.removeOperand(MO.getOperandNo());
4163 break;
4164 }
4165 }
4166
4167 if (LIS)
4168 LIS->shrinkToUses(&LIS->getInterval(DefReg));
4169 }
4170 } else if (LIS) {
4171 LiveInterval &DefLI = LIS->getInterval(DefReg);
4172
4173 // We cannot delete the original instruction here, so hack out the use
4174 // in the original instruction with a dummy register so we can use
4175 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4176 // not have the complexity of deleting a use to consider here.
4177 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4178 for (MachineOperand &MIOp : MI.uses()) {
4179 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4180 MIOp.setIsUndef(true);
4181 MIOp.setReg(DummyReg);
4182 }
4183 }
4184
4185 if (MI.isBundle()) {
4186 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4187 if (!VRI.Reads && !VRI.Writes) {
4188 for (MachineOperand &MIOp : MI.uses()) {
4189 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4190 MIOp.setIsUndef(true);
4191 MIOp.setReg(DummyReg);
4192 }
4193 }
4194 }
4195
4196 MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false,
4197 false, /*isUndef=*/true));
4198 }
4199
4200 LIS->shrinkToUses(&DefLI);
4201 }
4202 }
4203
4204 return MI.isBundle() ? &MI : NewMI;
4205}
4206
4207MachineInstr *
4208SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4209 ThreeAddressUpdates &U) const {
4210 MachineBasicBlock &MBB = *MI.getParent();
4211 unsigned Opc = MI.getOpcode();
4212
4213 // Handle MFMA.
4214 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4215 if (NewMFMAOpc != -1) {
4216 MachineInstrBuilder MIB =
4217 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4218 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4219 MIB.add(MI.getOperand(I));
4220 return MIB;
4221 }
4222
4223 if (SIInstrInfo::isWMMA(MI)) {
4224 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4225 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4226 .setMIFlags(MI.getFlags());
4227 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4228 MIB->addOperand(MI.getOperand(I));
4229 return MIB;
4230 }
4231
4232 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4233 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4234 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4235 "present pre-RA");
4236
4237 // Handle MAC/FMAC.
4238 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4239 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4240 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4241 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4242 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4243 bool Src0Literal = false;
4244
4245 switch (Opc) {
4246 default:
4247 return nullptr;
4248 case AMDGPU::V_MAC_F16_e64:
4249 case AMDGPU::V_FMAC_F16_e64:
4250 case AMDGPU::V_FMAC_F16_t16_e64:
4251 case AMDGPU::V_FMAC_F16_fake16_e64:
4252 case AMDGPU::V_MAC_F32_e64:
4253 case AMDGPU::V_MAC_LEGACY_F32_e64:
4254 case AMDGPU::V_FMAC_F32_e64:
4255 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4256 case AMDGPU::V_FMAC_F64_e64:
4257 break;
4258 case AMDGPU::V_MAC_F16_e32:
4259 case AMDGPU::V_FMAC_F16_e32:
4260 case AMDGPU::V_MAC_F32_e32:
4261 case AMDGPU::V_MAC_LEGACY_F32_e32:
4262 case AMDGPU::V_FMAC_F32_e32:
4263 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4264 case AMDGPU::V_FMAC_F64_e32: {
4265 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4266 AMDGPU::OpName::src0);
4267 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4268 if (!Src0->isReg() && !Src0->isImm())
4269 return nullptr;
4270
4271 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4272 Src0Literal = true;
4273
4274 break;
4275 }
4276 }
4277
4278 MachineInstrBuilder MIB;
4279 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4280 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4281 const MachineOperand *Src0Mods =
4282 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4283 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4284 const MachineOperand *Src1Mods =
4285 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4286 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4287 const MachineOperand *Src2Mods =
4288 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4289 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4290 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4291 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4292
4293 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4294 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4295 // If we have an SGPR input, we will violate the constant bus restriction.
4296 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4297 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4298 MachineInstr *DefMI;
4299
4300 int64_t Imm;
4301 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4302 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4303 if (pseudoToMCOpcode(NewOpc) != -1) {
4304 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4305 .add(*Dst)
4306 .add(*Src0)
4307 .add(*Src1)
4308 .addImm(Imm)
4309 .setMIFlags(MI.getFlags());
4310 U.RemoveMIUse = DefMI;
4311 return MIB;
4312 }
4313 }
4314 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4315 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4316 if (pseudoToMCOpcode(NewOpc) != -1) {
4317 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4318 .add(*Dst)
4319 .add(*Src0)
4320 .addImm(Imm)
4321 .add(*Src2)
4322 .setMIFlags(MI.getFlags());
4323 U.RemoveMIUse = DefMI;
4324 return MIB;
4325 }
4326 }
4327 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4328 if (Src0Literal) {
4329 Imm = Src0->getImm();
4330 DefMI = nullptr;
4331 }
4332 if (pseudoToMCOpcode(NewOpc) != -1 &&
4333 isOperandLegal(
4334 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4335 Src1)) {
4336 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4337 .add(*Dst)
4338 .add(*Src1)
4339 .addImm(Imm)
4340 .add(*Src2)
4341 .setMIFlags(MI.getFlags());
4342 U.RemoveMIUse = DefMI;
4343 return MIB;
4344 }
4345 }
4346 }
4347
4348 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4349 // if VOP3 does not allow a literal operand.
4350 if (Src0Literal && !ST.hasVOP3Literal())
4351 return nullptr;
4352
4353 unsigned NewOpc = getNewFMAInst(ST, Opc);
4354
4355 if (pseudoToMCOpcode(NewOpc) == -1)
4356 return nullptr;
4357
4358 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4359 .add(*Dst)
4360 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4361 .add(*Src0)
4362 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4363 .add(*Src1)
4364 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4365 .add(*Src2)
4366 .addImm(Clamp ? Clamp->getImm() : 0)
4367 .addImm(Omod ? Omod->getImm() : 0)
4368 .setMIFlags(MI.getFlags());
4369 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4370 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4371 return MIB;
4372}
4373
4374// It's not generally safe to move VALU instructions across these since it will
4375// start using the register as a base index rather than directly.
4376// XXX - Why isn't hasSideEffects sufficient for these?
4377static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4378 switch (MI.getOpcode()) {
4379 case AMDGPU::S_SET_GPR_IDX_ON:
4380 case AMDGPU::S_SET_GPR_IDX_MODE:
4381 case AMDGPU::S_SET_GPR_IDX_OFF:
4382 return true;
4383 default:
4384 return false;
4385 }
4386}
4387
4388bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4389 const MachineBasicBlock *MBB,
4390 const MachineFunction &MF) const {
4391 // Skipping the check for SP writes in the base implementation. The reason it
4392 // was added was apparently due to compile time concerns.
4393 //
4394 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4395 // but is probably avoidable.
4396
4397 // Copied from base implementation.
4398 // Terminators and labels can't be scheduled around.
4399 if (MI.isTerminator() || MI.isPosition())
4400 return true;
4401
4402 // INLINEASM_BR can jump to another block
4403 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4404 return true;
4405
4406 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4407 return true;
4408
4409 // Target-independent instructions do not have an implicit-use of EXEC, even
4410 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4411 // boundaries prevents incorrect movements of such instructions.
4412 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4413 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4414 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4415 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4416 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4417 changesVGPRIndexingMode(MI);
4418}
4419
4420bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4421 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4422 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4423 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4424}
4425
4427 // Instructions that access scratch use FLAT encoding or BUF encodings.
4428 if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
4429 return false;
4430
4431 // If scratch is not initialized, we can never access it.
4432 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4433 return false;
4434
4435 // SCRATCH instructions always access scratch.
4436 if (isFLATScratch(MI))
4437 return true;
4438
4439 // If there are no memory operands then conservatively assume the flat
4440 // operation may access scratch.
4441 if (MI.memoperands_empty())
4442 return true;
4443
4444 // See if any memory operand specifies an address space that involves scratch.
4445 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4446 unsigned AS = Memop->getAddrSpace();
4447 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4448 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4449 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4450 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4451 }
4452 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4453 });
4454}
4455
4456bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
4457 assert(isFLAT(MI));
4458
4459 // All flat instructions use the VMEM counter except prefetch.
4460 if (!usesVM_CNT(MI))
4461 return false;
4462
4463 // If there are no memory operands then conservatively assume the flat
4464 // operation may access VMEM.
4465 if (MI.memoperands_empty())
4466 return true;
4467
4468 // See if any memory operand specifies an address space that involves VMEM.
4469 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
4470 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4471 // (GDS) address space is not supported by flat operations. Therefore, simply
4472 // return true unless only the LDS address space is found.
4473 for (const MachineMemOperand *Memop : MI.memoperands()) {
4474 unsigned AS = Memop->getAddrSpace();
4476 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4477 return true;
4478 }
4479
4480 return false;
4481}
4482
4483bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
4484 assert(isFLAT(MI));
4485
4486 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
4487 if (!usesLGKM_CNT(MI))
4488 return false;
4489
4490 // If in tgsplit mode then there can be no use of LDS.
4491 if (ST.isTgSplitEnabled())
4492 return false;
4493
4494 // If there are no memory operands then conservatively assume the flat
4495 // operation may access LDS.
4496 if (MI.memoperands_empty())
4497 return true;
4498
4499 // See if any memory operand specifies an address space that involves LDS.
4500 for (const MachineMemOperand *Memop : MI.memoperands()) {
4501 unsigned AS = Memop->getAddrSpace();
4502 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
4503 return true;
4504 }
4505
4506 return false;
4507}
4508
4509bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4510 // Skip the full operand and register alias search that modifiesRegister
4511 // does. There are only a handful of instructions that touch this, it's only
4512 // an implicit def, and it doesn't alias any other registers.
4513 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4514}
4515
4517 unsigned Opcode = MI.getOpcode();
4518
4519 if (MI.mayStore() && isSMRD(MI))
4520 return true; // scalar store or atomic
4521
4522 // This will terminate the function when other lanes may need to continue.
4523 if (MI.isReturn())
4524 return true;
4525
4526 // These instructions cause shader I/O that may cause hardware lockups
4527 // when executed with an empty EXEC mask.
4528 //
4529 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4530 // EXEC = 0, but checking for that case here seems not worth it
4531 // given the typical code patterns.
4532 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4533 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4534 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4535 return true;
4536
4537 if (MI.isCall() || MI.isInlineAsm())
4538 return true; // conservative assumption
4539
4540 // Assume that barrier interactions are only intended with active lanes.
4541 if (isBarrier(Opcode))
4542 return true;
4543
4544 // A mode change is a scalar operation that influences vector instructions.
4545 if (modifiesModeRegister(MI))
4546 return true;
4547
4548 // These are like SALU instructions in terms of effects, so it's questionable
4549 // whether we should return true for those.
4550 //
4551 // However, executing them with EXEC = 0 causes them to operate on undefined
4552 // data, which we avoid by returning true here.
4553 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4554 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4555 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4556 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4557 return true;
4558
4559 return false;
4560}
4561
4562bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4563 const MachineInstr &MI) const {
4564 if (MI.isMetaInstruction())
4565 return false;
4566
4567 // This won't read exec if this is an SGPR->SGPR copy.
4568 if (MI.isCopyLike()) {
4569 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4570 return true;
4571
4572 // Make sure this isn't copying exec as a normal operand
4573 return MI.readsRegister(AMDGPU::EXEC, &RI);
4574 }
4575
4576 // Make a conservative assumption about the callee.
4577 if (MI.isCall())
4578 return true;
4579
4580 // Be conservative with any unhandled generic opcodes.
4581 if (!isTargetSpecificOpcode(MI.getOpcode()))
4582 return true;
4583
4584 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4585}
4586
4587bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4588 switch (Imm.getBitWidth()) {
4589 case 1: // This likely will be a condition code mask.
4590 return true;
4591
4592 case 32:
4593 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4594 ST.hasInv2PiInlineImm());
4595 case 64:
4596 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4597 ST.hasInv2PiInlineImm());
4598 case 16:
4599 return ST.has16BitInsts() &&
4600 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4601 ST.hasInv2PiInlineImm());
4602 default:
4603 llvm_unreachable("invalid bitwidth");
4604 }
4605}
4606
4607bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4608 APInt IntImm = Imm.bitcastToAPInt();
4609 int64_t IntImmVal = IntImm.getSExtValue();
4610 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4611 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4612 default:
4613 llvm_unreachable("invalid fltSemantics");
4614 case APFloatBase::S_IEEEsingle:
4615 case APFloatBase::S_IEEEdouble:
4616 return isInlineConstant(IntImm);
4617 case APFloatBase::S_BFloat:
4618 return ST.has16BitInsts() &&
4619 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4620 case APFloatBase::S_IEEEhalf:
4621 return ST.has16BitInsts() &&
4622 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4623 }
4624}
4625
4626bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4627 // MachineOperand provides no way to tell the true operand size, since it only
4628 // records a 64-bit value. We need to know the size to determine if a 32-bit
4629 // floating point immediate bit pattern is legal for an integer immediate. It
4630 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4631 switch (OperandType) {
4641 int32_t Trunc = static_cast<int32_t>(Imm);
4642 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4643 }
4649 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4652 // We would expect inline immediates to not be concerned with an integer/fp
4653 // distinction. However, in the case of 16-bit integer operations, the
4654 // "floating point" values appear to not work. It seems read the low 16-bits
4655 // of 32-bit immediates, which happens to always work for the integer
4656 // values.
4657 //
4658 // See llvm bugzilla 46302.
4659 //
4660 // TODO: Theoretically we could use op-sel to use the high bits of the
4661 // 32-bit FP values.
4673 return false;
4676 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4677 // A few special case instructions have 16-bit operands on subtargets
4678 // where 16-bit instructions are not legal.
4679 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4680 // constants in these cases
4681 int16_t Trunc = static_cast<int16_t>(Imm);
4682 return ST.has16BitInsts() &&
4683 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4684 }
4685
4686 return false;
4687 }
4690 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4691 int16_t Trunc = static_cast<int16_t>(Imm);
4692 return ST.has16BitInsts() &&
4693 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4694 }
4695 return false;
4696 }
4700 return false;
4702 return isLegalAV64PseudoImm(Imm);
4705 // Always embedded in the instruction for free.
4706 return true;
4716 // Just ignore anything else.
4717 return true;
4718 default:
4719 llvm_unreachable("invalid operand type");
4720 }
4721}
4722
4723static bool compareMachineOp(const MachineOperand &Op0,
4724 const MachineOperand &Op1) {
4725 if (Op0.getType() != Op1.getType())
4726 return false;
4727
4728 switch (Op0.getType()) {
4729 case MachineOperand::MO_Register:
4730 return Op0.getReg() == Op1.getReg();
4731 case MachineOperand::MO_Immediate:
4732 return Op0.getImm() == Op1.getImm();
4733 default:
4734 llvm_unreachable("Didn't expect to be comparing these operand types");
4735 }
4736}
4737
4739 const MCOperandInfo &OpInfo) const {
4740 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4741 return true;
4742
4743 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4744 return false;
4745
4746 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4747 return true;
4748
4749 return ST.hasVOP3Literal();
4750}
4751
4752bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4753 int64_t ImmVal) const {
4754 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4755 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4756 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4757 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4758 AMDGPU::OpName::src2))
4759 return false;
4760 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4761 }
4762
4763 return isLiteralOperandLegal(InstDesc, OpInfo);
4764}
4765
4766bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4767 const MachineOperand &MO) const {
4768 if (MO.isImm())
4769 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4770
4771 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4772 "unexpected imm-like operand kind");
4773 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4774 return isLiteralOperandLegal(InstDesc, OpInfo);
4775}
4776
4778 // 2 32-bit inline constants packed into one.
4779 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4780 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4781}
4782
4783bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4784 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4785 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4786 return false;
4787
4788 int Op32 = AMDGPU::getVOPe32(Opcode);
4789 if (Op32 == -1)
4790 return false;
4791
4792 return pseudoToMCOpcode(Op32) != -1;
4793}
4794
4795bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4796 // The src0_modifiers operand is present on all instructions
4797 // that have modifiers.
4798
4799 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4800}
4801
4802bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4803 AMDGPU::OpName OpName) const {
4804 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4805 return Mods && Mods->getImm();
4806}
4807
4808bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4809 return any_of(ModifierOpNames,
4810 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4811}
4812
4813bool SIInstrInfo::canShrink(const MachineInstr &MI,
4814 const MachineRegisterInfo &MRI) const {
4815 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4816 // Can't shrink instruction with three operands.
4817 if (Src2) {
4818 switch (MI.getOpcode()) {
4819 default: return false;
4820
4821 case AMDGPU::V_ADDC_U32_e64:
4822 case AMDGPU::V_SUBB_U32_e64:
4823 case AMDGPU::V_SUBBREV_U32_e64: {
4824 const MachineOperand *Src1
4825 = getNamedOperand(MI, AMDGPU::OpName::src1);
4826 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4827 return false;
4828 // Additional verification is needed for sdst/src2.
4829 return true;
4830 }
4831 case AMDGPU::V_MAC_F16_e64:
4832 case AMDGPU::V_MAC_F32_e64:
4833 case AMDGPU::V_MAC_LEGACY_F32_e64:
4834 case AMDGPU::V_FMAC_F16_e64:
4835 case AMDGPU::V_FMAC_F16_t16_e64:
4836 case AMDGPU::V_FMAC_F16_fake16_e64:
4837 case AMDGPU::V_FMAC_F32_e64:
4838 case AMDGPU::V_FMAC_F64_e64:
4839 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4840 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4841 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4842 return false;
4843 break;
4844
4845 case AMDGPU::V_CNDMASK_B32_e64:
4846 break;
4847 }
4848 }
4849
4850 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4851 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4852 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4853 return false;
4854
4855 // We don't need to check src0, all input types are legal, so just make sure
4856 // src0 isn't using any modifiers.
4857 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4858 return false;
4859
4860 // Can it be shrunk to a valid 32 bit opcode?
4861 if (!hasVALU32BitEncoding(MI.getOpcode()))
4862 return false;
4863
4864 // Check output modifiers
4865 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4866 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4867 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4868 // TODO: Can we avoid checking bound_ctrl/fi here?
4869 // They are only used by permlane*_swap special case.
4870 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4871 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4872}
4873
4874// Set VCC operand with all flags from \p Orig, except for setting it as
4875// implicit.
4876static void copyFlagsToImplicitVCC(MachineInstr &MI,
4877 const MachineOperand &Orig) {
4878
4879 for (MachineOperand &Use : MI.implicit_operands()) {
4880 if (Use.isUse() &&
4881 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4882 Use.setIsUndef(Orig.isUndef());
4883 Use.setIsKill(Orig.isKill());
4884 return;
4885 }
4886 }
4887}
4888
4889MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4890 unsigned Op32) const {
4891 MachineBasicBlock *MBB = MI.getParent();
4892
4893 const MCInstrDesc &Op32Desc = get(Op32);
4894 MachineInstrBuilder Inst32 =
4895 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4896 .setMIFlags(MI.getFlags());
4897
4898 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4899 // For VOPC instructions, this is replaced by an implicit def of vcc.
4900
4901 // We assume the defs of the shrunk opcode are in the same order, and the
4902 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4903 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4904 Inst32.add(MI.getOperand(I));
4905
4906 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4907
4908 int Idx = MI.getNumExplicitDefs();
4909 for (const MachineOperand &Use : MI.explicit_uses()) {
4910 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4911 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4912 continue;
4913
4914 if (&Use == Src2) {
4915 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4916 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4917 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4918 // of vcc was already added during the initial BuildMI, but we
4919 // 1) may need to change vcc to vcc_lo to preserve the original register
4920 // 2) have to preserve the original flags.
4921 copyFlagsToImplicitVCC(*Inst32, *Src2);
4922 continue;
4923 }
4924 }
4925
4926 Inst32.add(Use);
4927 }
4928
4929 // FIXME: Losing implicit operands
4930 fixImplicitOperands(*Inst32);
4931 return Inst32;
4932}
4933
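// Scalar operands (SGPRs, literal constants, and a few implicit registers
// such as VCC and M0) compete for the constant bus; a VALU instruction may
// only read a limited number of them (see ST.getConstantBusLimit()).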
4934bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
4935 // Null is free
4936 Register Reg = RegOp.getReg();
4937 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
4938 return false;
4939
4940 // SGPRs use the constant bus
4941
4942 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
4943 // physical register operands should also count, except for exec.
4944 if (RegOp.isImplicit())
4945 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
4946
4947 // SGPRs use the constant bus
4948 return AMDGPU::SReg_32RegClass.contains(Reg) ||
4949 AMDGPU::SReg_64RegClass.contains(Reg);
4950}
4951
4953 const MachineRegisterInfo &MRI) const {
4954 Register Reg = RegOp.getReg();
4955 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4956 : physRegUsesConstantBus(RegOp);
4957}
4958
4959bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4960 const MachineOperand &MO,
4961 const MCOperandInfo &OpInfo) const {
4962 // Literal constants use the constant bus.
4963 if (!MO.isReg())
4964 return !isInlineConstant(MO, OpInfo);
4965
4966 Register Reg = MO.getReg();
4967 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4968 : physRegUsesConstantBus(MO);
4969 }
4970
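// Scan the implicit operands for a read of a special scalar register
// (VCC, M0, FLAT_SCR) that would occupy the constant bus.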
4972 for (const MachineOperand &MO : MI.implicit_operands()) {
4973 // We only care about reads.
4974 if (MO.isDef())
4975 continue;
4976
4977 switch (MO.getReg()) {
4978 case AMDGPU::VCC:
4979 case AMDGPU::VCC_LO:
4980 case AMDGPU::VCC_HI:
4981 case AMDGPU::M0:
4982 case AMDGPU::FLAT_SCR:
4983 return MO.getReg();
4984
4985 default:
4986 break;
4987 }
4988 }
4989
4990 return Register();
4991}
4992
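// Conservatively decide whether \p MI should be treated as reading EXEC.
// Lane-access pseudos and scalar-only instructions do not; most VALU and
// unknown generic instructions do.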
4993static bool shouldReadExec(const MachineInstr &MI) {
4994 if (SIInstrInfo::isVALU(MI)) {
4995 switch (MI.getOpcode()) {
4996 case AMDGPU::V_READLANE_B32:
4997 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4998 case AMDGPU::V_WRITELANE_B32:
4999 case AMDGPU::SI_SPILL_S32_TO_VGPR:
5000 return false;
5001 }
5002
5003 return true;
5004 }
5005
5006 if (MI.isPreISelOpcode() ||
5007 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
5008 SIInstrInfo::isSALU(MI) ||
5009 SIInstrInfo::isSMRD(MI))
5010 return false;
5011
5012 return true;
5013}
5014
5015static bool isRegOrFI(const MachineOperand &MO) {
5016 return MO.isReg() || MO.isFI();
5017}
5018
5019static bool isSubRegOf(const SIRegisterInfo &TRI,
5020 const MachineOperand &SuperVec,
5021 const MachineOperand &SubReg) {
5022 if (SubReg.getReg().isPhysical())
5023 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
5024
5025 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5026 SubReg.getReg() == SuperVec.getReg();
5027}
5028
5029// Verify the illegal copy from vector register to SGPR for generic opcode COPY
5030bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5031 const MachineRegisterInfo &MRI,
5032 StringRef &ErrInfo) const {
5033 Register DstReg = MI.getOperand(0).getReg();
5034 Register SrcReg = MI.getOperand(1).getReg();
5035 // This is a check for copy from vector register to SGPR
5036 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
5037 ErrInfo = "illegal copy from vector register to SGPR";
5038 return false;
5039 }
5040 return true;
5041}
5042
5043bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
5044 StringRef &ErrInfo) const {
5045 uint16_t Opcode = MI.getOpcode();
5046 const MachineFunction *MF = MI.getMF();
5047 const MachineRegisterInfo &MRI = MF->getRegInfo();
5048
5049 // FIXME: At this point the COPY verify is done only for non-ssa forms.
5050 // Find a better property to recognize the point where instruction selection
5051 // is just done.
5052 // We can only enforce this check after SIFixSGPRCopies pass so that the
5053 // illegal copies are legalized and thereafter we don't expect a pass
5054 // inserting similar copies.
5055 if (!MRI.isSSA() && MI.isCopy())
5056 return verifyCopy(MI, MRI, ErrInfo);
5057
5058 if (SIInstrInfo::isGenericOpcode(Opcode))
5059 return true;
5060
5061 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
5062 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
5063 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
5064 int Src3Idx = -1;
5065 if (Src0Idx == -1) {
5066 // VOPD V_DUAL_* instructions use different operand names.
5067 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
5068 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
5069 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
5070 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
5071 }
5072
5073 // Make sure the number of operands is correct.
5074 const MCInstrDesc &Desc = get(Opcode);
5075 if (!Desc.isVariadic() &&
5076 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5077 ErrInfo = "Instruction has wrong number of operands.";
5078 return false;
5079 }
5080
5081 if (MI.isInlineAsm()) {
5082 // Verify register classes for inlineasm constraints.
5083 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5084 I != E; ++I) {
5085 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
5086 if (!RC)
5087 continue;
5088
5089 const MachineOperand &Op = MI.getOperand(I);
5090 if (!Op.isReg())
5091 continue;
5092
5093 Register Reg = Op.getReg();
5094 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5095 ErrInfo = "inlineasm operand has incorrect register class.";
5096 return false;
5097 }
5098 }
5099
5100 return true;
5101 }
5102
5103 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5104 ErrInfo = "missing memory operand from image instruction.";
5105 return false;
5106 }
5107
5108 // Make sure the register classes are correct.
5109 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5110 const MachineOperand &MO = MI.getOperand(i);
5111 if (MO.isFPImm()) {
5112 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5113 "all fp values to integers.";
5114 return false;
5115 }
5116
5117 const MCOperandInfo &OpInfo = Desc.operands()[i];
5118 int16_t RegClass = getOpRegClassID(OpInfo);
5119
5120 switch (OpInfo.OperandType) {
5122 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5123 ErrInfo = "Illegal immediate value for operand.";
5124 return false;
5125 }
5126 break;
5139 break;
5141 break;
5142 break;
5156 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5157 ErrInfo = "Illegal immediate value for operand.";
5158 return false;
5159 }
5160 break;
5161 }
5163 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5164 ErrInfo = "Expected inline constant for operand.";
5165 return false;
5166 }
5167 break;
5171 break;
5176 // Check if this operand is an immediate.
5177 // FrameIndex operands will be replaced by immediates, so they are
5178 // allowed.
5179 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5180 ErrInfo = "Expected immediate, but got non-immediate";
5181 return false;
5182 }
5183 break;
5187 break;
5188 default:
5189 if (OpInfo.isGenericType())
5190 continue;
5191 break;
5192 }
5193
5194 if (!MO.isReg())
5195 continue;
5196 Register Reg = MO.getReg();
5197 if (!Reg)
5198 continue;
5199
5200 // FIXME: Ideally we would have separate instruction definitions with the
5201 // aligned register constraint.
5202 // FIXME: We do not verify inline asm operands, but custom inline asm
5203 // verification is broken anyway
5204 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
5205 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5206 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5207 if (const TargetRegisterClass *SubRC =
5208 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5209 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5210 if (RC)
5211 RC = SubRC;
5212 }
5213 }
5214
5215 // Check that this is the aligned version of the class.
5216 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5217 ErrInfo = "Subtarget requires even aligned vector registers";
5218 return false;
5219 }
5220 }
5221
5222 if (RegClass != -1) {
5223 if (Reg.isVirtual())
5224 continue;
5225
5226 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5227 if (!RC->contains(Reg)) {
5228 ErrInfo = "Operand has incorrect register class.";
5229 return false;
5230 }
5231 }
5232 }
5233
5234 // Verify SDWA
5235 if (isSDWA(MI)) {
5236 if (!ST.hasSDWA()) {
5237 ErrInfo = "SDWA is not supported on this target";
5238 return false;
5239 }
5240
5241 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5242 AMDGPU::OpName::dst_sel}) {
5243 const MachineOperand *MO = getNamedOperand(MI, Op);
5244 if (!MO)
5245 continue;
5246 int64_t Imm = MO->getImm();
5247 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5248 ErrInfo = "Invalid SDWA selection";
5249 return false;
5250 }
5251 }
5252
5253 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5254
5255 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5256 if (OpIdx == -1)
5257 continue;
5258 const MachineOperand &MO = MI.getOperand(OpIdx);
5259
5260 if (!ST.hasSDWAScalar()) {
5261 // Only VGPRs on VI
5262 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5263 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5264 return false;
5265 }
5266 } else {
5267 // No immediates on GFX9
5268 if (!MO.isReg()) {
5269 ErrInfo =
5270 "Only reg allowed as operands in SDWA instructions on GFX9+";
5271 return false;
5272 }
5273 }
5274 }
5275
5276 if (!ST.hasSDWAOmod()) {
5277 // No omod allowed on VI
5278 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5279 if (OMod != nullptr &&
5280 (!OMod->isImm() || OMod->getImm() != 0)) {
5281 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5282 return false;
5283 }
5284 }
5285
5286 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5287 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5288 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5289 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5290 const MachineOperand *Src0ModsMO =
5291 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5292 unsigned Mods = Src0ModsMO->getImm();
5293 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5294 Mods & SISrcMods::SEXT) {
5295 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5296 return false;
5297 }
5298 }
5299
5300 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5301 if (isVOPC(BasicOpcode)) {
5302 if (!ST.hasSDWASdst() && DstIdx != -1) {
5303 // Only vcc allowed as dst on VI for VOPC
5304 const MachineOperand &Dst = MI.getOperand(DstIdx);
5305 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5306 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5307 return false;
5308 }
5309 } else if (!ST.hasSDWAOutModsVOPC()) {
5310 // No clamp allowed on GFX9 for VOPC
5311 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5312 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5313 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5314 return false;
5315 }
5316
5317 // No omod allowed on GFX9 for VOPC
5318 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5319 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5320 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5321 return false;
5322 }
5323 }
5324 }
5325
5326 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5327 if (DstUnused && DstUnused->isImm() &&
5328 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5329 const MachineOperand &Dst = MI.getOperand(DstIdx);
5330 if (!Dst.isReg() || !Dst.isTied()) {
5331 ErrInfo = "Dst register should have tied register";
5332 return false;
5333 }
5334
5335 const MachineOperand &TiedMO =
5336 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5337 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5338 ErrInfo =
5339 "Dst register should be tied to implicit use of preserved register";
5340 return false;
5341 }
5342 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5343 ErrInfo = "Dst register should use same physical register as preserved";
5344 return false;
5345 }
5346 }
5347 }
5348
5349 // Verify MIMG / VIMAGE / VSAMPLE
5350 if (isImage(Opcode) && !MI.mayStore()) {
5351 // Ensure that the return type used is large enough for all the options
5352 // being used. TFE/LWE require an extra result register.
5353 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5354 if (DMask) {
5355 uint64_t DMaskImm = DMask->getImm();
5356 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5357 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5358 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5359 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5360
5361 // Adjust for packed 16 bit values
5362 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5363 RegCount = divideCeil(RegCount, 2);
5364
5365 // Adjust if using LWE or TFE
5366 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5367 RegCount += 1;
5368
5369 const uint32_t DstIdx =
5370 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5371 const MachineOperand &Dst = MI.getOperand(DstIdx);
5372 if (Dst.isReg()) {
5373 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5374 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5375 if (RegCount > DstSize) {
5376 ErrInfo = "Image instruction returns too many registers for dst "
5377 "register class";
5378 return false;
5379 }
5380 }
5381 }
5382 }
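// Editor's note (illustrative sketch, not part of SIInstrInfo.cpp; the helper
// name and parameters below are hypothetical): the dmask-based size check
// above reduces to simple arithmetic over the packing and TFE/LWE flags.
static unsigned expectedImageDstRegs(unsigned DMaskImm, bool IsGather4,
                                     bool PackedD16, bool TFEorLWE) {
  // Gather4 always returns four components; otherwise one per dmask bit.
  unsigned Regs = IsGather4 ? 4 : llvm::popcount(DMaskImm);
  // Packed D16 stores two 16-bit components per 32-bit register.
  if (PackedD16)
    Regs = (Regs + 1) / 2; // divideCeil(Regs, 2)
  // TFE/LWE append one extra status register.
  return Regs + (TFEorLWE ? 1 : 0);
}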
5383
5384 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5385 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5386 unsigned ConstantBusCount = 0;
5387 bool UsesLiteral = false;
5388 const MachineOperand *LiteralVal = nullptr;
5389
5390 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5391 if (ImmIdx != -1) {
5392 ++ConstantBusCount;
5393 UsesLiteral = true;
5394 LiteralVal = &MI.getOperand(ImmIdx);
5395 }
5396
5397 SmallVector<Register, 2> SGPRsUsed;
5398 Register SGPRUsed;
5399
5400 // Only look at the true operands. Only a real operand can use the constant
5401 // bus, and we don't want to check pseudo-operands like the source modifier
5402 // flags.
5403 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5404 if (OpIdx == -1)
5405 continue;
5406 const MachineOperand &MO = MI.getOperand(OpIdx);
5407 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5408 if (MO.isReg()) {
5409 SGPRUsed = MO.getReg();
5410 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5411 ++ConstantBusCount;
5412 SGPRsUsed.push_back(SGPRUsed);
5413 }
5414 } else if (!MO.isFI()) { // Treat FI like a register.
5415 if (!UsesLiteral) {
5416 ++ConstantBusCount;
5417 UsesLiteral = true;
5418 LiteralVal = &MO;
5419 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5420 assert(isVOP2(MI) || isVOP3(MI));
5421 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5422 return false;
5423 }
5424 }
5425 }
5426 }
5427
5428 SGPRUsed = findImplicitSGPRRead(MI);
5429 if (SGPRUsed) {
5430 // Implicit uses may safely overlap true operands
5431 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5432 return !RI.regsOverlap(SGPRUsed, SGPR);
5433 })) {
5434 ++ConstantBusCount;
5435 SGPRsUsed.push_back(SGPRUsed);
5436 }
5437 }
5438
5439 // v_writelane_b32 is an exception to the constant bus restriction:
5440 // vsrc0 can be an SGPR, constant or m0, and the lane select an SGPR, m0 or inline constant.
5441 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5442 Opcode != AMDGPU::V_WRITELANE_B32) {
5443 ErrInfo = "VOP* instruction violates constant bus restriction";
5444 return false;
5445 }
5446
5447 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5448 ErrInfo = "VOP3 instruction uses literal";
5449 return false;
5450 }
5451 }
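// Editor's note (illustrative sketch, not part of SIInstrInfo.cpp; the helper
// name is hypothetical and assumes the usual LLVM ADTs are available): the
// accounting above boils down to "each distinct SGPR costs one constant-bus
// slot, the single literal costs one slot, and a second distinct literal is
// never allowed".
static bool fitsConstantBus(const SmallVectorImpl<unsigned> &SrcSGPRs,
                            const SmallVectorImpl<uint64_t> &SrcLiterals,
                            unsigned BusLimit) {
  SmallVector<unsigned, 4> UniqueSGPRs;
  for (unsigned SGPR : SrcSGPRs)
    if (!llvm::is_contained(UniqueSGPRs, SGPR))
      UniqueSGPRs.push_back(SGPR);

  SmallVector<uint64_t, 2> UniqueLiterals;
  for (uint64_t Lit : SrcLiterals)
    if (!llvm::is_contained(UniqueLiterals, Lit))
      UniqueLiterals.push_back(Lit);

  // More than one distinct literal is illegal regardless of the bus limit.
  if (UniqueLiterals.size() > 1)
    return false;
  return UniqueSGPRs.size() + UniqueLiterals.size() <= BusLimit;
}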
5452
5453 // Special case for writelane - this can break the multiple constant bus rule,
5454 // but still can't use more than one SGPR register
5455 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5456 unsigned SGPRCount = 0;
5457 Register SGPRUsed;
5458
5459 for (int OpIdx : {Src0Idx, Src1Idx}) {
5460 if (OpIdx == -1)
5461 break;
5462
5463 const MachineOperand &MO = MI.getOperand(OpIdx);
5464
5465 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5466 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5467 if (MO.getReg() != SGPRUsed)
5468 ++SGPRCount;
5469 SGPRUsed = MO.getReg();
5470 }
5471 }
5472 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5473 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5474 return false;
5475 }
5476 }
5477 }
5478
5479 // Verify misc. restrictions on specific instructions.
5480 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5481 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5482 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5483 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5484 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5485 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5486 if (!compareMachineOp(Src0, Src1) &&
5487 !compareMachineOp(Src0, Src2)) {
5488 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5489 return false;
5490 }
5491 }
5492 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5493 SISrcMods::ABS) ||
5494 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5495 SISrcMods::ABS) ||
5496 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5497 SISrcMods::ABS)) {
5498 ErrInfo = "ABS not allowed in VOP3B instructions";
5499 return false;
5500 }
5501 }
5502
5503 if (isSOP2(MI) || isSOPC(MI)) {
5504 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5505 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5506
5507 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5508 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5509 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5510 !Src0.isIdenticalTo(Src1)) {
5511 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5512 return false;
5513 }
5514 }
5515
5516 if (isSOPK(MI)) {
5517 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5518 if (Desc.isBranch()) {
5519 if (!Op->isMBB()) {
5520 ErrInfo = "invalid branch target for SOPK instruction";
5521 return false;
5522 }
5523 } else {
5524 uint64_t Imm = Op->getImm();
5525 if (sopkIsZext(Opcode)) {
5526 if (!isUInt<16>(Imm)) {
5527 ErrInfo = "invalid immediate for SOPK instruction";
5528 return false;
5529 }
5530 } else {
5531 if (!isInt<16>(Imm)) {
5532 ErrInfo = "invalid immediate for SOPK instruction";
5533 return false;
5534 }
5535 }
5536 }
5537 }
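// Editor's note: sopkIsZext distinguishes SOPK opcodes whose 16-bit immediate
// is zero-extended (e.g. the unsigned S_CMPK_*_U32 forms, accepting 0..65535)
// from those that sign-extend it (-32768..32767), which is why the check above
// splits into isUInt<16> and isInt<16>.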
5538
5539 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5540 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5541 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5542 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5543 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5544 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5545
5546 const unsigned StaticNumOps =
5547 Desc.getNumOperands() + Desc.implicit_uses().size();
5548 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5549
5550 // Require additional implicit operands. This allows a fixup done by the
5551 // post RA scheduler where the main implicit operand is killed and
5552 // implicit-defs are added for sub-registers that remain live after this
5553 // instruction.
5554 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5555 ErrInfo = "missing implicit register operands";
5556 return false;
5557 }
5558
5559 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5560 if (IsDst) {
5561 if (!Dst->isUse()) {
5562 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5563 return false;
5564 }
5565
5566 unsigned UseOpIdx;
5567 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5568 UseOpIdx != StaticNumOps + 1) {
5569 ErrInfo = "movrel implicit operands should be tied";
5570 return false;
5571 }
5572 }
5573
5574 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5575 const MachineOperand &ImpUse
5576 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5577 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5578 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5579 ErrInfo = "src0 should be subreg of implicit vector use";
5580 return false;
5581 }
5582 }
5583
5584 // Make sure we aren't losing exec uses in the td files. This mostly requires
5585 // being careful when using let Uses to add other use registers.
5586 if (shouldReadExec(MI)) {
5587 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5588 ErrInfo = "VALU instruction does not implicitly read exec mask";
5589 return false;
5590 }
5591 }
5592
5593 if (isSMRD(MI)) {
5594 if (MI.mayStore() &&
5595 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5596 // The register offset form of scalar stores may only use m0 as the
5597 // soffset register.
5598 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5599 if (Soff && Soff->getReg() != AMDGPU::M0) {
5600 ErrInfo = "scalar stores must use m0 as offset register";
5601 return false;
5602 }
5603 }
5604 }
5605
5606 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5607 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5608 if (Offset->getImm() != 0) {
5609 ErrInfo = "subtarget does not support offsets in flat instructions";
5610 return false;
5611 }
5612 }
5613
5614 if (isDS(MI) && !ST.hasGDS()) {
5615 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5616 if (GDSOp && GDSOp->getImm() != 0) {
5617 ErrInfo = "GDS is not supported on this subtarget";
5618 return false;
5619 }
5620 }
5621
5622 if (isImage(MI)) {
5623 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5624 if (DimOp) {
5625 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5626 AMDGPU::OpName::vaddr0);
5627 AMDGPU::OpName RSrcOpName =
5628 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5629 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5630 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5631 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5632 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5633 const AMDGPU::MIMGDimInfo *Dim =
5634 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5635
5636 if (!Dim) {
5637 ErrInfo = "dim is out of range";
5638 return false;
5639 }
5640
5641 bool IsA16 = false;
5642 if (ST.hasR128A16()) {
5643 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5644 IsA16 = R128A16->getImm() != 0;
5645 } else if (ST.hasA16()) {
5646 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5647 IsA16 = A16->getImm() != 0;
5648 }
5649
5650 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5651
5652 unsigned AddrWords =
5653 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5654
5655 unsigned VAddrWords;
5656 if (IsNSA) {
5657 VAddrWords = RsrcIdx - VAddr0Idx;
5658 if (ST.hasPartialNSAEncoding() &&
5659 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5660 unsigned LastVAddrIdx = RsrcIdx - 1;
5661 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5662 }
5663 } else {
5664 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5665 if (AddrWords > 12)
5666 AddrWords = 16;
5667 }
5668
5669 if (VAddrWords != AddrWords) {
5670 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5671 << " but got " << VAddrWords << "\n");
5672 ErrInfo = "bad vaddr size";
5673 return false;
5674 }
5675 }
5676 }
5677
5678 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5679 if (DppCt) {
5680 using namespace AMDGPU::DPP;
5681
5682 unsigned DC = DppCt->getImm();
5683 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5684 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5685 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5686 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5687 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5688 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5689 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5690 ErrInfo = "Invalid dpp_ctrl value";
5691 return false;
5692 }
5693 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5694 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5695 ErrInfo = "Invalid dpp_ctrl value: "
5696 "wavefront shifts are not supported on GFX10+";
5697 return false;
5698 }
5699 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5700 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5701 ErrInfo = "Invalid dpp_ctrl value: "
5702 "broadcasts are not supported on GFX10+";
5703 return false;
5704 }
5705 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5706 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5707 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5708 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5709 !ST.hasGFX90AInsts()) {
5710 ErrInfo = "Invalid dpp_ctrl value: "
5711 "row_newbroadcast/row_share is not supported before "
5712 "GFX90A/GFX10";
5713 return false;
5714 }
5715 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5716 ErrInfo = "Invalid dpp_ctrl value: "
5717 "row_share and row_xmask are not supported before GFX10";
5718 return false;
5719 }
5720 }
5721
5722 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5724 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5725 ErrInfo = "Invalid dpp_ctrl value: "
5726 "DP ALU dpp only supports row_newbcast";
5727 return false;
5728 }
5729 }
5730
5731 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5732 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5733 AMDGPU::OpName DataName =
5734 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5735 const MachineOperand *Data = getNamedOperand(MI, DataName);
5736 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5737 if (Data && !Data->isReg())
5738 Data = nullptr;
5739
5740 if (ST.hasGFX90AInsts()) {
5741 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5742 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5743 ErrInfo = "Invalid register class: "
5744 "vdata and vdst should be both VGPR or AGPR";
5745 return false;
5746 }
5747 if (Data && Data2 &&
5748 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5749 ErrInfo = "Invalid register class: "
5750 "both data operands should be VGPR or AGPR";
5751 return false;
5752 }
5753 } else {
5754 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5755 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5756 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5757 ErrInfo = "Invalid register class: "
5758 "agpr loads and stores not supported on this GPU";
5759 return false;
5760 }
5761 }
5762 }
5763
5764 if (ST.needsAlignedVGPRs()) {
5765 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5766 const MachineOperand *Op = getNamedOperand(MI, OpName);
5767 if (!Op)
5768 return true;
5769 Register Reg = Op->getReg();
5770 if (Reg.isPhysical())
5771 return !(RI.getHWRegIndex(Reg) & 1);
5772 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5773 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5774 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5775 };
5776
5777 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5778 Opcode == AMDGPU::DS_GWS_BARRIER) {
5779
5780 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5781 ErrInfo = "Subtarget requires even aligned vector registers "
5782 "for DS_GWS instructions";
5783 return false;
5784 }
5785 }
5786
5787 if (isMIMG(MI)) {
5788 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5789 ErrInfo = "Subtarget requires even aligned vector registers "
5790 "for vaddr operand of image instructions";
5791 return false;
5792 }
5793 }
5794 }
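// Editor's note: "even aligned" above means the value starts at an even 32-bit
// register index, e.g. v[2:3] passes the check while v[3:4] does not; for
// virtual registers the class itself must be a properly aligned variant and
// the accessed channel must be even.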
5795
5796 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5797 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5798 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5799 ErrInfo = "Invalid register class: "
5800 "v_accvgpr_write with an SGPR is not supported on this GPU";
5801 return false;
5802 }
5803 }
5804
5805 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5806 const MachineOperand &SrcOp = MI.getOperand(1);
5807 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5808 ErrInfo = "pseudo expects only physical SGPRs";
5809 return false;
5810 }
5811 }
5812
5813 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5814 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5815 if (!ST.hasScaleOffset()) {
5816 ErrInfo = "Subtarget does not support offset scaling";
5817 return false;
5818 }
5819 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5820 ErrInfo = "Instruction does not support offset scaling";
5821 return false;
5822 }
5823 }
5824 }
5825
5826 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5827 // information.
5828 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5829 for (unsigned I = 0; I < 3; ++I) {
5831 return false;
5832 }
5833 }
5834
5835 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
5836 MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) {
5837 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
5838 if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
5839 &AMDGPU::SReg_64RegClass) ||
5840 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
5841 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
5842 return false;
5843 }
5844 }
5845
5846 return true;
5847}
5848
5849// It is more readable to list mapped opcodes on the same line.
5850// clang-format off
5851
5852unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5853 switch (MI.getOpcode()) {
5854 default: return AMDGPU::INSTRUCTION_LIST_END;
5855 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5856 case AMDGPU::COPY: return AMDGPU::COPY;
5857 case AMDGPU::PHI: return AMDGPU::PHI;
5858 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5859 case AMDGPU::WQM: return AMDGPU::WQM;
5860 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5861 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5862 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5863 case AMDGPU::S_MOV_B32: {
5864 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
5865 return MI.getOperand(1).isReg() ||
5866 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5867 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5868 }
5869 case AMDGPU::S_ADD_I32:
5870 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5871 case AMDGPU::S_ADDC_U32:
5872 return AMDGPU::V_ADDC_U32_e32;
5873 case AMDGPU::S_SUB_I32:
5874 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5875 // FIXME: These are not consistently handled, and selected when the carry is
5876 // used.
5877 case AMDGPU::S_ADD_U32:
5878 return AMDGPU::V_ADD_CO_U32_e32;
5879 case AMDGPU::S_SUB_U32:
5880 return AMDGPU::V_SUB_CO_U32_e32;
5881 case AMDGPU::S_ADD_U64_PSEUDO:
5882 return AMDGPU::V_ADD_U64_PSEUDO;
5883 case AMDGPU::S_SUB_U64_PSEUDO:
5884 return AMDGPU::V_SUB_U64_PSEUDO;
5885 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5886 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5887 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5888 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5889 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5890 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5891 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5892 case AMDGPU::S_XNOR_B32:
5893 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5894 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5895 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5896 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5897 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5898 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5899 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5900 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5901 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5902 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5903 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5904 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5905 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5906 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5907 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5908 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5909 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5910 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5911 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5912 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5913 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5914 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5915 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5916 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5917 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5918 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5919 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5920 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5921 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5922 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5923 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5924 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5925 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5926 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5927 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5928 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5929 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5930 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5931 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5932 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5933 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5934 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5935 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5936 case AMDGPU::S_CVT_F32_F16:
5937 case AMDGPU::S_CVT_HI_F32_F16:
5938 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5939 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5940 case AMDGPU::S_CVT_F16_F32:
5941 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5942 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5943 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5944 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5945 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5946 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5947 case AMDGPU::S_CEIL_F16:
5948 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5949 : AMDGPU::V_CEIL_F16_fake16_e64;
5950 case AMDGPU::S_FLOOR_F16:
5951 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5952 : AMDGPU::V_FLOOR_F16_fake16_e64;
5953 case AMDGPU::S_TRUNC_F16:
5954 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
5955 : AMDGPU::V_TRUNC_F16_fake16_e64;
5956 case AMDGPU::S_RNDNE_F16:
5957 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
5958 : AMDGPU::V_RNDNE_F16_fake16_e64;
5959 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5960 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5961 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5962 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5963 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5964 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5965 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5966 case AMDGPU::S_ADD_F16:
5967 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
5968 : AMDGPU::V_ADD_F16_fake16_e64;
5969 case AMDGPU::S_SUB_F16:
5970 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
5971 : AMDGPU::V_SUB_F16_fake16_e64;
5972 case AMDGPU::S_MIN_F16:
5973 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
5974 : AMDGPU::V_MIN_F16_fake16_e64;
5975 case AMDGPU::S_MAX_F16:
5976 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
5977 : AMDGPU::V_MAX_F16_fake16_e64;
5978 case AMDGPU::S_MINIMUM_F16:
5979 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
5980 : AMDGPU::V_MINIMUM_F16_fake16_e64;
5981 case AMDGPU::S_MAXIMUM_F16:
5982 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
5983 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
5984 case AMDGPU::S_MUL_F16:
5985 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
5986 : AMDGPU::V_MUL_F16_fake16_e64;
5987 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5988 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5989 case AMDGPU::S_FMAC_F16:
5990 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
5991 : AMDGPU::V_FMAC_F16_fake16_e64;
5992 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5993 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5994 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5995 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5996 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5997 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5998 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5999 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
6000 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
6001 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
6002 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
6003 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
6004 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
6005 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
6006 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
6007 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
6008 case AMDGPU::S_CMP_LT_F16:
6009 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
6010 : AMDGPU::V_CMP_LT_F16_fake16_e64;
6011 case AMDGPU::S_CMP_EQ_F16:
6012 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
6013 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
6014 case AMDGPU::S_CMP_LE_F16:
6015 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
6016 : AMDGPU::V_CMP_LE_F16_fake16_e64;
6017 case AMDGPU::S_CMP_GT_F16:
6018 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
6019 : AMDGPU::V_CMP_GT_F16_fake16_e64;
6020 case AMDGPU::S_CMP_LG_F16:
6021 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
6022 : AMDGPU::V_CMP_LG_F16_fake16_e64;
6023 case AMDGPU::S_CMP_GE_F16:
6024 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
6025 : AMDGPU::V_CMP_GE_F16_fake16_e64;
6026 case AMDGPU::S_CMP_O_F16:
6027 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
6028 : AMDGPU::V_CMP_O_F16_fake16_e64;
6029 case AMDGPU::S_CMP_U_F16:
6030 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
6031 : AMDGPU::V_CMP_U_F16_fake16_e64;
6032 case AMDGPU::S_CMP_NGE_F16:
6033 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
6034 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
6035 case AMDGPU::S_CMP_NLG_F16:
6036 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
6037 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
6038 case AMDGPU::S_CMP_NGT_F16:
6039 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
6040 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6041 case AMDGPU::S_CMP_NLE_F16:
6042 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6043 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6044 case AMDGPU::S_CMP_NEQ_F16:
6045 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6046 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6047 case AMDGPU::S_CMP_NLT_F16:
6048 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6049 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
6050 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6051 case AMDGPU::V_S_EXP_F16_e64:
6052 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6053 : AMDGPU::V_EXP_F16_fake16_e64;
6054 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6055 case AMDGPU::V_S_LOG_F16_e64:
6056 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6057 : AMDGPU::V_LOG_F16_fake16_e64;
6058 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6059 case AMDGPU::V_S_RCP_F16_e64:
6060 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6061 : AMDGPU::V_RCP_F16_fake16_e64;
6062 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6063 case AMDGPU::V_S_RSQ_F16_e64:
6064 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6065 : AMDGPU::V_RSQ_F16_fake16_e64;
6066 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6067 case AMDGPU::V_S_SQRT_F16_e64:
6068 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6069 : AMDGPU::V_SQRT_F16_fake16_e64;
6070 }
6071 llvm_unreachable(
6072 "Unexpected scalar opcode without corresponding vector one!");
6073}
6074
6075// clang-format on
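// Editor's note (usage sketch, hypothetical surrounding code): callers
// typically query this mapping and treat INSTRUCTION_LIST_END as "no direct
// VALU equivalent", e.g.
//   unsigned NewOpc = TII->getVALUOp(MI);
//   if (NewOpc == AMDGPU::INSTRUCTION_LIST_END) {
//     // No one-to-one vector opcode; handled by a dedicated lowering path.
//   }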
6076
6077void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
6078 MachineBasicBlock &MBB,
6079 MachineBasicBlock::iterator MBBI,
6080 const DebugLoc &DL, Register Reg,
6081 bool IsSCCLive,
6082 SlotIndexes *Indexes) const {
6083 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6084 const SIInstrInfo *TII = ST.getInstrInfo();
6086 if (IsSCCLive) {
6087 // Insert two move instructions, one to save the original value of EXEC and
6088 // the other to turn on all bits in EXEC. This is required because we can't
6089 // use the single S_OR_SAVEEXEC instruction, which clobbers SCC.
6090 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
6091 .addReg(LMC.ExecReg);
6092 auto FlipExecMI =
6093 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
6094 if (Indexes) {
6095 Indexes->insertMachineInstrInMaps(*StoreExecMI);
6096 Indexes->insertMachineInstrInMaps(*FlipExecMI);
6097 }
6098 } else {
6099 auto SaveExec =
6100 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
6101 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
6102 if (Indexes)
6103 Indexes->insertMachineInstrInMaps(*SaveExec);
6104 }
6105}
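// Editor's note (illustrative only): on a wave64 target the two paths above
// correspond roughly to these sequences, with sREG the scratch SGPR pair:
//   SCC live:  s_mov_b64 sREG, exec
//              s_mov_b64 exec, -1
//   SCC dead:  s_or_saveexec_b64 sREG, -1   ; one instruction, clobbers SCC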
6106
6107void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
6108 MachineBasicBlock::iterator MBBI,
6109 const DebugLoc &DL, Register Reg,
6110 SlotIndexes *Indexes) const {
6112 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6113 .addReg(Reg, RegState::Kill);
6114 if (Indexes)
6115 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6116}
6117
6121 "Not a whole wave func");
6122 MachineBasicBlock &MBB = *MF.begin();
6123 for (MachineInstr &MI : MBB)
6124 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6125 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6126 return &MI;
6127
6128 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6129}
6130
6131const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6132 unsigned OpNo) const {
6133 const MCInstrDesc &Desc = get(MI.getOpcode());
6134 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6135 Desc.operands()[OpNo].RegClass == -1) {
6136 Register Reg = MI.getOperand(OpNo).getReg();
6137
6138 if (Reg.isVirtual()) {
6139 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6140 return MRI.getRegClass(Reg);
6141 }
6142 return RI.getPhysRegBaseClass(Reg);
6143 }
6144
6145 int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
6146 return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
6147}
6148
6151 MachineBasicBlock *MBB = MI.getParent();
6152 MachineOperand &MO = MI.getOperand(OpIdx);
6153 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6154 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6155 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6156 unsigned Size = RI.getRegSizeInBits(*RC);
6157 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6158 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6159 : AMDGPU::V_MOV_B32_e32;
6160 if (MO.isReg())
6161 Opcode = AMDGPU::COPY;
6162 else if (RI.isSGPRClass(RC))
6163 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6164
6165 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6166 Register Reg = MRI.createVirtualRegister(VRC);
6167 DebugLoc DL = MBB->findDebugLoc(I);
6168 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6169 MO.ChangeToRegister(Reg, false);
6170}
6171
6174 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6175 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6176 if (!SuperReg.getReg().isVirtual())
6177 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6178
6179 MachineBasicBlock *MBB = MI->getParent();
6180 const DebugLoc &DL = MI->getDebugLoc();
6181 Register SubReg = MRI.createVirtualRegister(SubRC);
6182
6183 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6184 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6185 .addReg(SuperReg.getReg(), 0, NewSubIdx);
6186 return SubReg;
6187}
6188
6191 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6192 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6193 if (Op.isImm()) {
6194 if (SubIdx == AMDGPU::sub0)
6195 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6196 if (SubIdx == AMDGPU::sub1)
6197 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6198
6199 llvm_unreachable("Unhandled register index for immediate");
6200 }
6201
6202 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6203 SubIdx, SubRC);
6204 return MachineOperand::CreateReg(SubReg, false);
6205}
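// Editor's note (illustrative sketch, not part of SIInstrInfo.cpp): the
// immediate path above simply splits a 64-bit value into its 32-bit halves.
// Standalone equivalents of the sub0/sub1 cases:
static int32_t extractSub0(int64_t Imm) { return static_cast<int32_t>(Imm); }
static int32_t extractSub1(int64_t Imm) {
  // High half; the arithmetic shift matches the cast-to-int32_t above.
  return static_cast<int32_t>(Imm >> 32);
}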
6206
6207// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6208void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6209 assert(Inst.getNumExplicitOperands() == 3);
6210 MachineOperand Op1 = Inst.getOperand(1);
6211 Inst.removeOperand(1);
6212 Inst.addOperand(Op1);
6213}
6214
6216 const MCOperandInfo &OpInfo,
6217 const MachineOperand &MO) const {
6218 if (!MO.isReg())
6219 return false;
6220
6221 Register Reg = MO.getReg();
6222
6223 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6224 if (Reg.isPhysical())
6225 return DRC->contains(Reg);
6226
6227 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6228
6229 if (MO.getSubReg()) {
6230 const MachineFunction *MF = MO.getParent()->getMF();
6231 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6232 if (!SuperRC)
6233 return false;
6234 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6235 }
6236
6237 return RI.getCommonSubClass(DRC, RC) != nullptr;
6238}
6239
6241 const MachineOperand &MO) const {
6242 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6243 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6244 unsigned Opc = MI.getOpcode();
6245
6246 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6247 // information.
6248 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6249 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6250 constexpr AMDGPU::OpName OpNames[] = {
6251 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6252
6253 for (auto [I, OpName] : enumerate(OpNames)) {
6254 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6255 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6257 return false;
6258 }
6259 }
6260
6261 if (!isLegalRegOperand(MRI, OpInfo, MO))
6262 return false;
6263
6264 // Check accumulator GPR (AGPR) operand.
6265 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6266 if (IsAGPR && !ST.hasMAIInsts())
6267 return false;
6268 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6269 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6270 return false;
6271 // Atomics should have vdst and vdata both in VGPRs or both in AGPRs.
6272 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6273 const int DataIdx = AMDGPU::getNamedOperandIdx(
6274 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6275 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6276 MI.getOperand(DataIdx).isReg() &&
6277 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6278 return false;
6279 if ((int)OpIdx == DataIdx) {
6280 if (VDstIdx != -1 &&
6281 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6282 return false;
6283 // DS instructions with 2 data operands must also have matching AGPR/VGPR classes.
6284 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6285 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6286 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6287 return false;
6288 }
6289
6290 // Check V_ACCVGPR_WRITE_B32_e64
6291 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6292 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6293 RI.isSGPRReg(MRI, MO.getReg()))
6294 return false;
6295
6296 if (ST.hasFlatScratchHiInB64InstHazard() &&
6297 MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
6298 if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
6299 if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
6300 64)
6301 return false;
6302 }
6303 if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
6304 return false;
6305 }
6306
6307 return true;
6308}
6309
6311 const MCOperandInfo &OpInfo,
6312 const MachineOperand &MO) const {
6313 if (MO.isReg())
6314 return isLegalRegOperand(MRI, OpInfo, MO);
6315
6316 // Handle non-register types that are treated like immediates.
6317 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6318 return true;
6319}
6320
6322 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6323 const MachineOperand *MO) const {
6324 constexpr unsigned NumOps = 3;
6325 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6326 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6327 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6328 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6329
6330 assert(SrcN < NumOps);
6331
6332 if (!MO) {
6333 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6334 if (SrcIdx == -1)
6335 return true;
6336 MO = &MI.getOperand(SrcIdx);
6337 }
6338
6339 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6340 return true;
6341
6342 int ModsIdx =
6343 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6344 if (ModsIdx == -1)
6345 return true;
6346
6347 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6348 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6349 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6350
6351 return !OpSel && !OpSelHi;
6352}
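// Editor's note: in short, on GFX12+ a packed-FP32 instruction may use an SGPR
// source only when neither OP_SEL_0 nor OP_SEL_1 is set for that source;
// otherwise the operand has to be moved into a VGPR during legalization.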
6353
6355 const MachineOperand *MO) const {
6356 const MachineFunction &MF = *MI.getMF();
6357 const MachineRegisterInfo &MRI = MF.getRegInfo();
6358 const MCInstrDesc &InstDesc = MI.getDesc();
6359 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6360 int64_t RegClass = getOpRegClassID(OpInfo);
6361 const TargetRegisterClass *DefinedRC =
6362 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6363 if (!MO)
6364 MO = &MI.getOperand(OpIdx);
6365
6366 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6367
6368 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6369 const MachineOperand *UsedLiteral = nullptr;
6370
6371 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6372 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6373
6374 // TODO: Be more permissive with frame indexes.
6375 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6376 if (!LiteralLimit--)
6377 return false;
6378
6379 UsedLiteral = MO;
6380 }
6381
6383 if (MO->isReg())
6384 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6385
6386 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6387 if (i == OpIdx)
6388 continue;
6389 const MachineOperand &Op = MI.getOperand(i);
6390 if (Op.isReg()) {
6391 if (Op.isUse()) {
6392 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6393 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6394 if (--ConstantBusLimit <= 0)
6395 return false;
6396 }
6397 }
6398 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6399 !isInlineConstant(Op, InstDesc.operands()[i])) {
6400 // The same literal may be used multiple times.
6401 if (!UsedLiteral)
6402 UsedLiteral = &Op;
6403 else if (UsedLiteral->isIdenticalTo(Op))
6404 continue;
6405
6406 if (!LiteralLimit--)
6407 return false;
6408 if (--ConstantBusLimit <= 0)
6409 return false;
6410 }
6411 }
6412 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6413 // There can be at most one literal operand, but it can be repeated.
6414 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6415 if (i == OpIdx)
6416 continue;
6417 const MachineOperand &Op = MI.getOperand(i);
6418 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6419 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6420 !Op.isIdenticalTo(*MO))
6421 return false;
6422
6423 // Do not fold a non-inlineable and non-register operand into an
6424 // instruction that already has a frame index. The frame index handling
6425 // code cannot cope with a frame index co-existing with another
6426 // non-register operand, unless that operand is an inlineable immediate.
6427 if (Op.isFI())
6428 return false;
6429 }
6430 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6431 isF16PseudoScalarTrans(MI.getOpcode())) {
6432 return false;
6433 }
6434
6435 if (MO->isReg()) {
6436 if (!DefinedRC)
6437 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6438 return isLegalRegOperand(MI, OpIdx, *MO);
6439 }
6440
6441 if (MO->isImm()) {
6442 uint64_t Imm = MO->getImm();
6443 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6444 bool Is64BitOp = Is64BitFPOp ||
6445 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6446 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6447 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6448 if (Is64BitOp &&
6449 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6450 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6451 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6452 return false;
6453
6454 // FIXME: We can use sign extended 64-bit literals, but only for signed
6455 // operands. At the moment we do not know if an operand is signed.
6456 // Such an operand will be encoded as its low 32 bits and then either
6457 // correctly sign extended or incorrectly zero extended by HW.
6458 // If 64-bit literals are supported and the literal will be encoded
6459 // as a full 64 bits, we can still use it.
6460 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6461 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6462 return false;
6463 }
6464 }
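// Editor's note (worked example of the 32-bit literal rule above, assuming the
// usual isValid32BitLiteral semantics): for a 64-bit integer operand the
// literal must fit in the low 32 bits, so 0x12345678 is encodable while
// 0x100000000 is not; for a 64-bit FP operand only the high 32 bits are
// encoded, so the low half must be zero, e.g. 0x3FF0000000000000 (1.0) is
// encodable but 0x3FF0000000000001 is not.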
6465
6466 // Handle non-register types that are treated like immediates.
6467 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6468
6469 if (!DefinedRC) {
6470 // This operand expects an immediate.
6471 return true;
6472 }
6473
6474 return isImmOperandLegal(MI, OpIdx, *MO);
6475}
6476
6478 bool IsGFX950Only = ST.hasGFX950Insts();
6479 bool IsGFX940Only = ST.hasGFX940Insts();
6480
6481 if (!IsGFX950Only && !IsGFX940Only)
6482 return false;
6483
6484 if (!isVALU(MI))
6485 return false;
6486
6487 // V_COS, V_EXP, V_RCP, etc.
6488 if (isTRANS(MI))
6489 return true;
6490
6491 // DOT2, DOT2C, DOT4, etc.
6492 if (isDOT(MI))
6493 return true;
6494
6495 // MFMA, SMFMA
6496 if (isMFMA(MI))
6497 return true;
6498
6499 unsigned Opcode = MI.getOpcode();
6500 switch (Opcode) {
6501 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6502 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6503 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6504 case AMDGPU::V_MQSAD_U32_U8_e64:
6505 case AMDGPU::V_PK_ADD_F16:
6506 case AMDGPU::V_PK_ADD_F32:
6507 case AMDGPU::V_PK_ADD_I16:
6508 case AMDGPU::V_PK_ADD_U16:
6509 case AMDGPU::V_PK_ASHRREV_I16:
6510 case AMDGPU::V_PK_FMA_F16:
6511 case AMDGPU::V_PK_FMA_F32:
6512 case AMDGPU::V_PK_FMAC_F16_e32:
6513 case AMDGPU::V_PK_FMAC_F16_e64:
6514 case AMDGPU::V_PK_LSHLREV_B16:
6515 case AMDGPU::V_PK_LSHRREV_B16:
6516 case AMDGPU::V_PK_MAD_I16:
6517 case AMDGPU::V_PK_MAD_U16:
6518 case AMDGPU::V_PK_MAX_F16:
6519 case AMDGPU::V_PK_MAX_I16:
6520 case AMDGPU::V_PK_MAX_U16:
6521 case AMDGPU::V_PK_MIN_F16:
6522 case AMDGPU::V_PK_MIN_I16:
6523 case AMDGPU::V_PK_MIN_U16:
6524 case AMDGPU::V_PK_MOV_B32:
6525 case AMDGPU::V_PK_MUL_F16:
6526 case AMDGPU::V_PK_MUL_F32:
6527 case AMDGPU::V_PK_MUL_LO_U16:
6528 case AMDGPU::V_PK_SUB_I16:
6529 case AMDGPU::V_PK_SUB_U16:
6530 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6531 return true;
6532 default:
6533 return false;
6534 }
6535}
6536
6538 MachineInstr &MI) const {
6539 unsigned Opc = MI.getOpcode();
6540 const MCInstrDesc &InstrDesc = get(Opc);
6541
6542 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6543 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6544
6545 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6546 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6547
6548 // If there is an implicit SGPR use, such as the VCC use for v_addc_u32/v_subb_u32,
6549 // we are only allowed a single constant bus use before GFX10.
6550 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6551 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6552 RI.isSGPRReg(MRI, Src0.getReg()))
6553 legalizeOpWithMove(MI, Src0Idx);
6554
6555 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6556 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6557 // src0/src1 with V_READFIRSTLANE.
6558 if (Opc == AMDGPU::V_WRITELANE_B32) {
6559 const DebugLoc &DL = MI.getDebugLoc();
6560 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6561 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6562 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6563 .add(Src0);
6564 Src0.ChangeToRegister(Reg, false);
6565 }
6566 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6567 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6568 const DebugLoc &DL = MI.getDebugLoc();
6569 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6570 .add(Src1);
6571 Src1.ChangeToRegister(Reg, false);
6572 }
6573 return;
6574 }
6575
6576 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6577 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6578 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6579 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6580 legalizeOpWithMove(MI, Src2Idx);
6581 }
6582
6583 // VOP2 src0 accepts all operand types, so we don't need to check its
6584 // legality. If src1 is already legal, we don't need to do anything.
6585 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6586 return;
6587
6588 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6589 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6590 // select is uniform.
6591 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6592 RI.isVGPR(MRI, Src1.getReg())) {
6593 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6594 const DebugLoc &DL = MI.getDebugLoc();
6595 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6596 .add(Src1);
6597 Src1.ChangeToRegister(Reg, false);
6598 return;
6599 }
6600
6601 // We do not use commuteInstruction here because it is too aggressive and will
6602 // commute if it is possible. We only want to commute here if it improves
6603 // legality. This can be called a fairly large number of times so don't waste
6604 // compile time pointlessly swapping and checking legality again.
6605 if (HasImplicitSGPR || !MI.isCommutable()) {
6606 legalizeOpWithMove(MI, Src1Idx);
6607 return;
6608 }
6609
6610 // If src0 can be used as src1, commuting will make the operands legal.
6611 // Otherwise we have to give up and insert a move.
6612 //
6613 // TODO: Other immediate-like operand kinds could be commuted if there was a
6614 // MachineOperand::ChangeTo* for them.
6615 if ((!Src1.isImm() && !Src1.isReg()) ||
6616 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6617 legalizeOpWithMove(MI, Src1Idx);
6618 return;
6619 }
6620
6621 int CommutedOpc = commuteOpcode(MI);
6622 if (CommutedOpc == -1) {
6623 legalizeOpWithMove(MI, Src1Idx);
6624 return;
6625 }
6626
6627 MI.setDesc(get(CommutedOpc));
6628
6629 Register Src0Reg = Src0.getReg();
6630 unsigned Src0SubReg = Src0.getSubReg();
6631 bool Src0Kill = Src0.isKill();
6632
6633 if (Src1.isImm())
6634 Src0.ChangeToImmediate(Src1.getImm());
6635 else if (Src1.isReg()) {
6636 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6637 Src0.setSubReg(Src1.getSubReg());
6638 } else
6639 llvm_unreachable("Should only have register or immediate operands");
6640
6641 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6642 Src1.setSubReg(Src0SubReg);
6644}
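// Editor's note (illustrative MIR-style sketch): the V_READFIRSTLANE fixups
// above funnel a presumed-uniform VGPR operand through an SGPR the consumer
// can accept, e.g. for v_writelane_b32:
//   %sval  = V_READFIRSTLANE_B32 %vval
//   %slane = V_READFIRSTLANE_B32 %vlane
//   %vdst  = V_WRITELANE_B32 %sval, %slane, %vdst_in (tied)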
6645
6646 // Legalize VOP3 operands. All operand types are supported for any operand,
6647 // but only one literal constant is allowed, and only starting from GFX10.
6649 MachineInstr &MI) const {
6650 unsigned Opc = MI.getOpcode();
6651
6652 int VOP3Idx[3] = {
6653 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6654 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6655 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6656 };
6657
6658 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6659 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6660 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6661 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6662 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6663 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6664 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6665 // src1 and src2 must be scalar
6666 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6667 const DebugLoc &DL = MI.getDebugLoc();
6668 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6669 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6670 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6671 .add(Src1);
6672 Src1.ChangeToRegister(Reg, false);
6673 }
6674 if (VOP3Idx[2] != -1) {
6675 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6676 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6677 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6678 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6679 .add(Src2);
6680 Src2.ChangeToRegister(Reg, false);
6681 }
6682 }
6683 }
6684
6685 // Find the one SGPR operand we are allowed to use.
6686 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6687 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6688 SmallDenseSet<unsigned> SGPRsUsed;
6689 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6690 if (SGPRReg) {
6691 SGPRsUsed.insert(SGPRReg);
6692 --ConstantBusLimit;
6693 }
6694
6695 for (int Idx : VOP3Idx) {
6696 if (Idx == -1)
6697 break;
6698 MachineOperand &MO = MI.getOperand(Idx);
6699
6700 if (!MO.isReg()) {
6701 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6702 continue;
6703
6704 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6705 --LiteralLimit;
6706 --ConstantBusLimit;
6707 continue;
6708 }
6709
6710 --LiteralLimit;
6711 --ConstantBusLimit;
6712 legalizeOpWithMove(MI, Idx);
6713 continue;
6714 }
6715
6716 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6717 continue; // VGPRs are legal
6718
6719 // We can use one SGPR in each VOP3 instruction prior to GFX10
6720 // and two starting from GFX10.
6721 if (SGPRsUsed.count(MO.getReg()))
6722 continue;
6723 if (ConstantBusLimit > 0) {
6724 SGPRsUsed.insert(MO.getReg());
6725 --ConstantBusLimit;
6726 continue;
6727 }
6728
6729 // If we make it this far, then the operand is not legal and we must
6730 // legalize it.
6731 legalizeOpWithMove(MI, Idx);
6732 }
6733
6734 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6735 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6736 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6737 legalizeOpWithMove(MI, VOP3Idx[2]);
6738
6739 // Fix the register class of packed FP32 instructions on gfx12+. See
6740 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6742 for (unsigned I = 0; I < 3; ++I) {
6744 legalizeOpWithMove(MI, VOP3Idx[I]);
6745 }
6746 }
6747}
6748
6751 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6752 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6753 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6754 if (DstRC)
6755 SRC = RI.getCommonSubClass(SRC, DstRC);
6756
6757 Register DstReg = MRI.createVirtualRegister(SRC);
6758 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6759
6760 if (RI.hasAGPRs(VRC)) {
6761 VRC = RI.getEquivalentVGPRClass(VRC);
6762 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6763 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6764 get(TargetOpcode::COPY), NewSrcReg)
6765 .addReg(SrcReg);
6766 SrcReg = NewSrcReg;
6767 }
6768
6769 if (SubRegs == 1) {
6770 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6771 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6772 .addReg(SrcReg);
6773 return DstReg;
6774 }
6775
6777 for (unsigned i = 0; i < SubRegs; ++i) {
6778 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6779 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6780 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6781 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6782 SRegs.push_back(SGPR);
6783 }
6784
6785 MachineInstrBuilder MIB =
6786 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6787 get(AMDGPU::REG_SEQUENCE), DstReg);
6788 for (unsigned i = 0; i < SubRegs; ++i) {
6789 MIB.addReg(SRegs[i]);
6790 MIB.addImm(RI.getSubRegFromChannel(i));
6791 }
6792 return DstReg;
6793}
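// Editor's note (illustrative MIR-style sketch): for a 64-bit VGPR source the
// loop above expands to one readfirstlane per 32-bit channel followed by a
// REG_SEQUENCE, roughly:
//   %s0 = V_READFIRSTLANE_B32 %v.sub0
//   %s1 = V_READFIRSTLANE_B32 %v.sub1
//   %sgpr64 = REG_SEQUENCE %s0, %subreg.sub0, %s1, %subreg.sub1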
6794
6796 MachineInstr &MI) const {
6797
6798 // If the pointer is stored in VGPRs, then we need to move it to
6799 // SGPRs using v_readfirstlane. This is safe because we only select
6800 // loads with uniform pointers to SMRD instructions, so we know the
6801 // pointer value is uniform.
6802 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6803 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6804 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6805 SBase->setReg(SGPR);
6806 }
6807 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6808 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6809 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6810 SOff->setReg(SGPR);
6811 }
6812}
6813
6815 unsigned Opc = Inst.getOpcode();
6816 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6817 if (OldSAddrIdx < 0)
6818 return false;
6819
6820 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6821
6822 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6823 if (NewOpc < 0)
6825 if (NewOpc < 0)
6826 return false;
6827
6829 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6830 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6831 return false;
6832
6833 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6834 if (NewVAddrIdx < 0)
6835 return false;
6836
6837 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6838
 6839 // Check vaddr; it must be zero or absent.

6840 MachineInstr *VAddrDef = nullptr;
6841 if (OldVAddrIdx >= 0) {
6842 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6843 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6844 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6845 !VAddrDef->getOperand(1).isImm() ||
6846 VAddrDef->getOperand(1).getImm() != 0)
6847 return false;
6848 }
6849
6850 const MCInstrDesc &NewDesc = get(NewOpc);
6851 Inst.setDesc(NewDesc);
6852
6853 // Callers expect iterator to be valid after this call, so modify the
6854 // instruction in place.
6855 if (OldVAddrIdx == NewVAddrIdx) {
6856 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6857 // Clear use list from the old vaddr holding a zero register.
6858 MRI.removeRegOperandFromUseList(&NewVAddr);
6859 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6860 Inst.removeOperand(OldSAddrIdx);
 6861 // Update the use list with the pointer we have just moved from the vaddr
 6862 // to the saddr position. Otherwise the new vaddr will be missing from the use list.
6863 MRI.removeRegOperandFromUseList(&NewVAddr);
6864 MRI.addRegOperandToUseList(&NewVAddr);
6865 } else {
6866 assert(OldSAddrIdx == NewVAddrIdx);
6867
6868 if (OldVAddrIdx >= 0) {
6869 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6870 AMDGPU::OpName::vdst_in);
6871
 6872 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
 6873 // it asserts. Untie the operands for now and retie them afterwards.
6874 if (NewVDstIn != -1) {
6875 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6876 Inst.untieRegOperand(OldVDstIn);
6877 }
6878
6879 Inst.removeOperand(OldVAddrIdx);
6880
6881 if (NewVDstIn != -1) {
6882 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6883 Inst.tieOperands(NewVDst, NewVDstIn);
6884 }
6885 }
6886 }
6887
6888 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6889 VAddrDef->eraseFromParent();
6890
6891 return true;
6892}
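// Illustrative sketch (assumed opcodes and operand order, not verbatim): when
// the saddr operand of a *_SADDR global form turns out to hold a VGPR, the
// function above folds the instruction back to the plain vaddr form, roughly:
//   before: GLOBAL_LOAD_DWORD_SADDR %vreg_saddr, %vaddr_zero, 0, 0
//   after:  GLOBAL_LOAD_DWORD       %vreg_saddr, 0, 0
// erasing the now-dead zero vaddr definition if it has no remaining users.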
6893
6894// FIXME: Remove this when SelectionDAG is obsoleted.
6896 MachineInstr &MI) const {
6897 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
6898 return;
6899
 6900 // Fix up SGPR operands held in VGPRs. We only select these when the DAG
 6901 // divergence analysis thinks they are uniform, so a readfirstlane should be valid.
6902 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6903 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6904 return;
6905
6907 return;
6908
6909 const TargetRegisterClass *DeclaredRC =
6910 getRegClass(MI.getDesc(), SAddr->getOperandNo());
6911
6912 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6913 SAddr->setReg(ToSGPR);
6914}
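// Illustrative sketch (assumed MIR, not verbatim): if the instruction cannot
// be switched to another addressing form, the VGPR saddr is instead replaced
// by a readfirstlane'd SGPR of the register class the instruction declares:
//   %s:sreg_64 = <readlaneVGPRToSGPR expansion of %vsaddr>  ; saddr := %s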
6915
6918 const TargetRegisterClass *DstRC,
6921 const DebugLoc &DL) const {
6922 Register OpReg = Op.getReg();
6923 unsigned OpSubReg = Op.getSubReg();
6924
6925 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6926 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6927
6928 // Check if operand is already the correct register class.
6929 if (DstRC == OpRC)
6930 return;
6931
6932 Register DstReg = MRI.createVirtualRegister(DstRC);
6933 auto Copy =
6934 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6935 Op.setReg(DstReg);
6936
6937 MachineInstr *Def = MRI.getVRegDef(OpReg);
6938 if (!Def)
6939 return;
6940
6941 // Try to eliminate the copy if it is copying an immediate value.
6942 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6943 foldImmediate(*Copy, *Def, OpReg, &MRI);
6944
6945 bool ImpDef = Def->isImplicitDef();
6946 while (!ImpDef && Def && Def->isCopy()) {
6947 if (Def->getOperand(1).getReg().isPhysical())
6948 break;
6949 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6950 ImpDef = Def && Def->isImplicitDef();
6951 }
6952 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6953 !ImpDef)
6954 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6955}
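// Illustrative sketch (assumed MIR, not verbatim): legalizing an SGPR operand
// of a PHI whose result must live in VGPRs inserts a copy in the predecessor
// block, roughly:
//   bb.pred:
//     %v:vgpr_32 = COPY %s:sgpr_32       ; may gain an implicit $exec use
//   bb.phi:
//     %r:vgpr_32 = PHI %v, %bb.pred, ...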
6956
6957// Emit the actual waterfall loop, executing the wrapped instruction for each
6958// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6959// iteration, in the worst case we execute 64 (once per lane).
6960static void
6963 MachineBasicBlock &LoopBB,
6964 MachineBasicBlock &BodyBB,
6965 const DebugLoc &DL,
6966 ArrayRef<MachineOperand *> ScalarOps) {
6967 MachineFunction &MF = *LoopBB.getParent();
6968 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6969 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6971 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6972
6974 Register CondReg;
6975
6976 for (MachineOperand *ScalarOp : ScalarOps) {
6977 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6978 unsigned NumSubRegs = RegSize / 32;
6979 Register VScalarOp = ScalarOp->getReg();
6980
6981 if (NumSubRegs == 1) {
6982 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6983
6984 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6985 .addReg(VScalarOp);
6986
6987 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6988
6989 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6990 .addReg(CurReg)
6991 .addReg(VScalarOp);
6992
6993 // Combine the comparison results with AND.
6994 if (!CondReg) // First.
6995 CondReg = NewCondReg;
6996 else { // If not the first, we create an AND.
6997 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6998 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
6999 .addReg(CondReg)
7000 .addReg(NewCondReg);
7001 CondReg = AndReg;
7002 }
7003
7004 // Update ScalarOp operand to use the SGPR ScalarOp.
7005 ScalarOp->setReg(CurReg);
7006 ScalarOp->setIsKill();
7007 } else {
7008 SmallVector<Register, 8> ReadlanePieces;
7009 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
7010 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
7011 "Unhandled register size");
7012
7013 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
7014 Register CurRegLo =
7015 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7016 Register CurRegHi =
7017 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7018
7019 // Read the next variant <- also loop target.
7020 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
7021 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
7022
7023 // Read the next variant <- also loop target.
7024 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
7025 .addReg(VScalarOp, VScalarOpUndef,
7026 TRI->getSubRegFromChannel(Idx + 1));
7027
7028 ReadlanePieces.push_back(CurRegLo);
7029 ReadlanePieces.push_back(CurRegHi);
7030
7031 // Comparison is to be done as 64-bit.
7032 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
7033 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
7034 .addReg(CurRegLo)
7035 .addImm(AMDGPU::sub0)
7036 .addReg(CurRegHi)
7037 .addImm(AMDGPU::sub1);
7038
7039 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7040 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
7041 NewCondReg)
7042 .addReg(CurReg);
7043 if (NumSubRegs <= 2)
7044 Cmp.addReg(VScalarOp);
7045 else
7046 Cmp.addReg(VScalarOp, VScalarOpUndef,
7047 TRI->getSubRegFromChannel(Idx, 2));
7048
7049 // Combine the comparison results with AND.
7050 if (!CondReg) // First.
7051 CondReg = NewCondReg;
7052 else { // If not the first, we create an AND.
7053 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7054 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7055 .addReg(CondReg)
7056 .addReg(NewCondReg);
7057 CondReg = AndReg;
7058 }
7059 } // End for loop.
7060
7061 const auto *SScalarOpRC =
7062 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
7063 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
7064
7065 // Build scalar ScalarOp.
7066 auto Merge =
7067 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
7068 unsigned Channel = 0;
7069 for (Register Piece : ReadlanePieces) {
7070 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
7071 }
7072
7073 // Update ScalarOp operand to use the SGPR ScalarOp.
7074 ScalarOp->setReg(SScalarOp);
7075 ScalarOp->setIsKill();
7076 }
7077 }
7078
7079 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7080 MRI.setSimpleHint(SaveExec, CondReg);
7081
7082 // Update EXEC to matching lanes, saving original to SaveExec.
7083 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
7084 .addReg(CondReg, RegState::Kill);
7085
7086 // The original instruction is here; we insert the terminators after it.
7087 I = BodyBB.end();
7088
7089 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7090 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
7091 .addReg(LMC.ExecReg)
7092 .addReg(SaveExec);
7093
7094 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
7095}
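// Illustrative sketch of the control flow produced above (names assumed, not
// verbatim MIR):
//   loop:
//     %cur  = V_READFIRSTLANE_B32 %vscalar      ; one per 32-bit piece
//     %cond = V_CMP_EQ_* %cur, %vscalar         ; AND'ed across all operands
//     SaveExec, $exec = S_AND_SAVEEXEC_* %cond
//   body:
//     <the waterfalled instruction, now using the readfirstlane'd pieces>
//     $exec = S_XOR_*_term $exec, SaveExec
//     SI_WATERFALL_LOOP %loop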
7096
7097// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
7098// with SGPRs by iterating over all unique values across all lanes.
7099// Returns the loop basic block that now contains \p MI.
7100static MachineBasicBlock *
7104 MachineBasicBlock::iterator Begin = nullptr,
7105 MachineBasicBlock::iterator End = nullptr) {
7106 MachineBasicBlock &MBB = *MI.getParent();
7107 MachineFunction &MF = *MBB.getParent();
7108 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7109 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7111 if (!Begin.isValid())
7112 Begin = &MI;
7113 if (!End.isValid()) {
7114 End = &MI;
7115 ++End;
7116 }
7117 const DebugLoc &DL = MI.getDebugLoc();
7119 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7120
7121 // Save SCC. Waterfall Loop may overwrite SCC.
7122 Register SaveSCCReg;
7123
 7124 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
 7125 // rather than doing an unlimited scan everywhere.
7126 bool SCCNotDead =
7127 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7128 std::numeric_limits<unsigned>::max()) !=
7130 if (SCCNotDead) {
7131 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7132 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7133 .addImm(1)
7134 .addImm(0);
7135 }
7136
7137 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7138
7139 // Save the EXEC mask
7140 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7141
7142 // Killed uses in the instruction we are waterfalling around will be
7143 // incorrect due to the added control-flow.
7145 ++AfterMI;
7146 for (auto I = Begin; I != AfterMI; I++) {
7147 for (auto &MO : I->all_uses())
7148 MRI.clearKillFlags(MO.getReg());
7149 }
7150
7151 // To insert the loop we need to split the block. Move everything after this
7152 // point to a new block, and insert a new empty block between the two.
7155 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7157 ++MBBI;
7158
7159 MF.insert(MBBI, LoopBB);
7160 MF.insert(MBBI, BodyBB);
7161 MF.insert(MBBI, RemainderBB);
7162
7163 LoopBB->addSuccessor(BodyBB);
7164 BodyBB->addSuccessor(LoopBB);
7165 BodyBB->addSuccessor(RemainderBB);
7166
 7167 // Move the instructions from Begin to MI into BodyBB, and the remainder of
 7168 // the block into RemainderBB.
7169 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7170 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7171 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7172
7173 MBB.addSuccessor(LoopBB);
7174
7175 // Update dominators. We know that MBB immediately dominates LoopBB, that
7176 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7177 // RemainderBB. RemainderBB immediately dominates all of the successors
7178 // transferred to it from MBB that MBB used to properly dominate.
7179 if (MDT) {
7180 MDT->addNewBlock(LoopBB, &MBB);
7181 MDT->addNewBlock(BodyBB, LoopBB);
7182 MDT->addNewBlock(RemainderBB, BodyBB);
7183 for (auto &Succ : RemainderBB->successors()) {
7184 if (MDT->properlyDominates(&MBB, Succ)) {
7185 MDT->changeImmediateDominator(Succ, RemainderBB);
7186 }
7187 }
7188 }
7189
7190 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
7191
7192 MachineBasicBlock::iterator First = RemainderBB->begin();
7193 // Restore SCC
7194 if (SCCNotDead) {
7195 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7196 .addReg(SaveSCCReg, RegState::Kill)
7197 .addImm(0);
7198 }
7199
7200 // Restore the EXEC mask
7201 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7202 .addReg(SaveExec);
7203 return BodyBB;
7204}
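// Illustrative sketch of the resulting block structure (not verbatim):
//   MBB:         save SCC (if live) and EXEC, fall through to LoopBB
//   LoopBB:      readfirstlane + compare + and-saveexec (see sketch above)
//   BodyBB:      the waterfalled instruction(s) plus the loop-back terminators
//   RemainderBB: restore SCC and EXEC; inherits the original successors of MBB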
7205
7206// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
7207static std::tuple<unsigned, unsigned>
7209 MachineBasicBlock &MBB = *MI.getParent();
7210 MachineFunction &MF = *MBB.getParent();
7212
7213 // Extract the ptr from the resource descriptor.
7214 unsigned RsrcPtr =
7215 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7216 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7217
7218 // Create an empty resource descriptor
7219 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7220 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7221 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7222 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7223 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7224
7225 // Zero64 = 0
7226 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7227 .addImm(0);
7228
7229 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7230 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7231 .addImm(Lo_32(RsrcDataFormat));
7232
7233 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7234 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7235 .addImm(Hi_32(RsrcDataFormat));
7236
7237 // NewSRsrc = {Zero64, SRsrcFormat}
7238 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7239 .addReg(Zero64)
7240 .addImm(AMDGPU::sub0_sub1)
7241 .addReg(SRsrcFormatLo)
7242 .addImm(AMDGPU::sub2)
7243 .addReg(SRsrcFormatHi)
7244 .addImm(AMDGPU::sub3);
7245
7246 return std::tuple(RsrcPtr, NewSRsrc);
7247}
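// Illustrative note (layout implied by the code above, not stated verbatim in
// the source): the replacement descriptor has a null base pointer and only
// the default data format, i.e.
//   NewSRsrc = { 0 (64-bit base), RSRC_DATA_FORMAT[31:0], RSRC_DATA_FORMAT[63:32] }
// while the real base pointer lives on in RsrcPtr and is added to vaddr by
// the callers below.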
7248
7251 MachineDominatorTree *MDT) const {
7252 MachineFunction &MF = *MI.getMF();
7254 MachineBasicBlock *CreatedBB = nullptr;
7255
7256 // Legalize VOP2
7257 if (isVOP2(MI) || isVOPC(MI)) {
7259 return CreatedBB;
7260 }
7261
7262 // Legalize VOP3
7263 if (isVOP3(MI)) {
7265 return CreatedBB;
7266 }
7267
7268 // Legalize SMRD
7269 if (isSMRD(MI)) {
7271 return CreatedBB;
7272 }
7273
7274 // Legalize FLAT
7275 if (isFLAT(MI)) {
7277 return CreatedBB;
7278 }
7279
7280 // Legalize REG_SEQUENCE and PHI
 7281 // The register class of the operands must be the same type as the register
 7282 // class of the output.
7283 if (MI.getOpcode() == AMDGPU::PHI) {
7284 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
7285 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
7286 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
7287 continue;
7288 const TargetRegisterClass *OpRC =
7289 MRI.getRegClass(MI.getOperand(i).getReg());
7290 if (RI.hasVectorRegisters(OpRC)) {
7291 VRC = OpRC;
7292 } else {
7293 SRC = OpRC;
7294 }
7295 }
7296
 7297 // If any of the operands are VGPR registers, then they all must be VGPRs;
 7298 // otherwise we will create illegal VGPR->SGPR copies when legalizing
 7299 // them.
7300 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
7301 if (!VRC) {
7302 assert(SRC);
7303 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
7304 VRC = &AMDGPU::VReg_1RegClass;
7305 } else
7306 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7307 ? RI.getEquivalentAGPRClass(SRC)
7308 : RI.getEquivalentVGPRClass(SRC);
7309 } else {
7310 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7311 ? RI.getEquivalentAGPRClass(VRC)
7312 : RI.getEquivalentVGPRClass(VRC);
7313 }
7314 RC = VRC;
7315 } else {
7316 RC = SRC;
7317 }
7318
7319 // Update all the operands so they have the same type.
7320 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7321 MachineOperand &Op = MI.getOperand(I);
7322 if (!Op.isReg() || !Op.getReg().isVirtual())
7323 continue;
7324
7325 // MI is a PHI instruction.
7326 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7328
7329 // Avoid creating no-op copies with the same src and dst reg class. These
7330 // confuse some of the machine passes.
7331 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
7332 }
7333 }
7334
7335 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7336 // VGPR dest type and SGPR sources, insert copies so all operands are
7337 // VGPRs. This seems to help operand folding / the register coalescer.
7338 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7339 MachineBasicBlock *MBB = MI.getParent();
7340 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7341 if (RI.hasVGPRs(DstRC)) {
7342 // Update all the operands so they are VGPR register classes. These may
7343 // not be the same register class because REG_SEQUENCE supports mixing
7344 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7345 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7346 MachineOperand &Op = MI.getOperand(I);
7347 if (!Op.isReg() || !Op.getReg().isVirtual())
7348 continue;
7349
7350 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7351 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7352 if (VRC == OpRC)
7353 continue;
7354
7355 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7356 Op.setIsKill();
7357 }
7358 }
7359
7360 return CreatedBB;
7361 }
7362
7363 // Legalize INSERT_SUBREG
7364 // src0 must have the same register class as dst
7365 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7366 Register Dst = MI.getOperand(0).getReg();
7367 Register Src0 = MI.getOperand(1).getReg();
7368 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7369 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7370 if (DstRC != Src0RC) {
7371 MachineBasicBlock *MBB = MI.getParent();
7372 MachineOperand &Op = MI.getOperand(1);
7373 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7374 }
7375 return CreatedBB;
7376 }
7377
7378 // Legalize SI_INIT_M0
7379 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7380 MachineOperand &Src = MI.getOperand(0);
7381 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7382 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7383 return CreatedBB;
7384 }
7385
7386 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7387 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7388 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7389 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7390 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7391 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7392 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7393 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7394 MachineOperand &Src = MI.getOperand(1);
7395 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7396 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7397 return CreatedBB;
7398 }
7399
7400 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7401 //
7402 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7403 // scratch memory access. In both cases, the legalization never involves
7404 // conversion to the addr64 form.
7406 (isMUBUF(MI) || isMTBUF(MI)))) {
7407 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7408 ? AMDGPU::OpName::rsrc
7409 : AMDGPU::OpName::srsrc;
7410 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7411 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7412 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7413
7414 AMDGPU::OpName SampOpName =
7415 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7416 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7417 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7418 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7419
7420 return CreatedBB;
7421 }
7422
7423 // Legalize SI_CALL
7424 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7425 MachineOperand *Dest = &MI.getOperand(0);
7426 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
 7427 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, along with
 7428 // the following copies, into the loop block; we also need to move copies
 7429 // from and to physical registers.
7430 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7431 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7432
7433 // Also move the copies to physical registers into the loop block
7434 MachineBasicBlock &MBB = *MI.getParent();
7436 while (Start->getOpcode() != FrameSetupOpcode)
7437 --Start;
7439 while (End->getOpcode() != FrameDestroyOpcode)
7440 ++End;
7441 // Also include following copies of the return value
7442 ++End;
7443 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7444 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7445 ++End;
7446 CreatedBB =
7447 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7448 }
7449 }
7450
7451 // Legalize s_sleep_var.
7452 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7453 const DebugLoc &DL = MI.getDebugLoc();
7454 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7455 int Src0Idx =
7456 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7457 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7458 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7459 .add(Src0);
7460 Src0.ChangeToRegister(Reg, false);
7461 return nullptr;
7462 }
7463
7464 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7465 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7466 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7467 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7468 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7469 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7470 for (MachineOperand &Src : MI.explicit_operands()) {
7471 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7472 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7473 }
7474 return CreatedBB;
7475 }
7476
7477 // Legalize MUBUF instructions.
7478 bool isSoffsetLegal = true;
7479 int SoffsetIdx =
7480 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7481 if (SoffsetIdx != -1) {
7482 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7483 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7484 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7485 isSoffsetLegal = false;
7486 }
7487 }
7488
7489 bool isRsrcLegal = true;
7490 int RsrcIdx =
7491 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7492 if (RsrcIdx != -1) {
7493 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7494 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7495 isRsrcLegal = false;
7496 }
7497
7498 // The operands are legal.
7499 if (isRsrcLegal && isSoffsetLegal)
7500 return CreatedBB;
7501
7502 if (!isRsrcLegal) {
7503 // Legalize a VGPR Rsrc
7504 //
7505 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7506 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7507 // a zero-value SRsrc.
7508 //
7509 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7510 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7511 // above.
7512 //
7513 // Otherwise we are on non-ADDR64 hardware, and/or we have
7514 // idxen/offen/bothen and we fall back to a waterfall loop.
7515
7516 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7517 MachineBasicBlock &MBB = *MI.getParent();
7518
7519 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7520 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7521 // This is already an ADDR64 instruction so we need to add the pointer
7522 // extracted from the resource descriptor to the current value of VAddr.
7523 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7524 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7525 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7526
7527 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7528 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7529 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7530
7531 unsigned RsrcPtr, NewSRsrc;
7532 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7533
7534 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7535 const DebugLoc &DL = MI.getDebugLoc();
7536 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7537 .addDef(CondReg0)
7538 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7539 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
7540 .addImm(0);
7541
7542 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7543 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7544 .addDef(CondReg1, RegState::Dead)
7545 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7546 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
7547 .addReg(CondReg0, RegState::Kill)
7548 .addImm(0);
7549
7550 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7551 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7552 .addReg(NewVAddrLo)
7553 .addImm(AMDGPU::sub0)
7554 .addReg(NewVAddrHi)
7555 .addImm(AMDGPU::sub1);
7556
7557 VAddr->setReg(NewVAddr);
7558 Rsrc->setReg(NewSRsrc);
7559 } else if (!VAddr && ST.hasAddr64()) {
 7560 // This instruction is the _OFFSET variant, so we need to convert it to
 7561 // ADDR64.
7562 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7563 "FIXME: Need to emit flat atomics here");
7564
7565 unsigned RsrcPtr, NewSRsrc;
7566 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7567
7568 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7569 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7570 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7571 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7572 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7573
7574 // Atomics with return have an additional tied operand and are
7575 // missing some of the special bits.
7576 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7577 MachineInstr *Addr64;
7578
7579 if (!VDataIn) {
7580 // Regular buffer load / store.
7582 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7583 .add(*VData)
7584 .addReg(NewVAddr)
7585 .addReg(NewSRsrc)
7586 .add(*SOffset)
7587 .add(*Offset);
7588
7589 if (const MachineOperand *CPol =
7590 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7591 MIB.addImm(CPol->getImm());
7592 }
7593
7594 if (const MachineOperand *TFE =
7595 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7596 MIB.addImm(TFE->getImm());
7597 }
7598
7599 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7600
7601 MIB.cloneMemRefs(MI);
7602 Addr64 = MIB;
7603 } else {
7604 // Atomics with return.
7605 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7606 .add(*VData)
7607 .add(*VDataIn)
7608 .addReg(NewVAddr)
7609 .addReg(NewSRsrc)
7610 .add(*SOffset)
7611 .add(*Offset)
7612 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7613 .cloneMemRefs(MI);
7614 }
7615
7616 MI.removeFromParent();
7617
7618 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7619 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7620 NewVAddr)
7621 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7622 .addImm(AMDGPU::sub0)
7623 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7624 .addImm(AMDGPU::sub1);
7625 } else {
7626 // Legalize a VGPR Rsrc and soffset together.
7627 if (!isSoffsetLegal) {
7628 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7629 CreatedBB =
7630 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7631 return CreatedBB;
7632 }
7633 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7634 return CreatedBB;
7635 }
7636 }
7637
7638 // Legalize a VGPR soffset.
7639 if (!isSoffsetLegal) {
7640 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7641 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7642 return CreatedBB;
7643 }
7644 return CreatedBB;
7645}
7646
7648 InstrList.insert(MI);
 7649 // Add MBUF instructions to the deferred list.
7650 int RsrcIdx =
7651 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7652 if (RsrcIdx != -1) {
7653 DeferredList.insert(MI);
7654 }
7655}
7656
7658 return DeferredList.contains(MI);
7659}
7660
 7661 // Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
 7662 // lowering (changing sgpr to vgpr).
 7663 // This is mainly caused by 16-bit SALU and 16-bit VALU instructions using
 7664 // registers of different sizes, so we need to legalize the operand sizes
 7665 // during the vgpr lowering chain. This can be removed once sgpr16 is in place.
7667 MachineRegisterInfo &MRI) const {
7668 if (!ST.useRealTrue16Insts())
7669 return;
7670
7671 unsigned Opcode = MI.getOpcode();
7672 MachineBasicBlock *MBB = MI.getParent();
7673 // Legalize operands and check for size mismatch
7674 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7675 OpIdx >= get(Opcode).getNumOperands() ||
7676 get(Opcode).operands()[OpIdx].RegClass == -1)
7677 return;
7678
7679 MachineOperand &Op = MI.getOperand(OpIdx);
7680 if (!Op.isReg() || !Op.getReg().isVirtual())
7681 return;
7682
7683 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7684 if (!RI.isVGPRClass(CurrRC))
7685 return;
7686
7687 int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
7688 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7689 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7690 Op.setSubReg(AMDGPU::lo16);
7691 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7692 const DebugLoc &DL = MI.getDebugLoc();
7693 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7694 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7695 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7696 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7697 .addReg(Op.getReg())
7698 .addImm(AMDGPU::lo16)
7699 .addReg(Undef)
7700 .addImm(AMDGPU::hi16);
7701 Op.setReg(NewDstReg);
7702 }
7703}
7705 MachineRegisterInfo &MRI) const {
7706 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7708}
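// Illustrative sketch (assumed MIR, not verbatim): a 16-bit VGPR operand fed
// into an instruction expecting a 32-bit VGPR class is padded with an
// undefined high half, roughly:
//   %undef:vgpr_16 = IMPLICIT_DEF
//   %v32:vgpr_32   = REG_SEQUENCE %v16, %subreg.lo16, %undef, %subreg.hi16
// while the opposite mismatch is handled by reading only the lo16 subregister.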
7709
7711 MachineDominatorTree *MDT) const {
7712
7713 while (!Worklist.empty()) {
7714 MachineInstr &Inst = *Worklist.top();
7715 Worklist.erase_top();
7716 // Skip MachineInstr in the deferred list.
7717 if (Worklist.isDeferred(&Inst))
7718 continue;
7719 moveToVALUImpl(Worklist, MDT, Inst);
7720 }
7721
 7722 // The deferred list of instructions will be processed once
 7723 // all the MachineInstrs in the worklist are done.
7724 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7725 moveToVALUImpl(Worklist, MDT, *Inst);
7726 assert(Worklist.empty() &&
7727 "Deferred MachineInstr are not supposed to re-populate worklist");
7728 }
7729}
7730
7733 MachineInstr &Inst) const {
7734
7736 if (!MBB)
7737 return;
7738 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7739 unsigned Opcode = Inst.getOpcode();
7740 unsigned NewOpcode = getVALUOp(Inst);
7741 const DebugLoc &DL = Inst.getDebugLoc();
7742
7743 // Handle some special cases
7744 switch (Opcode) {
7745 default:
7746 break;
7747 case AMDGPU::S_ADD_I32:
7748 case AMDGPU::S_SUB_I32: {
7749 // FIXME: The u32 versions currently selected use the carry.
7750 bool Changed;
7751 MachineBasicBlock *CreatedBBTmp = nullptr;
7752 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7753 if (Changed)
7754 return;
7755
7756 // Default handling
7757 break;
7758 }
7759
7760 case AMDGPU::S_MUL_U64:
7761 if (ST.hasVectorMulU64()) {
7762 NewOpcode = AMDGPU::V_MUL_U64_e64;
7763 break;
7764 }
7765 // Split s_mul_u64 in 32-bit vector multiplications.
7766 splitScalarSMulU64(Worklist, Inst, MDT);
7767 Inst.eraseFromParent();
7768 return;
7769
7770 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7771 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7772 // This is a special case of s_mul_u64 where all the operands are either
7773 // zero extended or sign extended.
7774 splitScalarSMulPseudo(Worklist, Inst, MDT);
7775 Inst.eraseFromParent();
7776 return;
7777
7778 case AMDGPU::S_AND_B64:
7779 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7780 Inst.eraseFromParent();
7781 return;
7782
7783 case AMDGPU::S_OR_B64:
7784 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7785 Inst.eraseFromParent();
7786 return;
7787
7788 case AMDGPU::S_XOR_B64:
7789 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7790 Inst.eraseFromParent();
7791 return;
7792
7793 case AMDGPU::S_NAND_B64:
7794 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7795 Inst.eraseFromParent();
7796 return;
7797
7798 case AMDGPU::S_NOR_B64:
7799 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7800 Inst.eraseFromParent();
7801 return;
7802
7803 case AMDGPU::S_XNOR_B64:
7804 if (ST.hasDLInsts())
7805 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7806 else
7807 splitScalar64BitXnor(Worklist, Inst, MDT);
7808 Inst.eraseFromParent();
7809 return;
7810
7811 case AMDGPU::S_ANDN2_B64:
7812 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7813 Inst.eraseFromParent();
7814 return;
7815
7816 case AMDGPU::S_ORN2_B64:
7817 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7818 Inst.eraseFromParent();
7819 return;
7820
7821 case AMDGPU::S_BREV_B64:
7822 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7823 Inst.eraseFromParent();
7824 return;
7825
7826 case AMDGPU::S_NOT_B64:
7827 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7828 Inst.eraseFromParent();
7829 return;
7830
7831 case AMDGPU::S_BCNT1_I32_B64:
7832 splitScalar64BitBCNT(Worklist, Inst);
7833 Inst.eraseFromParent();
7834 return;
7835
7836 case AMDGPU::S_BFE_I64:
7837 splitScalar64BitBFE(Worklist, Inst);
7838 Inst.eraseFromParent();
7839 return;
7840
7841 case AMDGPU::S_FLBIT_I32_B64:
7842 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7843 Inst.eraseFromParent();
7844 return;
7845 case AMDGPU::S_FF1_I32_B64:
7846 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7847 Inst.eraseFromParent();
7848 return;
7849
7850 case AMDGPU::S_LSHL_B32:
7851 if (ST.hasOnlyRevVALUShifts()) {
7852 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7853 swapOperands(Inst);
7854 }
7855 break;
7856 case AMDGPU::S_ASHR_I32:
7857 if (ST.hasOnlyRevVALUShifts()) {
7858 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7859 swapOperands(Inst);
7860 }
7861 break;
7862 case AMDGPU::S_LSHR_B32:
7863 if (ST.hasOnlyRevVALUShifts()) {
7864 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7865 swapOperands(Inst);
7866 }
7867 break;
7868 case AMDGPU::S_LSHL_B64:
7869 if (ST.hasOnlyRevVALUShifts()) {
7870 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7871 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7872 : AMDGPU::V_LSHLREV_B64_e64;
7873 swapOperands(Inst);
7874 }
7875 break;
7876 case AMDGPU::S_ASHR_I64:
7877 if (ST.hasOnlyRevVALUShifts()) {
7878 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7879 swapOperands(Inst);
7880 }
7881 break;
7882 case AMDGPU::S_LSHR_B64:
7883 if (ST.hasOnlyRevVALUShifts()) {
7884 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7885 swapOperands(Inst);
7886 }
7887 break;
7888
7889 case AMDGPU::S_ABS_I32:
7890 lowerScalarAbs(Worklist, Inst);
7891 Inst.eraseFromParent();
7892 return;
7893
7894 case AMDGPU::S_ABSDIFF_I32:
7895 lowerScalarAbsDiff(Worklist, Inst);
7896 Inst.eraseFromParent();
7897 return;
7898
7899 case AMDGPU::S_CBRANCH_SCC0:
7900 case AMDGPU::S_CBRANCH_SCC1: {
7901 // Clear unused bits of vcc
7902 Register CondReg = Inst.getOperand(1).getReg();
7903 bool IsSCC = CondReg == AMDGPU::SCC;
7905 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
7906 .addReg(LMC.ExecReg)
7907 .addReg(IsSCC ? LMC.VccReg : CondReg);
7908 Inst.removeOperand(1);
7909 } break;
7910
7911 case AMDGPU::S_BFE_U64:
7912 case AMDGPU::S_BFM_B64:
7913 llvm_unreachable("Moving this op to VALU not implemented");
7914
7915 case AMDGPU::S_PACK_LL_B32_B16:
7916 case AMDGPU::S_PACK_LH_B32_B16:
7917 case AMDGPU::S_PACK_HL_B32_B16:
7918 case AMDGPU::S_PACK_HH_B32_B16:
7919 movePackToVALU(Worklist, MRI, Inst);
7920 Inst.eraseFromParent();
7921 return;
7922
7923 case AMDGPU::S_XNOR_B32:
7924 lowerScalarXnor(Worklist, Inst);
7925 Inst.eraseFromParent();
7926 return;
7927
7928 case AMDGPU::S_NAND_B32:
7929 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7930 Inst.eraseFromParent();
7931 return;
7932
7933 case AMDGPU::S_NOR_B32:
7934 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7935 Inst.eraseFromParent();
7936 return;
7937
7938 case AMDGPU::S_ANDN2_B32:
7939 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7940 Inst.eraseFromParent();
7941 return;
7942
7943 case AMDGPU::S_ORN2_B32:
7944 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7945 Inst.eraseFromParent();
7946 return;
7947
7948 // TODO: remove as soon as everything is ready
7949 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7950 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7951 // can only be selected from the uniform SDNode.
7952 case AMDGPU::S_ADD_CO_PSEUDO:
7953 case AMDGPU::S_SUB_CO_PSEUDO: {
7954 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7955 ? AMDGPU::V_ADDC_U32_e64
7956 : AMDGPU::V_SUBB_U32_e64;
7957 const auto *CarryRC = RI.getWaveMaskRegClass();
7958
7959 Register CarryInReg = Inst.getOperand(4).getReg();
7960 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7961 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7962 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7963 .addReg(CarryInReg);
7964 }
7965
7966 Register CarryOutReg = Inst.getOperand(1).getReg();
7967
7968 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7969 MRI.getRegClass(Inst.getOperand(0).getReg())));
7970 MachineInstr *CarryOp =
7971 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7972 .addReg(CarryOutReg, RegState::Define)
7973 .add(Inst.getOperand(2))
7974 .add(Inst.getOperand(3))
7975 .addReg(CarryInReg)
7976 .addImm(0);
7977 legalizeOperands(*CarryOp);
7978 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7979 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7980 Inst.eraseFromParent();
7981 }
7982 return;
7983 case AMDGPU::S_UADDO_PSEUDO:
7984 case AMDGPU::S_USUBO_PSEUDO: {
7985 MachineOperand &Dest0 = Inst.getOperand(0);
7986 MachineOperand &Dest1 = Inst.getOperand(1);
7987 MachineOperand &Src0 = Inst.getOperand(2);
7988 MachineOperand &Src1 = Inst.getOperand(3);
7989
7990 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7991 ? AMDGPU::V_ADD_CO_U32_e64
7992 : AMDGPU::V_SUB_CO_U32_e64;
7993 const TargetRegisterClass *NewRC =
7994 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7995 Register DestReg = MRI.createVirtualRegister(NewRC);
7996 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7997 .addReg(Dest1.getReg(), RegState::Define)
7998 .add(Src0)
7999 .add(Src1)
8000 .addImm(0); // clamp bit
8001
8002 legalizeOperands(*NewInstr, MDT);
8003 MRI.replaceRegWith(Dest0.getReg(), DestReg);
8004 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8005 Inst.eraseFromParent();
8006 }
8007 return;
8008 case AMDGPU::S_LSHL1_ADD_U32:
8009 case AMDGPU::S_LSHL2_ADD_U32:
8010 case AMDGPU::S_LSHL3_ADD_U32:
8011 case AMDGPU::S_LSHL4_ADD_U32: {
8012 MachineOperand &Dest = Inst.getOperand(0);
8013 MachineOperand &Src0 = Inst.getOperand(1);
8014 MachineOperand &Src1 = Inst.getOperand(2);
8015 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
8016 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
8017 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
8018 : 4);
8019
8020 const TargetRegisterClass *NewRC =
8021 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
8022 Register DestReg = MRI.createVirtualRegister(NewRC);
8023 MachineInstr *NewInstr =
8024 BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
8025 .add(Src0)
8026 .addImm(ShiftAmt)
8027 .add(Src1);
8028
8029 legalizeOperands(*NewInstr, MDT);
8030 MRI.replaceRegWith(Dest.getReg(), DestReg);
8031 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8032 Inst.eraseFromParent();
8033 }
8034 return;
8035 case AMDGPU::S_CSELECT_B32:
8036 case AMDGPU::S_CSELECT_B64:
8037 lowerSelect(Worklist, Inst, MDT);
8038 Inst.eraseFromParent();
8039 return;
8040 case AMDGPU::S_CMP_EQ_I32:
8041 case AMDGPU::S_CMP_LG_I32:
8042 case AMDGPU::S_CMP_GT_I32:
8043 case AMDGPU::S_CMP_GE_I32:
8044 case AMDGPU::S_CMP_LT_I32:
8045 case AMDGPU::S_CMP_LE_I32:
8046 case AMDGPU::S_CMP_EQ_U32:
8047 case AMDGPU::S_CMP_LG_U32:
8048 case AMDGPU::S_CMP_GT_U32:
8049 case AMDGPU::S_CMP_GE_U32:
8050 case AMDGPU::S_CMP_LT_U32:
8051 case AMDGPU::S_CMP_LE_U32:
8052 case AMDGPU::S_CMP_EQ_U64:
8053 case AMDGPU::S_CMP_LG_U64:
8054 case AMDGPU::S_CMP_LT_F32:
8055 case AMDGPU::S_CMP_EQ_F32:
8056 case AMDGPU::S_CMP_LE_F32:
8057 case AMDGPU::S_CMP_GT_F32:
8058 case AMDGPU::S_CMP_LG_F32:
8059 case AMDGPU::S_CMP_GE_F32:
8060 case AMDGPU::S_CMP_O_F32:
8061 case AMDGPU::S_CMP_U_F32:
8062 case AMDGPU::S_CMP_NGE_F32:
8063 case AMDGPU::S_CMP_NLG_F32:
8064 case AMDGPU::S_CMP_NGT_F32:
8065 case AMDGPU::S_CMP_NLE_F32:
8066 case AMDGPU::S_CMP_NEQ_F32:
8067 case AMDGPU::S_CMP_NLT_F32: {
8068 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8069 auto NewInstr =
8070 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8071 .setMIFlags(Inst.getFlags());
8072 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
8073 0) {
8074 NewInstr
8075 .addImm(0) // src0_modifiers
8076 .add(Inst.getOperand(0)) // src0
8077 .addImm(0) // src1_modifiers
8078 .add(Inst.getOperand(1)) // src1
8079 .addImm(0); // clamp
8080 } else {
8081 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
8082 }
8083 legalizeOperands(*NewInstr, MDT);
8084 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8085 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8086 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8087 Inst.eraseFromParent();
8088 return;
8089 }
8090 case AMDGPU::S_CMP_LT_F16:
8091 case AMDGPU::S_CMP_EQ_F16:
8092 case AMDGPU::S_CMP_LE_F16:
8093 case AMDGPU::S_CMP_GT_F16:
8094 case AMDGPU::S_CMP_LG_F16:
8095 case AMDGPU::S_CMP_GE_F16:
8096 case AMDGPU::S_CMP_O_F16:
8097 case AMDGPU::S_CMP_U_F16:
8098 case AMDGPU::S_CMP_NGE_F16:
8099 case AMDGPU::S_CMP_NLG_F16:
8100 case AMDGPU::S_CMP_NGT_F16:
8101 case AMDGPU::S_CMP_NLE_F16:
8102 case AMDGPU::S_CMP_NEQ_F16:
8103 case AMDGPU::S_CMP_NLT_F16: {
8104 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8105 auto NewInstr =
8106 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8107 .setMIFlags(Inst.getFlags());
8108 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
8109 NewInstr
8110 .addImm(0) // src0_modifiers
8111 .add(Inst.getOperand(0)) // src0
8112 .addImm(0) // src1_modifiers
8113 .add(Inst.getOperand(1)) // src1
8114 .addImm(0); // clamp
8115 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8116 NewInstr.addImm(0); // op_sel0
8117 } else {
8118 NewInstr
8119 .add(Inst.getOperand(0))
8120 .add(Inst.getOperand(1));
8121 }
8122 legalizeOperandsVALUt16(*NewInstr, MRI);
8123 legalizeOperands(*NewInstr, MDT);
8124 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8125 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8126 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8127 Inst.eraseFromParent();
8128 return;
8129 }
8130 case AMDGPU::S_CVT_HI_F32_F16: {
8131 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8132 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8133 if (ST.useRealTrue16Insts()) {
8134 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
8135 .add(Inst.getOperand(1));
8136 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8137 .addImm(0) // src0_modifiers
8138 .addReg(TmpReg, 0, AMDGPU::hi16)
8139 .addImm(0) // clamp
8140 .addImm(0) // omod
8141 .addImm(0); // op_sel0
8142 } else {
8143 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8144 .addImm(16)
8145 .add(Inst.getOperand(1));
8146 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8147 .addImm(0) // src0_modifiers
8148 .addReg(TmpReg)
8149 .addImm(0) // clamp
8150 .addImm(0); // omod
8151 }
8152
8153 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8154 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8155 Inst.eraseFromParent();
8156 return;
8157 }
8158 case AMDGPU::S_MINIMUM_F32:
8159 case AMDGPU::S_MAXIMUM_F32: {
8160 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8161 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8162 .addImm(0) // src0_modifiers
8163 .add(Inst.getOperand(1))
8164 .addImm(0) // src1_modifiers
8165 .add(Inst.getOperand(2))
8166 .addImm(0) // clamp
8167 .addImm(0); // omod
8168 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8169
8170 legalizeOperands(*NewInstr, MDT);
8171 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8172 Inst.eraseFromParent();
8173 return;
8174 }
8175 case AMDGPU::S_MINIMUM_F16:
8176 case AMDGPU::S_MAXIMUM_F16: {
8177 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8178 ? &AMDGPU::VGPR_16RegClass
8179 : &AMDGPU::VGPR_32RegClass);
8180 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8181 .addImm(0) // src0_modifiers
8182 .add(Inst.getOperand(1))
8183 .addImm(0) // src1_modifiers
8184 .add(Inst.getOperand(2))
8185 .addImm(0) // clamp
8186 .addImm(0) // omod
8187 .addImm(0); // opsel0
8188 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8189 legalizeOperandsVALUt16(*NewInstr, MRI);
8190 legalizeOperands(*NewInstr, MDT);
8191 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8192 Inst.eraseFromParent();
8193 return;
8194 }
8195 case AMDGPU::V_S_EXP_F16_e64:
8196 case AMDGPU::V_S_LOG_F16_e64:
8197 case AMDGPU::V_S_RCP_F16_e64:
8198 case AMDGPU::V_S_RSQ_F16_e64:
8199 case AMDGPU::V_S_SQRT_F16_e64: {
8200 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8201 ? &AMDGPU::VGPR_16RegClass
8202 : &AMDGPU::VGPR_32RegClass);
8203 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8204 .add(Inst.getOperand(1)) // src0_modifiers
8205 .add(Inst.getOperand(2))
8206 .add(Inst.getOperand(3)) // clamp
8207 .add(Inst.getOperand(4)) // omod
8208 .setMIFlags(Inst.getFlags());
8209 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8210 NewInstr.addImm(0); // opsel0
8211 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8212 legalizeOperandsVALUt16(*NewInstr, MRI);
8213 legalizeOperands(*NewInstr, MDT);
8214 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8215 Inst.eraseFromParent();
8216 return;
8217 }
8218 }
8219
8220 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8221 // We cannot move this instruction to the VALU, so we should try to
8222 // legalize its operands instead.
8223 legalizeOperands(Inst, MDT);
8224 return;
8225 }
8226 // Handle converting generic instructions like COPY-to-SGPR into
8227 // COPY-to-VGPR.
8228 if (NewOpcode == Opcode) {
8229 Register DstReg = Inst.getOperand(0).getReg();
8230 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8231
8232 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8233 // hope for the best.
8234 if (Inst.isCopy() && DstReg.isPhysical() &&
8235 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8236 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8237 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8238 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
8239 .add(Inst.getOperand(1));
8240 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
8241 DstReg)
8242 .addReg(NewDst);
8243
8244 Inst.eraseFromParent();
8245 return;
8246 }
8247
8248 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
8249 Register NewDstReg = Inst.getOperand(1).getReg();
8250 const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
8251 if (const TargetRegisterClass *CommonRC =
8252 RI.getCommonSubClass(NewDstRC, SrcRC)) {
 8253 // Instead of creating a copy where src and dst are the same register
 8254 // class, we just replace all uses of dst with src. These kinds of
 8255 // copies interfere with the heuristics MachineSink uses to decide
 8256 // whether or not to split a critical edge, since the pass assumes
 8257 // that copies will end up as machine instructions and not be
 8258 // eliminated.
8259 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8260 MRI.replaceRegWith(DstReg, NewDstReg);
8261 MRI.clearKillFlags(NewDstReg);
8262 Inst.getOperand(0).setReg(DstReg);
8263
8264 if (!MRI.constrainRegClass(NewDstReg, CommonRC))
8265 llvm_unreachable("failed to constrain register");
8266
8267 Inst.eraseFromParent();
8268 // Legalize t16 operand since replaceReg is called after addUsersToVALU
8269 for (MachineOperand &MO :
8270 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8271 legalizeOperandsVALUt16(*MO.getParent(), MRI);
8272 }
8273
8274 return;
8275 }
8276 }
8277
 8278 // If this is a v2s copy between a 16-bit and a 32-bit reg,
 8279 // replace the vgpr copy with a reg_sequence/extract_subreg.
 8280 // This can be removed once we have sgpr16 in place.
8281 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8282 Inst.getOperand(1).getReg().isVirtual() &&
8283 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8284 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8285 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8286 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8287 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8288 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8289 get(AMDGPU::IMPLICIT_DEF), Undef);
8290 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8291 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8292 .addReg(Inst.getOperand(1).getReg())
8293 .addImm(AMDGPU::lo16)
8294 .addReg(Undef)
8295 .addImm(AMDGPU::hi16);
8296 Inst.eraseFromParent();
8297 MRI.replaceRegWith(DstReg, NewDstReg);
8298 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8299 return;
8300 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8301 AMDGPU::lo16)) {
8302 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8303 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8304 MRI.replaceRegWith(DstReg, NewDstReg);
8305 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8306 return;
8307 }
8308 }
8309
8310 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8311 MRI.replaceRegWith(DstReg, NewDstReg);
8312 legalizeOperands(Inst, MDT);
8313 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8314 return;
8315 }
8316
8317 // Use the new VALU Opcode.
8318 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8319 .setMIFlags(Inst.getFlags());
8320 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8321 // Intersperse VOP3 modifiers among the SALU operands.
8322 NewInstr->addOperand(Inst.getOperand(0));
8323 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8324 AMDGPU::OpName::src0_modifiers) >= 0)
8325 NewInstr.addImm(0);
8326 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8327 const MachineOperand &Src = Inst.getOperand(1);
8328 NewInstr->addOperand(Src);
8329 }
8330
8331 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8332 // We are converting these to a BFE, so we need to add the missing
8333 // operands for the size and offset.
8334 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8335 NewInstr.addImm(0);
8336 NewInstr.addImm(Size);
8337 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8338 // The VALU version adds the second operand to the result, so insert an
8339 // extra 0 operand.
8340 NewInstr.addImm(0);
8341 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8342 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8343 // If we need to move this to VGPRs, we need to unpack the second
8344 // operand back into the 2 separate ones for bit offset and width.
8345 assert(OffsetWidthOp.isImm() &&
8346 "Scalar BFE is only implemented for constant width and offset");
8347 uint32_t Imm = OffsetWidthOp.getImm();
8348
8349 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8350 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8351 NewInstr.addImm(Offset);
8352 NewInstr.addImm(BitWidth);
8353 } else {
8354 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8355 AMDGPU::OpName::src1_modifiers) >= 0)
8356 NewInstr.addImm(0);
8357 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8358 NewInstr->addOperand(Inst.getOperand(2));
8359 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8360 AMDGPU::OpName::src2_modifiers) >= 0)
8361 NewInstr.addImm(0);
8362 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8363 NewInstr->addOperand(Inst.getOperand(3));
8364 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8365 NewInstr.addImm(0);
8366 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8367 NewInstr.addImm(0);
8368 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8369 NewInstr.addImm(0);
8370 }
8371 } else {
8372 // Just copy the SALU operands.
8373 for (const MachineOperand &Op : Inst.explicit_operands())
8374 NewInstr->addOperand(Op);
8375 }
8376
 8377 // Remove any references to SCC. Vector instructions can't read from it, and
 8378 // we're just about to add the implicit use / defs of VCC, and we don't want
 8379 // both.
8380 for (MachineOperand &Op : Inst.implicit_operands()) {
8381 if (Op.getReg() == AMDGPU::SCC) {
8382 // Only propagate through live-def of SCC.
8383 if (Op.isDef() && !Op.isDead())
8384 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8385 if (Op.isUse())
8386 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8387 }
8388 }
8389 Inst.eraseFromParent();
8390 Register NewDstReg;
8391 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8392 Register DstReg = NewInstr->getOperand(0).getReg();
8393 assert(DstReg.isVirtual());
8394 // Update the destination register class.
8395 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8396 assert(NewDstRC);
8397 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8398 MRI.replaceRegWith(DstReg, NewDstReg);
8399 }
8400 fixImplicitOperands(*NewInstr);
8401
8402 legalizeOperandsVALUt16(*NewInstr, MRI);
8403
8404 // Legalize the operands
8405 legalizeOperands(*NewInstr, MDT);
8406 if (NewDstReg)
8407 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8408}
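// Illustrative sketch (assumed opcodes, not verbatim): a typical conversion
// performed above turns a SALU op whose result is needed by VALU code into
// its VALU counterpart, e.g.
//   %r:sgpr_32 = S_AND_B32 %a, %b, implicit-def dead $scc
// becomes roughly
//   %r':vgpr_32 = V_AND_B32_e64 %a, %b
// with all users of the old result pushed onto the worklist for conversion.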
8409
8410// Add/sub require special handling to deal with carry outs.
8411std::pair<bool, MachineBasicBlock *>
8412SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8413 MachineDominatorTree *MDT) const {
8414 if (ST.hasAddNoCarry()) {
8415 // Assume there is no user of scc since we don't select this in that case.
8416 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8417 // is used.
8418
8419 MachineBasicBlock &MBB = *Inst.getParent();
8420 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8421
8422 Register OldDstReg = Inst.getOperand(0).getReg();
8423 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8424
8425 unsigned Opc = Inst.getOpcode();
8426 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8427
8428 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8429 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8430
8431 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8432 Inst.removeOperand(3);
8433
8434 Inst.setDesc(get(NewOpc));
8435 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8436 Inst.addImplicitDefUseOperands(*MBB.getParent());
8437 MRI.replaceRegWith(OldDstReg, ResultReg);
8438 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8439
8440 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8441 return std::pair(true, NewBB);
8442 }
8443
8444 return std::pair(false, nullptr);
8445}
8446
8447void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8448 MachineDominatorTree *MDT) const {
8449
8450 MachineBasicBlock &MBB = *Inst.getParent();
8451 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8452 MachineBasicBlock::iterator MII = Inst;
8453 DebugLoc DL = Inst.getDebugLoc();
8454
8455 MachineOperand &Dest = Inst.getOperand(0);
8456 MachineOperand &Src0 = Inst.getOperand(1);
8457 MachineOperand &Src1 = Inst.getOperand(2);
8458 MachineOperand &Cond = Inst.getOperand(3);
8459
8460 Register CondReg = Cond.getReg();
8461 bool IsSCC = (CondReg == AMDGPU::SCC);
8462
8463 // If this is a trivial select where the condition is effectively not SCC
8464 // (CondReg is a source of copy to SCC), then the select is semantically
8465 // equivalent to copying CondReg. Hence, there is no need to create
8466 // V_CNDMASK, we can just use that and bail out.
8467 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8468 (Src1.getImm() == 0)) {
8469 MRI.replaceRegWith(Dest.getReg(), CondReg);
8470 return;
8471 }
8472
8473 Register NewCondReg = CondReg;
8474 if (IsSCC) {
8475 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8476 NewCondReg = MRI.createVirtualRegister(TC);
8477
8478 // Now look for the closest SCC def; if it is a copy, replace CondReg
8479 // with the COPY's source register.
8480 bool CopyFound = false;
8481 for (MachineInstr &CandI :
8482 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
8483 Inst.getParent()->rend())) {
8484 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8485 -1) {
8486 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8487 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8488 .addReg(CandI.getOperand(1).getReg());
8489 CopyFound = true;
8490 }
8491 break;
8492 }
8493 }
8494 if (!CopyFound) {
8495 // SCC def is not a copy
8496 // Insert a trivial select instead of creating a copy, because a copy from
8497 // SCC would semantically mean just copying a single bit, but we may need
8498 // the result to be a vector condition mask that needs preserving.
8499 unsigned Opcode =
8500 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8501 auto NewSelect =
8502 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8503 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8504 }
8505 }
8506
8507 Register NewDestReg = MRI.createVirtualRegister(
8508 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8509 MachineInstr *NewInst;
8510 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8511 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8512 .addImm(0)
8513 .add(Src1) // False
8514 .addImm(0)
8515 .add(Src0) // True
8516 .addReg(NewCondReg);
8517 } else {
8518 NewInst =
8519 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8520 .add(Src1) // False
8521 .add(Src0) // True
8522 .addReg(NewCondReg);
8523 }
8524 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8525 legalizeOperands(*NewInst, MDT);
8526 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8527}
8528
8529void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8530 MachineInstr &Inst) const {
8531 MachineBasicBlock &MBB = *Inst.getParent();
8532 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8533 MachineBasicBlock::iterator MII = Inst;
8534 DebugLoc DL = Inst.getDebugLoc();
8535
8536 MachineOperand &Dest = Inst.getOperand(0);
8537 MachineOperand &Src = Inst.getOperand(1);
8538 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8539 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8540
8541 unsigned SubOp = ST.hasAddNoCarry() ?
8542 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8543
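// |x| is computed as max(x, 0 - x): negate Src into TmpReg, then take the
// signed maximum of the two values.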
8544 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8545 .addImm(0)
8546 .addReg(Src.getReg());
8547
8548 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8549 .addReg(Src.getReg())
8550 .addReg(TmpReg);
8551
8552 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8553 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8554}
8555
8556void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8557 MachineInstr &Inst) const {
8558 MachineBasicBlock &MBB = *Inst.getParent();
8559 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8560 MachineBasicBlock::iterator MII = Inst;
8561 const DebugLoc &DL = Inst.getDebugLoc();
8562
8563 MachineOperand &Dest = Inst.getOperand(0);
8564 MachineOperand &Src1 = Inst.getOperand(1);
8565 MachineOperand &Src2 = Inst.getOperand(2);
8566 Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8567 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8568 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8569
8570 unsigned SubOp =
8571 ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8572
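// |a - b| uses the same negate-and-max pattern: compute a - b, negate the
// difference, and take the signed maximum of the two.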
8573 BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
8574 .addReg(Src1.getReg())
8575 .addReg(Src2.getReg());
8576
8577 BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
8578
8579 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8580 .addReg(SubResultReg)
8581 .addReg(TmpReg);
8582
8583 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8584 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8585}
8586
8587void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8588 MachineInstr &Inst) const {
8589 MachineBasicBlock &MBB = *Inst.getParent();
8590 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8591 MachineBasicBlock::iterator MII = Inst;
8592 const DebugLoc &DL = Inst.getDebugLoc();
8593
8594 MachineOperand &Dest = Inst.getOperand(0);
8595 MachineOperand &Src0 = Inst.getOperand(1);
8596 MachineOperand &Src1 = Inst.getOperand(2);
8597
8598 if (ST.hasDLInsts()) {
8599 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8600 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8601 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8602
8603 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8604 .add(Src0)
8605 .add(Src1);
8606
8607 MRI.replaceRegWith(Dest.getReg(), NewDest);
8608 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8609 } else {
8610 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8611 // invert either source and then perform the XOR. If either source is a
8612 // scalar register, then we can leave the inversion on the scalar unit to
8613 // achieve a better distribution of scalar and vector instructions.
8614 bool Src0IsSGPR = Src0.isReg() &&
8615 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8616 bool Src1IsSGPR = Src1.isReg() &&
8617 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8618 MachineInstr *Xor;
8619 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8620 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8621
8622 // Build a pair of scalar instructions and add them to the work list.
8623 // The next iteration over the work list will lower these to the vector
8624 // unit as necessary.
8625 if (Src0IsSGPR) {
8626 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8627 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8628 .addReg(Temp)
8629 .add(Src1);
8630 } else if (Src1IsSGPR) {
8631 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8632 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8633 .add(Src0)
8634 .addReg(Temp);
8635 } else {
8636 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8637 .add(Src0)
8638 .add(Src1);
8639 MachineInstr *Not =
8640 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8641 Worklist.insert(Not);
8642 }
8643
8644 MRI.replaceRegWith(Dest.getReg(), NewDest);
8645
8646 Worklist.insert(Xor);
8647
8648 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8649 }
8650}
8651
8652void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8653 MachineInstr &Inst,
8654 unsigned Opcode) const {
8655 MachineBasicBlock &MBB = *Inst.getParent();
8656 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8657 MachineBasicBlock::iterator MII = Inst;
8658 const DebugLoc &DL = Inst.getDebugLoc();
8659
8660 MachineOperand &Dest = Inst.getOperand(0);
8661 MachineOperand &Src0 = Inst.getOperand(1);
8662 MachineOperand &Src1 = Inst.getOperand(2);
8663
8664 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8665 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8666
8667 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8668 .add(Src0)
8669 .add(Src1);
8670
8671 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8672 .addReg(Interm);
8673
8674 Worklist.insert(&Op);
8675 Worklist.insert(&Not);
8676
8677 MRI.replaceRegWith(Dest.getReg(), NewDest);
8678 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8679}
8680
8681void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8682 MachineInstr &Inst,
8683 unsigned Opcode) const {
8684 MachineBasicBlock &MBB = *Inst.getParent();
8685 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8686 MachineBasicBlock::iterator MII = Inst;
8687 const DebugLoc &DL = Inst.getDebugLoc();
8688
8689 MachineOperand &Dest = Inst.getOperand(0);
8690 MachineOperand &Src0 = Inst.getOperand(1);
8691 MachineOperand &Src1 = Inst.getOperand(2);
8692
8693 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8694 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8695
8696 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8697 .add(Src1);
8698
8699 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8700 .add(Src0)
8701 .addReg(Interm);
8702
8703 Worklist.insert(&Not);
8704 Worklist.insert(&Op);
8705
8706 MRI.replaceRegWith(Dest.getReg(), NewDest);
8707 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8708}
8709
8710void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8711 MachineInstr &Inst, unsigned Opcode,
8712 bool Swap) const {
8713 MachineBasicBlock &MBB = *Inst.getParent();
8714 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8715
8716 MachineOperand &Dest = Inst.getOperand(0);
8717 MachineOperand &Src0 = Inst.getOperand(1);
8718 DebugLoc DL = Inst.getDebugLoc();
8719
8720 MachineBasicBlock::iterator MII = Inst;
8721
8722 const MCInstrDesc &InstDesc = get(Opcode);
8723 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8724 MRI.getRegClass(Src0.getReg()) :
8725 &AMDGPU::SGPR_32RegClass;
8726
8727 const TargetRegisterClass *Src0SubRC =
8728 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8729
8730 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8731 AMDGPU::sub0, Src0SubRC);
8732
8733 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8734 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8735 const TargetRegisterClass *NewDestSubRC =
8736 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8737
8738 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8739 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8740
8741 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8742 AMDGPU::sub1, Src0SubRC);
8743
8744 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8745 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8746
8747 if (Swap)
8748 std::swap(DestSub0, DestSub1);
8749
8750 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8751 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8752 .addReg(DestSub0)
8753 .addImm(AMDGPU::sub0)
8754 .addReg(DestSub1)
8755 .addImm(AMDGPU::sub1);
8756
8757 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8758
8759 Worklist.insert(&LoHalf);
8760 Worklist.insert(&HiHalf);
8761
8762 // We don't need to legalizeOperands here because for a single operand, src0
8763 // will support any kind of input.
8764
8765 // Move all users of this moved value.
8766 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8767}
8768
8769 // There is no vector equivalent of s_mul_u64. For this reason, we need to
8770 // split the s_mul_u64 into 32-bit vector multiplications.
8771void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8772 MachineInstr &Inst,
8773 MachineDominatorTree *MDT) const {
8774 MachineBasicBlock &MBB = *Inst.getParent();
8775 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8776
8777 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8778 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8779 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8780
8781 MachineOperand &Dest = Inst.getOperand(0);
8782 MachineOperand &Src0 = Inst.getOperand(1);
8783 MachineOperand &Src1 = Inst.getOperand(2);
8784 const DebugLoc &DL = Inst.getDebugLoc();
8785 MachineBasicBlock::iterator MII = Inst;
8786
8787 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8788 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8789 const TargetRegisterClass *Src0SubRC =
8790 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8791 if (RI.isSGPRClass(Src0SubRC))
8792 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8793 const TargetRegisterClass *Src1SubRC =
8794 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8795 if (RI.isSGPRClass(Src1SubRC))
8796 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8797
8798 // First, we extract the low 32-bit and high 32-bit values from each of the
8799 // operands.
8800 MachineOperand Op0L =
8801 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8802 MachineOperand Op1L =
8803 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8804 MachineOperand Op0H =
8805 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8806 MachineOperand Op1H =
8807 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8808
8809 // The multiplication is done as follows:
8810 //
8811 // Op1H Op1L
8812 // * Op0H Op0L
8813 // --------------------
8814 // Op1H*Op0L Op1L*Op0L
8815 // + Op1H*Op0H Op1L*Op0H
8816 // -----------------------------------------
8817 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8818 //
8819 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
8820 // value and that would overflow.
8821 // The low 32-bit value is Op1L*Op0L.
8822 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
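// For example, with Op0 = 0x0000000200000003 (Op0H = 2, Op0L = 3) and
// Op1 = 0x0000000400000005 (Op1H = 4, Op1L = 5), the low word is
// Op1L*Op0L = 15 and the high word is Op1H*Op0L + Op1L*Op0H + carry =
// 12 + 10 + 0 = 22, i.e. the truncated 64-bit product 0x000000160000000F.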
8823
8824 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8825 MachineInstr *Op1L_Op0H =
8826 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8827 .add(Op1L)
8828 .add(Op0H);
8829
8830 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8831 MachineInstr *Op1H_Op0L =
8832 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8833 .add(Op1H)
8834 .add(Op0L);
8835
8836 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8837 MachineInstr *Carry =
8838 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8839 .add(Op1L)
8840 .add(Op0L);
8841
8842 MachineInstr *LoHalf =
8843 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8844 .add(Op1L)
8845 .add(Op0L);
8846
8847 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8848 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8849 .addReg(Op1L_Op0H_Reg)
8850 .addReg(Op1H_Op0L_Reg);
8851
8852 MachineInstr *HiHalf =
8853 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8854 .addReg(AddReg)
8855 .addReg(CarryReg);
8856
8857 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8858 .addReg(DestSub0)
8859 .addImm(AMDGPU::sub0)
8860 .addReg(DestSub1)
8861 .addImm(AMDGPU::sub1);
8862
8863 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8864
8865 // Try to legalize the operands in case we need to swap the order to keep it
8866 // valid.
8867 legalizeOperands(*Op1L_Op0H, MDT);
8868 legalizeOperands(*Op1H_Op0L, MDT);
8869 legalizeOperands(*Carry, MDT);
8870 legalizeOperands(*LoHalf, MDT);
8871 legalizeOperands(*Add, MDT);
8872 legalizeOperands(*HiHalf, MDT);
8873
8874 // Move all users of this moved value.
8875 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8876}
8877
8878 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8879 // multiplications.
8880void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8881 MachineInstr &Inst,
8882 MachineDominatorTree *MDT) const {
8883 MachineBasicBlock &MBB = *Inst.getParent();
8884 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8885
8886 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8887 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8888 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8889
8890 MachineOperand &Dest = Inst.getOperand(0);
8891 MachineOperand &Src0 = Inst.getOperand(1);
8892 MachineOperand &Src1 = Inst.getOperand(2);
8893 const DebugLoc &DL = Inst.getDebugLoc();
8894 MachineBasicBlock::iterator MII = Inst;
8895
8896 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8897 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8898 const TargetRegisterClass *Src0SubRC =
8899 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8900 if (RI.isSGPRClass(Src0SubRC))
8901 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8902 const TargetRegisterClass *Src1SubRC =
8903 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8904 if (RI.isSGPRClass(Src1SubRC))
8905 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8906
8907 // First, we extract the low 32-bit and high 32-bit values from each of the
8908 // operands.
8909 MachineOperand Op0L =
8910 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8911 MachineOperand Op1L =
8912 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8913
8914 unsigned Opc = Inst.getOpcode();
8915 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8916 ? AMDGPU::V_MUL_HI_U32_e64
8917 : AMDGPU::V_MUL_HI_I32_e64;
8918 MachineInstr *HiHalf =
8919 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8920
8921 MachineInstr *LoHalf =
8922 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8923 .add(Op1L)
8924 .add(Op0L);
8925
8926 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8927 .addReg(DestSub0)
8928 .addImm(AMDGPU::sub0)
8929 .addReg(DestSub1)
8930 .addImm(AMDGPU::sub1);
8931
8932 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8933
8934 // Try to legalize the operands in case we need to swap the order to keep it
8935 // valid.
8936 legalizeOperands(*HiHalf, MDT);
8937 legalizeOperands(*LoHalf, MDT);
8938
8939 // Move all users of this moved value.
8940 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8941}
8942
8943void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8944 MachineInstr &Inst, unsigned Opcode,
8945 MachineDominatorTree *MDT) const {
8946 MachineBasicBlock &MBB = *Inst.getParent();
8947 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8948
8949 MachineOperand &Dest = Inst.getOperand(0);
8950 MachineOperand &Src0 = Inst.getOperand(1);
8951 MachineOperand &Src1 = Inst.getOperand(2);
8952 DebugLoc DL = Inst.getDebugLoc();
8953
8954 MachineBasicBlock::iterator MII = Inst;
8955
8956 const MCInstrDesc &InstDesc = get(Opcode);
8957 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8958 MRI.getRegClass(Src0.getReg()) :
8959 &AMDGPU::SGPR_32RegClass;
8960
8961 const TargetRegisterClass *Src0SubRC =
8962 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8963 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8964 MRI.getRegClass(Src1.getReg()) :
8965 &AMDGPU::SGPR_32RegClass;
8966
8967 const TargetRegisterClass *Src1SubRC =
8968 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8969
8970 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8971 AMDGPU::sub0, Src0SubRC);
8972 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8973 AMDGPU::sub0, Src1SubRC);
8974 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8975 AMDGPU::sub1, Src0SubRC);
8976 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8977 AMDGPU::sub1, Src1SubRC);
8978
8979 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8980 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8981 const TargetRegisterClass *NewDestSubRC =
8982 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8983
8984 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8985 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8986 .add(SrcReg0Sub0)
8987 .add(SrcReg1Sub0);
8988
8989 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8990 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8991 .add(SrcReg0Sub1)
8992 .add(SrcReg1Sub1);
8993
8994 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8995 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8996 .addReg(DestSub0)
8997 .addImm(AMDGPU::sub0)
8998 .addReg(DestSub1)
8999 .addImm(AMDGPU::sub1);
9000
9001 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9002
9003 Worklist.insert(&LoHalf);
9004 Worklist.insert(&HiHalf);
9005
9006 // Move all users of this moved value.
9007 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9008}
9009
9010void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
9011 MachineInstr &Inst,
9012 MachineDominatorTree *MDT) const {
9013 MachineBasicBlock &MBB = *Inst.getParent();
9014 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9015
9016 MachineOperand &Dest = Inst.getOperand(0);
9017 MachineOperand &Src0 = Inst.getOperand(1);
9018 MachineOperand &Src1 = Inst.getOperand(2);
9019 const DebugLoc &DL = Inst.getDebugLoc();
9020
9021 MachineBasicBlock::iterator MII = Inst;
9022
9023 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9024
9025 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
9026
9027 MachineOperand* Op0;
9028 MachineOperand* Op1;
9029
9030 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
9031 Op0 = &Src0;
9032 Op1 = &Src1;
9033 } else {
9034 Op0 = &Src1;
9035 Op1 = &Src0;
9036 }
9037
9038 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
9039 .add(*Op0);
9040
9041 Register NewDest = MRI.createVirtualRegister(DestRC);
9042
9043 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
9044 .addReg(Interm)
9045 .add(*Op1);
9046
9047 MRI.replaceRegWith(Dest.getReg(), NewDest);
9048
9049 Worklist.insert(&Xor);
9050}
9051
9052void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
9053 MachineInstr &Inst) const {
9054 MachineBasicBlock &MBB = *Inst.getParent();
9055 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9056
9057 MachineBasicBlock::iterator MII = Inst;
9058 const DebugLoc &DL = Inst.getDebugLoc();
9059
9060 MachineOperand &Dest = Inst.getOperand(0);
9061 MachineOperand &Src = Inst.getOperand(1);
9062
9063 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
9064 const TargetRegisterClass *SrcRC = Src.isReg() ?
9065 MRI.getRegClass(Src.getReg()) :
9066 &AMDGPU::SGPR_32RegClass;
9067
9068 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9069 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9070
9071 const TargetRegisterClass *SrcSubRC =
9072 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9073
9074 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9075 AMDGPU::sub0, SrcSubRC);
9076 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9077 AMDGPU::sub1, SrcSubRC);
9078
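// V_BCNT_U32_B32 computes popcount(src0) + src1, so the second instruction
// adds the high half's count onto the low half's count held in MidReg.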
9079 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
9080
9081 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
9082
9083 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9084
9085 // We don't need to legalize operands here. src0 for either instruction can be
9086 // an SGPR, and the second input is unused or determined here.
9087 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9088}
9089
9090void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
9091 MachineInstr &Inst) const {
9092 MachineBasicBlock &MBB = *Inst.getParent();
9093 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9094 MachineBasicBlock::iterator MII = Inst;
9095 const DebugLoc &DL = Inst.getDebugLoc();
9096
9097 MachineOperand &Dest = Inst.getOperand(0);
9098 uint32_t Imm = Inst.getOperand(2).getImm();
9099 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
9100 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
9101
9102 (void) Offset;
9103
9104 // Only sext_inreg cases handled.
9105 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
9106 Offset == 0 && "Not implemented");
9107
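// For BitWidth < 32, sign-extend the low half with V_BFE_I32 and form the high
// half by arithmetic-shifting its sign bit across 32 bits; for BitWidth == 32
// the low half is kept as-is and only the high half needs the shift.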
9108 if (BitWidth < 32) {
9109 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9110 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9111 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9112
9113 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
9114 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
9115 .addImm(0)
9116 .addImm(BitWidth);
9117
9118 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
9119 .addImm(31)
9120 .addReg(MidRegLo);
9121
9122 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9123 .addReg(MidRegLo)
9124 .addImm(AMDGPU::sub0)
9125 .addReg(MidRegHi)
9126 .addImm(AMDGPU::sub1);
9127
9128 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9129 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9130 return;
9131 }
9132
9133 MachineOperand &Src = Inst.getOperand(1);
9134 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9135 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9136
9137 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
9138 .addImm(31)
9139 .addReg(Src.getReg(), 0, AMDGPU::sub0);
9140
9141 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9142 .addReg(Src.getReg(), 0, AMDGPU::sub0)
9143 .addImm(AMDGPU::sub0)
9144 .addReg(TmpReg)
9145 .addImm(AMDGPU::sub1);
9146
9147 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9148 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9149}
9150
9151void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
9152 MachineInstr &Inst, unsigned Opcode,
9153 MachineDominatorTree *MDT) const {
9154 // (S_FLBIT_I32_B64 hi:lo) ->
9155 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
9156 // (S_FF1_I32_B64 hi:lo) ->
9157 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
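// The count of the half that only matters when the other half is all zeros is
// biased by 32; the clamped add keeps the -1 returned for a zero input
// saturated, so the final V_MIN_U32 picks the correct result.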
9158
9159 MachineBasicBlock &MBB = *Inst.getParent();
9160 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9161 MachineBasicBlock::iterator MII = Inst;
9162 const DebugLoc &DL = Inst.getDebugLoc();
9163
9164 MachineOperand &Dest = Inst.getOperand(0);
9165 MachineOperand &Src = Inst.getOperand(1);
9166
9167 const MCInstrDesc &InstDesc = get(Opcode);
9168
9169 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9170 unsigned OpcodeAdd =
9171 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
9172
9173 const TargetRegisterClass *SrcRC =
9174 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9175 const TargetRegisterClass *SrcSubRC =
9176 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9177
9178 MachineOperand SrcRegSub0 =
9179 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
9180 MachineOperand SrcRegSub1 =
9181 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
9182
9183 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9184 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9185 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9186 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9187
9188 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
9189
9190 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
9191
9192 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
9193 .addReg(IsCtlz ? MidReg1 : MidReg2)
9194 .addImm(32)
9195 .addImm(1); // enable clamp
9196
9197 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9198 .addReg(MidReg3)
9199 .addReg(IsCtlz ? MidReg2 : MidReg1);
9200
9201 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9202
9203 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9204}
9205
9206void SIInstrInfo::addUsersToMoveToVALUWorklist(
9207 Register DstReg, MachineRegisterInfo &MRI,
9208 SIInstrWorklist &Worklist) const {
9209 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9210 MachineInstr &UseMI = *MO.getParent();
9211
9212 unsigned OpNo = 0;
9213
9214 switch (UseMI.getOpcode()) {
9215 case AMDGPU::COPY:
9216 case AMDGPU::WQM:
9217 case AMDGPU::SOFT_WQM:
9218 case AMDGPU::STRICT_WWM:
9219 case AMDGPU::STRICT_WQM:
9220 case AMDGPU::REG_SEQUENCE:
9221 case AMDGPU::PHI:
9222 case AMDGPU::INSERT_SUBREG:
9223 break;
9224 default:
9225 OpNo = MO.getOperandNo();
9226 break;
9227 }
9228
9229 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9230 MRI.constrainRegClass(DstReg, OpRC);
9231
9232 if (!RI.hasVectorRegisters(OpRC))
9233 Worklist.insert(&UseMI);
9234 else
9235 // Legalization could change user list.
9236 legalizeOperandsVALUt16(UseMI, OpNo, MRI);
9237 }
9238}
9239
9240void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9241 MachineRegisterInfo &MRI,
9242 MachineInstr &Inst) const {
9243 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9244 MachineBasicBlock *MBB = Inst.getParent();
9245 MachineOperand &Src0 = Inst.getOperand(1);
9246 MachineOperand &Src1 = Inst.getOperand(2);
9247 const DebugLoc &DL = Inst.getDebugLoc();
9248
9249 if (ST.useRealTrue16Insts()) {
9250 Register SrcReg0, SrcReg1;
9251 if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
9252 SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9253 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg0).add(Src0);
9254 } else {
9255 SrcReg0 = Src0.getReg();
9256 }
9257
9258 if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
9259 SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9260 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg1).add(Src1);
9261 } else {
9262 SrcReg1 = Src1.getReg();
9263 }
9264
9265 bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
9266 bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
9267
9268 auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
9269 switch (Inst.getOpcode()) {
9270 case AMDGPU::S_PACK_LL_B32_B16:
9271 NewMI
9272 .addReg(SrcReg0, 0,
9273 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9274 .addImm(AMDGPU::lo16)
9275 .addReg(SrcReg1, 0,
9276 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9277 .addImm(AMDGPU::hi16);
9278 break;
9279 case AMDGPU::S_PACK_LH_B32_B16:
9280 NewMI
9281 .addReg(SrcReg0, 0,
9282 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9283 .addImm(AMDGPU::lo16)
9284 .addReg(SrcReg1, 0, AMDGPU::hi16)
9285 .addImm(AMDGPU::hi16);
9286 break;
9287 case AMDGPU::S_PACK_HL_B32_B16:
9288 NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
9289 .addImm(AMDGPU::lo16)
9290 .addReg(SrcReg1, 0,
9291 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9292 .addImm(AMDGPU::hi16);
9293 break;
9294 case AMDGPU::S_PACK_HH_B32_B16:
9295 NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
9296 .addImm(AMDGPU::lo16)
9297 .addReg(SrcReg1, 0, AMDGPU::hi16)
9298 .addImm(AMDGPU::hi16);
9299 break;
9300 default:
9301 llvm_unreachable("unhandled s_pack_* instruction");
9302 }
9303
9304 MachineOperand &Dest = Inst.getOperand(0);
9305 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9306 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9307 return;
9308 }
9309
9310 switch (Inst.getOpcode()) {
9311 case AMDGPU::S_PACK_LL_B32_B16: {
9312 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9313 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9314
9315 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9316 // 0.
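// S_PACK_LL_B32_B16: result = (Src1 << 16) | (Src0 & 0xffff); mask Src0's low
// half, then shift-or in Src1.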
9317 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9318 .addImm(0xffff);
9319
9320 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9321 .addReg(ImmReg, RegState::Kill)
9322 .add(Src0);
9323
9324 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9325 .add(Src1)
9326 .addImm(16)
9327 .addReg(TmpReg, RegState::Kill);
9328 break;
9329 }
9330 case AMDGPU::S_PACK_LH_B32_B16: {
9331 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9332 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9333 .addImm(0xffff);
9334 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9335 .addReg(ImmReg, RegState::Kill)
9336 .add(Src0)
9337 .add(Src1);
9338 break;
9339 }
9340 case AMDGPU::S_PACK_HL_B32_B16: {
9341 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9342 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9343 .addImm(16)
9344 .add(Src0);
9345 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9346 .add(Src1)
9347 .addImm(16)
9348 .addReg(TmpReg, RegState::Kill);
9349 break;
9350 }
9351 case AMDGPU::S_PACK_HH_B32_B16: {
9352 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9353 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9354 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9355 .addImm(16)
9356 .add(Src0);
9357 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9358 .addImm(0xffff0000);
9359 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9360 .add(Src1)
9361 .addReg(ImmReg, RegState::Kill)
9362 .addReg(TmpReg, RegState::Kill);
9363 break;
9364 }
9365 default:
9366 llvm_unreachable("unhandled s_pack_* instruction");
9367 }
9368
9369 MachineOperand &Dest = Inst.getOperand(0);
9370 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9371 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9372}
9373
9374void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
9375 MachineInstr &SCCDefInst,
9376 SIInstrWorklist &Worklist,
9377 Register NewCond) const {
9378
9379 // Ensure that def inst defines SCC, which is still live.
9380 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9381 !Op.isDead() && Op.getParent() == &SCCDefInst);
9382 SmallVector<MachineInstr *, 4> CopyToDelete;
9383 // This assumes that all the users of SCC are in the same block
9384 // as the SCC def.
9385 for (MachineInstr &MI : // Skip the def inst itself.
9386 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9387 SCCDefInst.getParent()->end())) {
9388 // Check if SCC is used first.
9389 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9390 if (SCCIdx != -1) {
9391 if (MI.isCopy()) {
9392 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9393 Register DestReg = MI.getOperand(0).getReg();
9394
9395 MRI.replaceRegWith(DestReg, NewCond);
9396 CopyToDelete.push_back(&MI);
9397 } else {
9398
9399 if (NewCond.isValid())
9400 MI.getOperand(SCCIdx).setReg(NewCond);
9401
9402 Worklist.insert(&MI);
9403 }
9404 }
9405 // Exit if we find another SCC def.
9406 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9407 break;
9408 }
9409 for (auto &Copy : CopyToDelete)
9410 Copy->eraseFromParent();
9411}
9412
9413// Instructions that use SCC may be converted to VALU instructions. When that
9414// happens, the SCC register is changed to VCC_LO. The instruction that defines
9415// SCC must be changed to an instruction that defines VCC. This function makes
9416// sure that the instruction that defines SCC is added to the moveToVALU
9417// worklist.
9418void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9419 SIInstrWorklist &Worklist) const {
9420 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9421 // then there is nothing to do because the defining instruction has been
9422 // converted to a VALU already. If SCC then that instruction needs to be
9423 // converted to a VALU.
9424 for (MachineInstr &MI :
9425 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9426 SCCUseInst->getParent()->rend())) {
9427 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9428 break;
9429 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9430 Worklist.insert(&MI);
9431 break;
9432 }
9433 }
9434}
9435
9436const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9437 const MachineInstr &Inst) const {
9438 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9439
9440 switch (Inst.getOpcode()) {
9441 // For target instructions, getOpRegClass just returns the virtual register
9442 // class associated with the operand, so we need to find an equivalent VGPR
9443 // register class in order to move the instruction to the VALU.
9444 case AMDGPU::COPY:
9445 case AMDGPU::PHI:
9446 case AMDGPU::REG_SEQUENCE:
9447 case AMDGPU::INSERT_SUBREG:
9448 case AMDGPU::WQM:
9449 case AMDGPU::SOFT_WQM:
9450 case AMDGPU::STRICT_WWM:
9451 case AMDGPU::STRICT_WQM: {
9452 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9453 if (RI.isAGPRClass(SrcRC)) {
9454 if (RI.isAGPRClass(NewDstRC))
9455 return nullptr;
9456
9457 switch (Inst.getOpcode()) {
9458 case AMDGPU::PHI:
9459 case AMDGPU::REG_SEQUENCE:
9460 case AMDGPU::INSERT_SUBREG:
9461 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9462 break;
9463 default:
9464 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9465 }
9466
9467 if (!NewDstRC)
9468 return nullptr;
9469 } else {
9470 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9471 return nullptr;
9472
9473 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9474 if (!NewDstRC)
9475 return nullptr;
9476 }
9477
9478 return NewDstRC;
9479 }
9480 default:
9481 return NewDstRC;
9482 }
9483}
9484
9485// Find the one SGPR operand we are allowed to use.
9486Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9487 int OpIndices[3]) const {
9488 const MCInstrDesc &Desc = MI.getDesc();
9489
9490 // Find the one SGPR operand we are allowed to use.
9491 //
9492 // First we need to consider the instruction's operand requirements before
9493 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9494 // of VCC, but we are still bound by the constant bus requirement to only use
9495 // one.
9496 //
9497 // If the operand's class is an SGPR, we can never move it.
9498
9499 Register SGPRReg = findImplicitSGPRRead(MI);
9500 if (SGPRReg)
9501 return SGPRReg;
9502
9503 Register UsedSGPRs[3] = {Register()};
9504 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9505
9506 for (unsigned i = 0; i < 3; ++i) {
9507 int Idx = OpIndices[i];
9508 if (Idx == -1)
9509 break;
9510
9511 const MachineOperand &MO = MI.getOperand(Idx);
9512 if (!MO.isReg())
9513 continue;
9514
9515 // Is this operand statically required to be an SGPR based on the operand
9516 // constraints?
9517 const TargetRegisterClass *OpRC =
9518 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9519 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9520 if (IsRequiredSGPR)
9521 return MO.getReg();
9522
9523 // If this could be a VGPR or an SGPR, check the dynamic register class.
9524 Register Reg = MO.getReg();
9525 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9526 if (RI.isSGPRClass(RegRC))
9527 UsedSGPRs[i] = Reg;
9528 }
9529
9530 // We don't have a required SGPR operand, so we have a bit more freedom in
9531 // selecting operands to move.
9532
9533 // Try to select the most used SGPR. If an SGPR is equal to one of the
9534 // others, we choose that.
9535 //
9536 // e.g.
9537 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9538 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9539
9540 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9541 // prefer those.
9542
9543 if (UsedSGPRs[0]) {
9544 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9545 SGPRReg = UsedSGPRs[0];
9546 }
9547
9548 if (!SGPRReg && UsedSGPRs[1]) {
9549 if (UsedSGPRs[1] == UsedSGPRs[2])
9550 SGPRReg = UsedSGPRs[1];
9551 }
9552
9553 return SGPRReg;
9554}
9555
9556 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9557 AMDGPU::OpName OperandName) const {
9558 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9559 return nullptr;
9560
9561 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9562 if (Idx == -1)
9563 return nullptr;
9564
9565 return &MI.getOperand(Idx);
9566}
9567
9568 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9569 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9570 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9571 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9572 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9573 return (Format << 44) |
9574 (1ULL << 56) | // RESOURCE_LEVEL = 1
9575 (3ULL << 60); // OOB_SELECT = 3
9576 }
9577
9578 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9579 if (ST.isAmdHsaOS()) {
9580 // Set ATC = 1. GFX9 doesn't have this bit.
9581 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9582 RsrcDataFormat |= (1ULL << 56);
9583
9584 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9585 // BTW, it disables TC L2 and therefore decreases performance.
9586 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9587 RsrcDataFormat |= (2ULL << 59);
9588 }
9589
9590 return RsrcDataFormat;
9591}
9592
9593 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
9594 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
9595 AMDGPU::RSRC_TID_ENABLE |
9596 0xffffffff; // Size;
9597
9598 // GFX9 doesn't have ELEMENT_SIZE.
9599 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9600 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9601 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9602 }
9603
9604 // IndexStride = 64 (wave64) or 32 (wave32).
9605 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9606 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9607
9608 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9609 // Clear them unless we want a huge stride.
9610 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9611 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9612 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9613
9614 return Rsrc23;
9615}
9616
9617 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
9618 unsigned Opc = MI.getOpcode();
9619
9620 return isSMRD(Opc);
9621}
9622
9623 bool SIInstrInfo::isHighLatencyDef(int Opc) const {
9624 return get(Opc).mayLoad() &&
9625 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9626}
9627
9628 Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
9629 int &FrameIndex) const {
9630 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9631 if (!Addr || !Addr->isFI())
9632 return Register();
9633
9634 assert(!MI.memoperands_empty() &&
9635 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9636
9637 FrameIndex = Addr->getIndex();
9638 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
9639}
9640
9641 Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
9642 int &FrameIndex) const {
9643 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9644 assert(Addr && Addr->isFI());
9645 FrameIndex = Addr->getIndex();
9646 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
9647}
9648
9649 Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9650 int &FrameIndex) const {
9651 if (!MI.mayLoad())
9652 return Register();
9653
9654 if (isMUBUF(MI) || isVGPRSpill(MI))
9655 return isStackAccess(MI, FrameIndex);
9656
9657 if (isSGPRSpill(MI))
9658 return isSGPRStackAccess(MI, FrameIndex);
9659
9660 return Register();
9661}
9662
9663 Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9664 int &FrameIndex) const {
9665 if (!MI.mayStore())
9666 return Register();
9667
9668 if (isMUBUF(MI) || isVGPRSpill(MI))
9669 return isStackAccess(MI, FrameIndex);
9670
9671 if (isSGPRSpill(MI))
9672 return isSGPRStackAccess(MI, FrameIndex);
9673
9674 return Register();
9675}
9676
9677 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9678 unsigned Size = 0;
9679 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9680 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9681 while (++I != E && I->isInsideBundle()) {
9682 assert(!I->isBundle() && "No nested bundle!");
9683 Size += getInstSizeInBytes(*I);
9684 }
9685
9686 return Size;
9687}
9688
9689 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9690 unsigned Opc = MI.getOpcode();
9691 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
9692 unsigned DescSize = Desc.getSize();
9693
9694 // If we have a definitive size, we can use it. Otherwise we need to inspect
9695 // the operands to know the size.
9696 if (isFixedSize(MI)) {
9697 unsigned Size = DescSize;
9698
9699 // If we hit the buggy offset, an extra nop will be inserted in MC so
9700 // estimate the worst case.
9701 if (MI.isBranch() && ST.hasOffset3fBug())
9702 Size += 4;
9703
9704 return Size;
9705 }
9706
9707 // Instructions may have a 32-bit literal encoded after them. Check
9708 // operands that could ever be literals.
9709 if (isVALU(MI) || isSALU(MI)) {
9710 if (isDPP(MI))
9711 return DescSize;
9712 bool HasLiteral = false;
9713 unsigned LiteralSize = 4;
9714 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9715 const MachineOperand &Op = MI.getOperand(I);
9716 const MCOperandInfo &OpInfo = Desc.operands()[I];
9717 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9718 HasLiteral = true;
9719 if (ST.has64BitLiterals()) {
9720 switch (OpInfo.OperandType) {
9721 default:
9722 break;
9723 case AMDGPU::OPERAND_REG_IMM_FP64:
9724 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9725 LiteralSize = 8;
9726 break;
9727 case AMDGPU::OPERAND_REG_IMM_INT64:
9728 if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
9729 LiteralSize = 8;
9730 break;
9731 }
9732 }
9733 break;
9734 }
9735 }
9736 return HasLiteral ? DescSize + LiteralSize : DescSize;
9737 }
9738
9739 // Check whether we have extra NSA words.
9740 if (isMIMG(MI)) {
9741 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9742 if (VAddr0Idx < 0)
9743 return 8;
9744
9745 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9746 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9747 }
9748
9749 switch (Opc) {
9750 case TargetOpcode::BUNDLE:
9751 return getInstBundleSize(MI);
9752 case TargetOpcode::INLINEASM:
9753 case TargetOpcode::INLINEASM_BR: {
9754 const MachineFunction *MF = MI.getMF();
9755 const char *AsmStr = MI.getOperand(0).getSymbolName();
9756 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9757 }
9758 default:
9759 if (MI.isMetaInstruction())
9760 return 0;
9761
9762 // If D16 Pseudo inst, get correct MC code size
9763 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9764 if (D16Info) {
9765 // Assume the d16_lo/hi variants are always the same size.
9766 unsigned LoInstOpcode = D16Info->LoOp;
9767 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9768 DescSize = Desc.getSize();
9769 }
9770
9771 // If FMA Pseudo inst, get correct MC code size
9772 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9773 // All potential lowerings are the same size; arbitrarily pick one.
9774 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
9775 DescSize = Desc.getSize();
9776 }
9777
9778 return DescSize;
9779 }
9780}
9781
9782 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
9783 if (!isFLAT(MI))
9784 return false;
9785
9786 if (MI.memoperands_empty())
9787 return true;
9788
9789 for (const MachineMemOperand *MMO : MI.memoperands()) {
9790 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9791 return true;
9792 }
9793 return false;
9794}
9795
9796 ArrayRef<std::pair<int, const char *>>
9797 SIInstrInfo::getSerializableTargetIndices() const {
9798 static const std::pair<int, const char *> TargetIndices[] = {
9799 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9800 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9801 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9802 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9803 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9804 return ArrayRef(TargetIndices);
9805}
9806
9807/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
9808/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9809 ScheduleHazardRecognizer *
9810 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
9811 const ScheduleDAG *DAG) const {
9812 return new GCNHazardRecognizer(DAG->MF);
9813 }
9814
9815/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9816/// pass.
9817 ScheduleHazardRecognizer *
9818 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
9819 return new GCNHazardRecognizer(MF);
9820 }
9821
9822// Called during:
9823// - pre-RA scheduling and post-RA scheduling
9824 ScheduleHazardRecognizer *
9825 SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
9826 const ScheduleDAGMI *DAG) const {
9827 // Borrowed from Arm Target
9828 // We would like to restrict this hazard recognizer to only
9829 // post-RA scheduling; we can tell that we're post-RA because we don't
9830 // track VRegLiveness.
9831 if (!DAG->hasVRegLiveness())
9832 return new GCNHazardRecognizer(DAG->MF);
9834}
9835
9836std::pair<unsigned, unsigned>
9837 SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9838 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9839}
9840
9841 ArrayRef<std::pair<unsigned, const char *>>
9842 SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9843 static const std::pair<unsigned, const char *> TargetFlags[] = {
9844 {MO_GOTPCREL, "amdgpu-gotprel"},
9845 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9846 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9847 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9848 {MO_REL32_LO, "amdgpu-rel32-lo"},
9849 {MO_REL32_HI, "amdgpu-rel32-hi"},
9850 {MO_REL64, "amdgpu-rel64"},
9851 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9852 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9853 {MO_ABS64, "amdgpu-abs64"},
9854 };
9855
9856 return ArrayRef(TargetFlags);
9857}
9858
9859 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9860 SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9861 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9862 {
9863 {MONoClobber, "amdgpu-noclobber"},
9864 {MOLastUse, "amdgpu-last-use"},
9865 {MOCooperative, "amdgpu-cooperative"},
9866 };
9867
9868 return ArrayRef(TargetFlags);
9869}
9870
9871 unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
9872 const MachineFunction &MF) const {
9873 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9874 assert(SrcReg.isVirtual());
9875 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9876 return AMDGPU::WWM_COPY;
9877
9878 return AMDGPU::COPY;
9879}
9880
9881 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
9882 Register Reg) const {
9883 // We need to handle instructions which may be inserted during register
9884 // allocation to handle the prolog. The initial prolog instruction may have
9885 // been separated from the start of the block by spills and copies inserted
9886 // for the prolog. However, the insertions for scalar registers can
9887 // always be placed at the BB top as they are independent of the exec mask
9888 // value.
9889 const MachineFunction *MF = MI.getMF();
9890 bool IsNullOrVectorRegister = true;
9891 if (Reg) {
9892 const MachineRegisterInfo &MRI = MF->getRegInfo();
9893 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9894 }
9895
9896 uint16_t Opcode = MI.getOpcode();
9897 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
9898 return IsNullOrVectorRegister &&
9899 (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
9900 (Opcode == AMDGPU::IMPLICIT_DEF &&
9901 MFI->isWWMReg(MI.getOperand(0).getReg())) ||
9902 (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
9903 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9904}
9905
9906 MachineInstrBuilder
9907 SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9908 MachineBasicBlock::iterator I,
9909 const DebugLoc &DL,
9910 Register DestReg) const {
9911 if (ST.hasAddNoCarry())
9912 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9913
9914 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9915 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9916 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9917
9918 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9919 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9920}
9921
9922 MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9923 MachineBasicBlock::iterator I,
9924 const DebugLoc &DL,
9925 Register DestReg,
9926 RegScavenger &RS) const {
9927 if (ST.hasAddNoCarry())
9928 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9929
9930 // If available, prefer to use vcc.
9931 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9932 ? Register(RI.getVCC())
9933 : RS.scavengeRegisterBackwards(
9934 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9935 0, /* AllowSpill */ false);
9936
9937 // TODO: Users need to deal with this.
9938 if (!UnusedCarry.isValid())
9939 return MachineInstrBuilder();
9940
9941 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9942 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9943}
9944
9945bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9946 switch (Opcode) {
9947 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9948 case AMDGPU::SI_KILL_I1_TERMINATOR:
9949 return true;
9950 default:
9951 return false;
9952 }
9953}
9954
9955 const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
9956 switch (Opcode) {
9957 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9958 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9959 case AMDGPU::SI_KILL_I1_PSEUDO:
9960 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9961 default:
9962 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9963 }
9964}
9965
9966bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9967 return Imm <= getMaxMUBUFImmOffset(ST);
9968}
9969
9970 unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
9971 // The GFX12 field is a 24-bit signed byte offset, of which only the non-negative 23-bit range is usable.
9972 const unsigned OffsetBits =
9973 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9974 return (1 << OffsetBits) - 1;
9975}
9976
9977 void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
9978 if (!ST.isWave32())
9979 return;
9980
9981 if (MI.isInlineAsm())
9982 return;
9983
9984 for (auto &Op : MI.implicit_operands()) {
9985 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9986 Op.setReg(AMDGPU::VCC_LO);
9987 }
9988}
9989
9990 bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
9991 if (!isSMRD(MI))
9992 return false;
9993
9994 // Check that it is using a buffer resource.
9995 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9996 if (Idx == -1) // e.g. s_memtime
9997 return false;
9998
9999 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
10000 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
10001}
10002
10003// Given Imm, split it into the values to put into the SOffset and ImmOffset
10004// fields in an MUBUF instruction. Return false if it is not possible (due to a
10005// hardware bug needing a workaround).
10006//
10007// The required alignment ensures that individual address components remain
10008// aligned if they are aligned to begin with. It also ensures that additional
10009// offsets within the given alignment can be added to the resulting ImmOffset.
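// For example, on a pre-GFX12 target (MaxOffset = 4095) with 4-byte alignment,
// Imm = 5000 splits into ImmOffset = 908 and SOffset = 4092 (908 + 4092 == 5000).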
10010 bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
10011 uint32_t &ImmOffset, Align Alignment) const {
10012 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
10013 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
10014 uint32_t Overflow = 0;
10015
10016 if (Imm > MaxImm) {
10017 if (Imm <= MaxImm + 64) {
10018 // Use an SOffset inline constant for 4..64
10019 Overflow = Imm - MaxImm;
10020 Imm = MaxImm;
10021 } else {
10022 // Try to keep the same value in SOffset for adjacent loads, so that
10023 // the corresponding register contents can be re-used.
10024 //
10025 // Load values with all low-bits (except for alignment bits) set into
10026 // SOffset, so that a larger range of values can be covered using
10027 // s_movk_i32.
10028 //
10029 // Atomic operations fail to work correctly when individual address
10030 // components are unaligned, even if their sum is aligned.
10031 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
10032 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
10033 Imm = Low;
10034 Overflow = High - Alignment.value();
10035 }
10036 }
10037
10038 if (Overflow > 0) {
10039 // There is a hardware bug in SI and CI which prevents address clamping in
10040 // MUBUF instructions from working correctly with SOffsets. The immediate
10041 // offset is unaffected.
10042 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
10043 return false;
10044
10045 // It is not possible to set immediate in SOffset field on some targets.
10046 if (ST.hasRestrictedSOffset())
10047 return false;
10048 }
10049
10050 ImmOffset = Imm;
10051 SOffset = Overflow;
10052 return true;
10053}
10054
10055// Depending on the used address space and instructions, some immediate offsets
10056// are allowed and some are not.
10057// Pre-GFX12, flat instruction offsets can only be non-negative, global and
10058// scratch instruction offsets can also be negative. On GFX12, offsets can be
10059// negative for all variants.
10060//
10061// There are several bugs related to these offsets:
10062// On gfx10.1, flat instructions that go into the global address space cannot
10063// use an offset.
10064//
10065// For scratch instructions, the address can be either an SGPR or a VGPR.
10066// The following offsets can be used, depending on the architecture (x means
10067// cannot be used):
10068// +----------------------------+------+------+
10069// | Address-Mode | SGPR | VGPR |
10070// +----------------------------+------+------+
10071// | gfx9 | | |
10072// | negative, 4-aligned offset | x | ok |
10073// | negative, unaligned offset | x | ok |
10074// +----------------------------+------+------+
10075// | gfx10 | | |
10076// | negative, 4-aligned offset | ok | ok |
10077// | negative, unaligned offset | ok | x |
10078// +----------------------------+------+------+
10079// | gfx10.3 | | |
10080// | negative, 4-aligned offset | ok | ok |
10081// | negative, unaligned offset | ok | ok |
10082// +----------------------------+------+------+
10083//
10084// This function ignores the addressing mode, so if an offset cannot be used in
10085// one addressing mode, it is considered illegal.
10086bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
10087 uint64_t FlatVariant) const {
10088 // TODO: Should 0 be special cased?
10089 if (!ST.hasFlatInstOffsets())
10090 return false;
10091
10092 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
10093 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
10094 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
10095 return false;
10096
10097 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10098 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
10099 (Offset % 4) != 0) {
10100 return false;
10101 }
10102
10103 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10104 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
10105 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
10106}
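// [Editorial sketch - not part of SIInstrInfo.cpp] The final range check above,
// restated for a hypothetical subtarget with NumBits flat-offset bits: when
// negative offsets are allowed the legal range is [-2^(NumBits-1), 2^(NumBits-1)-1],
// otherwise it is [0, 2^(NumBits-1)-1].
#include <cstdint>
static bool flatOffsetInRangeExample(int64_t Offset, unsigned NumBits,
                                     bool AllowNegative) {
  const int64_t Hi = (int64_t(1) << (NumBits - 1)) - 1;
  const int64_t Lo = AllowNegative ? -(int64_t(1) << (NumBits - 1)) : 0;
  // Mirrors isIntN(N, Offset) && (AllowNegative || Offset >= 0).
  return Offset >= Lo && Offset <= Hi;
}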
10107
10108// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
10109std::pair<int64_t, int64_t>
10110SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
10111 uint64_t FlatVariant) const {
10112 int64_t RemainderOffset = COffsetVal;
10113 int64_t ImmField = 0;
10114
10115 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10116 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
10117
10118 if (AllowNegative) {
10119 // Use signed division by a power of two to truncate towards 0.
10120 int64_t D = 1LL << NumBits;
10121 RemainderOffset = (COffsetVal / D) * D;
10122 ImmField = COffsetVal - RemainderOffset;
10123
10124 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10125 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
10126 (ImmField % 4) != 0) {
10127 // Make ImmField a multiple of 4
10128 RemainderOffset += ImmField % 4;
10129 ImmField -= ImmField % 4;
10130 }
10131 } else if (COffsetVal >= 0) {
10132 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
10133 RemainderOffset = COffsetVal - ImmField;
10134 }
10135
10136 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
10137 assert(RemainderOffset + ImmField == COffsetVal);
10138 return {ImmField, RemainderOffset};
10139}
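// [Editorial sketch - not part of SIInstrInfo.cpp] Worked example of the split
// above, assuming 13 signed offset bits so that NumBits == 12 and D == 4096.
#include <cassert>
#include <cstdint>
static void splitFlatOffsetExample() {
  const unsigned NumBits = 12;
  const int64_t D = int64_t(1) << NumBits;  // 4096
  int64_t Remainder = (10000 / D) * D;      // 8192
  int64_t ImmField = 10000 - Remainder;     // 1808
  assert(ImmField + Remainder == 10000);
  Remainder = (-10000 / D) * D;             // -8192 (division truncates toward zero)
  ImmField = -10000 - Remainder;            // -1808
  assert(ImmField + Remainder == -10000);
}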
10140
10141 bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
10142 if (ST.hasNegativeScratchOffsetBug() &&
10143 FlatVariant == SIInstrFlags::FlatScratch)
10144 return false;
10145
10146 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
10147}
10148
10149static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
10150 switch (ST.getGeneration()) {
10151 default:
10152 break;
10153 case AMDGPUSubtarget::SOUTHERN_ISLANDS:
10154 case AMDGPUSubtarget::SEA_ISLANDS:
10155 return SIEncodingFamily::SI;
10156 case AMDGPUSubtarget::VOLCANIC_ISLANDS:
10157 case AMDGPUSubtarget::GFX9:
10158 return SIEncodingFamily::VI;
10159 case AMDGPUSubtarget::GFX10:
10160 return SIEncodingFamily::GFX10;
10161 case AMDGPUSubtarget::GFX11:
10162 return SIEncodingFamily::GFX11;
10163 case AMDGPUSubtarget::GFX12:
10164 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
10165 : SIEncodingFamily::GFX12;
10166 }
10167 llvm_unreachable("Unknown subtarget generation!");
10168}
10169
10170bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10171 switch(MCOp) {
10172 // These opcodes use indirect register addressing so
10173 // they need special handling by codegen (currently missing).
10174 // Therefore it is too risky to allow these opcodes
10175 // to be selected by dpp combiner or sdwa peepholer.
10176 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10177 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10178 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10179 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10180 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10181 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10182 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10183 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10184 return true;
10185 default:
10186 return false;
10187 }
10188}
10189
10190#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
10191 case OPCODE##_dpp: \
10192 case OPCODE##_e32: \
10193 case OPCODE##_e64: \
10194 case OPCODE##_e64_dpp: \
10195 case OPCODE##_sdwa:
10196
10197static bool isRenamedInGFX9(int Opcode) {
10198 switch (Opcode) {
10199 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10200 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10201 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10202 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10203 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10204 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10205 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10206 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10207 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10208 //
10209 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10210 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10211 case AMDGPU::V_FMA_F16_gfx9_e64:
10212 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10213 case AMDGPU::V_INTERP_P2_F16:
10214 case AMDGPU::V_MAD_F16_e64:
10215 case AMDGPU::V_MAD_U16_e64:
10216 case AMDGPU::V_MAD_I16_e64:
10217 return true;
10218 default:
10219 return false;
10220 }
10221}
10222
10223int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10224 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
10225
10226 unsigned Gen = subtargetEncodingFamily(ST);
10227
10228 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10229 Gen = SIEncodingFamily::GFX9;
10230
10231 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10232 // subtarget has UnpackedD16VMem feature.
10233 // TODO: remove this when we discard GFX80 encoding.
10234 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10235 Gen = SIEncodingFamily::GFX80;
10236
10237 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10238 switch (ST.getGeneration()) {
10239 default:
10240 Gen = SIEncodingFamily::SDWA;
10241 break;
10242 case AMDGPUSubtarget::GFX9:
10243 Gen = SIEncodingFamily::SDWA9;
10244 break;
10245 case AMDGPUSubtarget::GFX10:
10246 Gen = SIEncodingFamily::SDWA10;
10247 break;
10248 }
10249 }
10250
10251 if (isMAI(Opcode)) {
10252 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10253 if (MFMAOp != -1)
10254 Opcode = MFMAOp;
10255 }
10256
10257 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10258
10259 if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
10260 MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX12);
10261
10262 // -1 means that Opcode is already a native instruction.
10263 if (MCOp == -1)
10264 return Opcode;
10265
10266 if (ST.hasGFX90AInsts()) {
10267 uint16_t NMCOp = (uint16_t)-1;
10268 if (ST.hasGFX940Insts())
10269 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
10270 if (NMCOp == (uint16_t)-1)
10271 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
10272 if (NMCOp == (uint16_t)-1)
10273 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
10274 if (NMCOp != (uint16_t)-1)
10275 MCOp = NMCOp;
10276 }
10277
10278 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
10279 // no encoding in the given subtarget generation.
10280 if (MCOp == (uint16_t)-1)
10281 return -1;
10282
10283 if (isAsmOnlyOpcode(MCOp))
10284 return -1;
10285
10286 return MCOp;
10287}
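// [Editorial sketch - not part of SIInstrInfo.cpp] The GFX90A handling above is
// a "first valid encoding wins" chain (GFX940, then GFX90A, then GFX9), with
// (uint16_t)-1 marking a missing encoding. A generic restatement:
#include <cstdint>
#include <initializer_list>
static uint16_t firstValidEncodingExample(std::initializer_list<uint16_t> Candidates) {
  for (uint16_t Op : Candidates)
    if (Op != uint16_t(-1))
      return Op;       // first family that actually encodes the opcode
  return uint16_t(-1); // no native encoding in any candidate family
}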
10288
10289static
10290 TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
10291 assert(RegOpnd.isReg());
10292 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10293 getRegSubRegPair(RegOpnd);
10294}
10295
10296 TargetInstrInfo::RegSubRegPair
10297 llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
10298 assert(MI.isRegSequence());
10299 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10300 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10301 auto &RegOp = MI.getOperand(1 + 2 * I);
10302 return getRegOrUndef(RegOp);
10303 }
10304 return TargetInstrInfo::RegSubRegPair();
10305}
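// [Editorial note - not part of SIInstrInfo.cpp] The loop above relies on the
// REG_SEQUENCE operand layout: operand 0 is the def, followed by (source
// register, subregister index) pairs, i.e. %dst = REG_SEQUENCE %r0, subidx0,
// %r1, subidx1, ... These helpers restate the indexing the loop uses:
static unsigned regSequenceSrcOperandIdx(unsigned I) { return 1 + 2 * I; }
static unsigned regSequenceSubIdxOperandIdx(unsigned I) { return 2 + 2 * I; }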
10306
10307// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10308// Following a subreg of reg:subreg isn't supported
10309 static bool followSubRegDef(MachineInstr &MI,
10310 TargetInstrInfo::RegSubRegPair &RSR) {
10311 if (!RSR.SubReg)
10312 return false;
10313 switch (MI.getOpcode()) {
10314 default: break;
10315 case AMDGPU::REG_SEQUENCE:
10316 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10317 return true;
10318 // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
10319 case AMDGPU::INSERT_SUBREG:
10320 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10321 // inserted the subreg we're looking for
10322 RSR = getRegOrUndef(MI.getOperand(2));
10323 else { // the subreg in the rest of the reg
10324 auto R1 = getRegOrUndef(MI.getOperand(1));
10325 if (R1.SubReg) // subreg of subreg isn't supported
10326 return false;
10327 RSR.Reg = R1.Reg;
10328 }
10329 return true;
10330 }
10331 return false;
10332}
10333
10334 MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
10335 const MachineRegisterInfo &MRI) {
10336 assert(MRI.isSSA());
10337 if (!P.Reg.isVirtual())
10338 return nullptr;
10339
10340 auto RSR = P;
10341 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10342 while (auto *MI = DefInst) {
10343 DefInst = nullptr;
10344 switch (MI->getOpcode()) {
10345 case AMDGPU::COPY:
10346 case AMDGPU::V_MOV_B32_e32: {
10347 auto &Op1 = MI->getOperand(1);
10348 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10349 if (Op1.isUndef())
10350 return nullptr;
10351 RSR = getRegSubRegPair(Op1);
10352 DefInst = MRI.getVRegDef(RSR.Reg);
10353 }
10354 break;
10355 }
10356 default:
10357 if (followSubRegDef(*MI, RSR)) {
10358 if (!RSR.Reg)
10359 return nullptr;
10360 DefInst = MRI.getVRegDef(RSR.Reg);
10361 }
10362 }
10363 if (!DefInst)
10364 return MI;
10365 }
10366 return nullptr;
10367}
10368
10369 bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
10370 Register VReg,
10371 const MachineInstr &DefMI,
10372 const MachineInstr &UseMI) {
10373 assert(MRI.isSSA() && "Must be run on SSA");
10374
10375 auto *TRI = MRI.getTargetRegisterInfo();
10376 auto *DefBB = DefMI.getParent();
10377
10378 // Don't bother searching between blocks, although it is possible this block
10379 // doesn't modify exec.
10380 if (UseMI.getParent() != DefBB)
10381 return true;
10382
10383 const int MaxInstScan = 20;
10384 int NumInst = 0;
10385
10386 // Stop scan at the use.
10387 auto E = UseMI.getIterator();
10388 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10389 if (I->isDebugInstr())
10390 continue;
10391
10392 if (++NumInst > MaxInstScan)
10393 return true;
10394
10395 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10396 return true;
10397 }
10398
10399 return false;
10400}
10401
10402 bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
10403 Register VReg,
10404 const MachineInstr &DefMI) {
10405 assert(MRI.isSSA() && "Must be run on SSA");
10406
10407 auto *TRI = MRI.getTargetRegisterInfo();
10408 auto *DefBB = DefMI.getParent();
10409
10410 const int MaxUseScan = 10;
10411 int NumUse = 0;
10412
10413 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10414 auto &UseInst = *Use.getParent();
10415 // Don't bother searching between blocks, although it is possible this block
10416 // doesn't modify exec.
10417 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10418 return true;
10419
10420 if (++NumUse > MaxUseScan)
10421 return true;
10422 }
10423
10424 if (NumUse == 0)
10425 return false;
10426
10427 const int MaxInstScan = 20;
10428 int NumInst = 0;
10429
10430 // Stop scan when we have seen all the uses.
10431 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10432 assert(I != DefBB->end());
10433
10434 if (I->isDebugInstr())
10435 continue;
10436
10437 if (++NumInst > MaxInstScan)
10438 return true;
10439
10440 for (const MachineOperand &Op : I->operands()) {
10441 // We don't check reg masks here as they're used only on calls:
10442 // 1. EXEC is only considered const within one BB
10443 // 2. Call should be a terminator instruction if present in a BB
10444
10445 if (!Op.isReg())
10446 continue;
10447
10448 Register Reg = Op.getReg();
10449 if (Op.isUse()) {
10450 if (Reg == VReg && --NumUse == 0)
10451 return false;
10452 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10453 return true;
10454 }
10455 }
10456}
10457
10458 MachineInstr *SIInstrInfo::createPHIDestinationCopy(
10459 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
10460 const DebugLoc &DL, Register Src, Register Dst) const {
10461 auto Cur = MBB.begin();
10462 if (Cur != MBB.end())
10463 do {
10464 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10465 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10466 ++Cur;
10467 } while (Cur != MBB.end() && Cur != LastPHIIt);
10468
10469 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10470 Dst);
10471}
10472
10473 MachineInstr *SIInstrInfo::createPHISourceCopy(
10474 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
10475 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10476 if (InsPt != MBB.end() &&
10477 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10478 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10479 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10480 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10481 InsPt++;
10482 return BuildMI(MBB, InsPt, DL,
10483 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10484 .addReg(Src, 0, SrcSubReg)
10485 .addReg(AMDGPU::EXEC, RegState::Implicit);
10486 }
10487 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10488 Dst);
10489}
10490
10491bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10492
10493 MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
10494 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
10495 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10496 VirtRegMap *VRM) const {
10497 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10498 //
10499 // %0:sreg_32 = COPY $m0
10500 //
10501 // We explicitly chose SReg_32 for the virtual register so such a copy might
10502 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10503 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10504 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10505 // TargetInstrInfo::foldMemoryOperand() is going to try.
10506 // A similar issue also exists with spilling and reloading $exec registers.
10507 //
10508 // To prevent that, constrain the %0 register class here.
10509 if (isFullCopyInstr(MI)) {
10510 Register DstReg = MI.getOperand(0).getReg();
10511 Register SrcReg = MI.getOperand(1).getReg();
10512 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10513 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10514 MachineRegisterInfo &MRI = MF.getRegInfo();
10515 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10516 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10517 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10518 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10519 return nullptr;
10520 }
10521 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10522 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10523 return nullptr;
10524 }
10525 }
10526 }
10527
10528 return nullptr;
10529}
10530
10531 unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
10532 const MachineInstr &MI,
10533 unsigned *PredCost) const {
10534 if (MI.isBundle()) {
10535 MachineBasicBlock::const_instr_iterator I(MI.getIterator());
10536 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10537 unsigned Lat = 0, Count = 0;
10538 for (++I; I != E && I->isBundledWithPred(); ++I) {
10539 ++Count;
10540 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10541 }
10542 return Lat + Count - 1;
10543 }
10544
10545 return SchedModel.computeInstrLatency(&MI);
10546}
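// [Editorial sketch - not part of SIInstrInfo.cpp] The bundle case above reports
// max-latency + (number of bundled instructions) - 1. For an assumed bundle of
// three instructions with latencies {4, 2, 6} that is 6 + 3 - 1 = 8:
#include <algorithm>
static unsigned bundleLatencyExample() {
  const unsigned Lats[] = {4, 2, 6}; // assumed per-instruction latencies
  unsigned Lat = 0, Count = 0;
  for (unsigned L : Lats) {
    ++Count;
    Lat = std::max(Lat, L);
  }
  return Lat + Count - 1; // 8
}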
10547
10548 InstructionUniformity
10549 SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
10550 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10551 unsigned Opcode = MI.getOpcode();
10552
10553 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10554 Register Dst = MI.getOperand(0).getReg();
10555 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10556 : MI.getOperand(1).getReg();
10557 LLT DstTy = MRI.getType(Dst);
10558 LLT SrcTy = MRI.getType(Src);
10559 unsigned DstAS = DstTy.getAddressSpace();
10560 unsigned SrcAS = SrcTy.getAddressSpace();
10561 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10562 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10563 ST.hasGloballyAddressableScratch()
10564 ? InstructionUniformity::NeverUniform
10565 : InstructionUniformity::Default;
10566 };
10567
10568 // If the target supports globally addressable scratch, the mapping from
10569 // scratch memory to the flat aperture changes, and therefore an address space cast
10570 // is no longer uniform.
10571 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10572 return HandleAddrSpaceCast(MI);
10573
10574 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10575 auto IID = GI->getIntrinsicID();
10576 if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
10577 return InstructionUniformity::NeverUniform;
10578 if (AMDGPU::isIntrinsicAlwaysUniform(IID))
10579 return InstructionUniformity::AlwaysUniform;
10580
10581 switch (IID) {
10582 case Intrinsic::amdgcn_addrspacecast_nonnull:
10583 return HandleAddrSpaceCast(MI);
10584 case Intrinsic::amdgcn_if:
10585 case Intrinsic::amdgcn_else:
10586 // FIXME: Uniform if second result
10587 break;
10588 }
10589
10590 return InstructionUniformity::Default;
10591 }
10592
10593 // Loads from the private and flat address spaces are divergent, because
10594 // threads can execute the load instruction with the same inputs and get
10595 // different results.
10596 //
10597 // All other loads are not divergent, because if threads issue loads with the
10598 // same arguments, they will always get the same result.
10599 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10600 Opcode == AMDGPU::G_SEXTLOAD) {
10601 if (MI.memoperands_empty())
10602 return InstructionUniformity::NeverUniform; // conservative assumption
10603
10604 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10605 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10606 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10607 })) {
10608 // At least one MMO in a non-global address space.
10609 return InstructionUniformity::NeverUniform;
10610 }
10611 return InstructionUniformity::Default;
10612 }
10613
10614 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10615 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10616 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10617 AMDGPU::isGenericAtomic(Opcode)) {
10618 return InstructionUniformity::NeverUniform;
10619 }
10620 return InstructionUniformity::Default;
10621}
10622
10623 InstructionUniformity
10624 SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
10625
10626 if (isNeverUniform(MI))
10627 return InstructionUniformity::NeverUniform;
10628
10629 unsigned opcode = MI.getOpcode();
10630 if (opcode == AMDGPU::V_READLANE_B32 ||
10631 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10632 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10633 return InstructionUniformity::AlwaysUniform;
10634
10635 if (isCopyInstr(MI)) {
10636 const MachineOperand &srcOp = MI.getOperand(1);
10637 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10638 const TargetRegisterClass *regClass =
10639 RI.getPhysRegBaseClass(srcOp.getReg());
10640 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10641 : InstructionUniformity::NeverUniform;
10642 }
10643 return InstructionUniformity::Default;
10644 }
10645
10646 // GMIR handling
10647 if (MI.isPreISelOpcode())
10648 return getGenericInstructionUniformity(MI);
10649
10650 // Atomics are divergent because they are executed sequentially: when an
10651 // atomic operation refers to the same address in each thread, then each
10652 // thread after the first sees the value written by the previous thread as
10653 // original value.
10654
10655 if (isAtomic(MI))
10656 return InstructionUniformity::NeverUniform;
10657
10658 // Loads from the private and flat address spaces are divergent, because
10659 // threads can execute the load instruction with the same inputs and get
10660 // different results.
10661 if (isFLAT(MI) && MI.mayLoad()) {
10662 if (MI.memoperands_empty())
10663 return InstructionUniformity::NeverUniform; // conservative assumption
10664
10665 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10666 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10667 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10668 })) {
10669 // At least one MMO in a non-global address space.
10670 return InstructionUniformity::NeverUniform;
10671 }
10672
10673 return InstructionUniformity::Default;
10674 }
10675
10676 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10677 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10678
10679 // FIXME: It's conceptually broken to report this for an instruction, and not
10680 // a specific def operand. For inline asm in particular, there could be mixed
10681 // uniform and divergent results.
10682 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10683 const MachineOperand &SrcOp = MI.getOperand(I);
10684 if (!SrcOp.isReg())
10685 continue;
10686
10687 Register Reg = SrcOp.getReg();
10688 if (!Reg || !SrcOp.readsReg())
10689 continue;
10690
10691 // If RegBank is null, this is unassigned or an unallocatable special
10692 // register, which are all scalars.
10693 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10694 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10695 return InstructionUniformity::NeverUniform;
10696 }
10697
10698 // TODO: Uniformity check conditions above can be rearranged for more
10699 // readability
10700
10701 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10702 // currently turned into no-op COPYs by SelectionDAG ISel and are
10703 // therefore no longer recognizable.
10704
10705 return InstructionUniformity::Default;
10706}
10707
10708 unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
10709 switch (MF.getFunction().getCallingConv()) {
10710 case CallingConv::AMDGPU_PS:
10711 return 1;
10712 case CallingConv::AMDGPU_VS:
10713 return 2;
10714 case CallingConv::AMDGPU_GS:
10715 return 3;
10716 case CallingConv::AMDGPU_HS:
10717 case CallingConv::AMDGPU_LS:
10718 case CallingConv::AMDGPU_ES: {
10719 const Function &F = MF.getFunction();
10720 F.getContext().diagnose(DiagnosticInfoUnsupported(
10721 F, "ds_ordered_count unsupported for this calling conv"));
10722 [[fallthrough]];
10723 }
10724 case CallingConv::AMDGPU_CS:
10725 case CallingConv::AMDGPU_KERNEL:
10726 case CallingConv::C:
10727 case CallingConv::Fast:
10728 default:
10729 // Assume other calling conventions are various compute callable functions
10730 return 0;
10731 }
10732}
10733
10734 bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
10735 Register &SrcReg2, int64_t &CmpMask,
10736 int64_t &CmpValue) const {
10737 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10738 return false;
10739
10740 switch (MI.getOpcode()) {
10741 default:
10742 break;
10743 case AMDGPU::S_CMP_EQ_U32:
10744 case AMDGPU::S_CMP_EQ_I32:
10745 case AMDGPU::S_CMP_LG_U32:
10746 case AMDGPU::S_CMP_LG_I32:
10747 case AMDGPU::S_CMP_LT_U32:
10748 case AMDGPU::S_CMP_LT_I32:
10749 case AMDGPU::S_CMP_GT_U32:
10750 case AMDGPU::S_CMP_GT_I32:
10751 case AMDGPU::S_CMP_LE_U32:
10752 case AMDGPU::S_CMP_LE_I32:
10753 case AMDGPU::S_CMP_GE_U32:
10754 case AMDGPU::S_CMP_GE_I32:
10755 case AMDGPU::S_CMP_EQ_U64:
10756 case AMDGPU::S_CMP_LG_U64:
10757 SrcReg = MI.getOperand(0).getReg();
10758 if (MI.getOperand(1).isReg()) {
10759 if (MI.getOperand(1).getSubReg())
10760 return false;
10761 SrcReg2 = MI.getOperand(1).getReg();
10762 CmpValue = 0;
10763 } else if (MI.getOperand(1).isImm()) {
10764 SrcReg2 = Register();
10765 CmpValue = MI.getOperand(1).getImm();
10766 } else {
10767 return false;
10768 }
10769 CmpMask = ~0;
10770 return true;
10771 case AMDGPU::S_CMPK_EQ_U32:
10772 case AMDGPU::S_CMPK_EQ_I32:
10773 case AMDGPU::S_CMPK_LG_U32:
10774 case AMDGPU::S_CMPK_LG_I32:
10775 case AMDGPU::S_CMPK_LT_U32:
10776 case AMDGPU::S_CMPK_LT_I32:
10777 case AMDGPU::S_CMPK_GT_U32:
10778 case AMDGPU::S_CMPK_GT_I32:
10779 case AMDGPU::S_CMPK_LE_U32:
10780 case AMDGPU::S_CMPK_LE_I32:
10781 case AMDGPU::S_CMPK_GE_U32:
10782 case AMDGPU::S_CMPK_GE_I32:
10783 SrcReg = MI.getOperand(0).getReg();
10784 SrcReg2 = Register();
10785 CmpValue = MI.getOperand(1).getImm();
10786 CmpMask = ~0;
10787 return true;
10788 }
10789
10790 return false;
10791}
10792
10793// SCC is already valid after SCCValid.
10794// SCCRedefine will redefine SCC to the same value already available after
10795// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and
10796// update kill/dead flags if necessary.
10797static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
10798 const SIRegisterInfo &RI) {
10799 MachineInstr *KillsSCC = nullptr;
10800 if (SCCValid->getParent() != SCCRedefine->getParent())
10801 return false;
10802 for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
10803 SCCRedefine->getIterator())) {
10804 if (MI.modifiesRegister(AMDGPU::SCC, &RI))
10805 return false;
10806 if (MI.killsRegister(AMDGPU::SCC, &RI))
10807 KillsSCC = &MI;
10808 }
10809 if (MachineOperand *SccDef =
10810 SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
10811 SccDef->setIsDead(false);
10812 if (KillsSCC)
10813 KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
10814 SCCRedefine->eraseFromParent();
10815 return true;
10816}
10817
10818static bool foldableSelect(const MachineInstr &Def) {
10819 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
10820 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
10821 return false;
10822 bool Op1IsNonZeroImm =
10823 Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
10824 bool Op2IsZeroImm =
10825 Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
10826 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
10827 return false;
10828 return true;
10829}
10830
10831 bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
10832 Register SrcReg2, int64_t CmpMask,
10833 int64_t CmpValue,
10834 const MachineRegisterInfo *MRI) const {
10835 if (!SrcReg || SrcReg.isPhysical())
10836 return false;
10837
10838 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
10839 return false;
10840
10841 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
10842 this]() -> bool {
10843 if (CmpValue != 0)
10844 return false;
10845
10846 MachineInstr *Def = MRI->getVRegDef(SrcReg);
10847 if (!Def)
10848 return false;
10849
10850 // For S_OP that set SCC = DST!=0, do the transformation
10851 //
10852 // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
10853
10854 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
10855 // for S_CSELECT* already has the same value that will be calculated by
10856 // s_cmp_lg_*
10857 //
10858 // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
10859 // imm), 0)
10860 if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(*Def))
10861 return false;
10862
10863 if (!optimizeSCC(Def, &CmpInstr, RI))
10864 return false;
10865
10866 // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
10867 // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
10868 // 64-bit foldableSelect then delete s_or_b32 in the sequence:
10869 // sX = s_cselect_b64 (non-zero imm), 0
10870 // sLo = copy sX.sub0
10871 // sHi = copy sX.sub1
10872 // sY = s_or_b32 sLo, sHi
10873 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
10874 MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
10875 const MachineOperand &OrOpnd1 = Def->getOperand(1);
10876 const MachineOperand &OrOpnd2 = Def->getOperand(2);
10877 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
10878 MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
10879 MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
10880 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
10881 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
10882 Def2->getOperand(1).isReg() &&
10883 Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
10884 Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
10885 Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
10886 MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
10887 if (Select && foldableSelect(*Select))
10888 optimizeSCC(Select, Def, RI);
10889 }
10890 }
10891 }
10892 return true;
10893 };
10894
10895 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
10896 this](int64_t ExpectedValue, unsigned SrcSize,
10897 bool IsReversible, bool IsSigned) -> bool {
10898 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10899 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10900 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10901 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10902 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
10903 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10904 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10905 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10906 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10907 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
10908 //
10909 // Signed ge/gt are not used for the sign bit.
10910 //
10911 // If result of the AND is unused except in the compare:
10912 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
10913 //
10914 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10915 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10916 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
10917 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10918 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10919 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
10920
10921 MachineInstr *Def = MRI->getVRegDef(SrcReg);
10922 if (!Def)
10923 return false;
10924
10925 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
10926 Def->getOpcode() != AMDGPU::S_AND_B64)
10927 return false;
10928
10929 int64_t Mask;
10930 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
10931 if (MO->isImm())
10932 Mask = MO->getImm();
10933 else if (!getFoldableImm(MO, Mask))
10934 return false;
10935 Mask &= maxUIntN(SrcSize);
10936 return isPowerOf2_64(Mask);
10937 };
10938
10939 MachineOperand *SrcOp = &Def->getOperand(1);
10940 if (isMask(SrcOp))
10941 SrcOp = &Def->getOperand(2);
10942 else if (isMask(&Def->getOperand(2)))
10943 SrcOp = &Def->getOperand(1);
10944 else
10945 return false;
10946
10947 // A valid Mask is required to have a single bit set, hence a non-zero and
10948 // power-of-two value. This verifies that we will not do 64-bit shift below.
10949 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
10950 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
10951 if (IsSigned && BitNo == SrcSize - 1)
10952 return false;
10953
10954 ExpectedValue <<= BitNo;
10955
10956 bool IsReversedCC = false;
10957 if (CmpValue != ExpectedValue) {
10958 if (!IsReversible)
10959 return false;
10960 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
10961 if (!IsReversedCC)
10962 return false;
10963 }
10964
10965 Register DefReg = Def->getOperand(0).getReg();
10966 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
10967 return false;
10968
10969 if (!optimizeSCC(Def, &CmpInstr, RI))
10970 return false;
10971
10972 if (!MRI->use_nodbg_empty(DefReg)) {
10973 assert(!IsReversedCC);
10974 return true;
10975 }
10976
10977 // Replace AND with unused result with a S_BITCMP.
10978 MachineBasicBlock *MBB = Def->getParent();
10979
10980 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
10981 : AMDGPU::S_BITCMP1_B32
10982 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
10983 : AMDGPU::S_BITCMP1_B64;
10984
10985 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
10986 .add(*SrcOp)
10987 .addImm(BitNo);
10988 Def->eraseFromParent();
10989
10990 return true;
10991 };
10992
10993 switch (CmpInstr.getOpcode()) {
10994 default:
10995 break;
10996 case AMDGPU::S_CMP_EQ_U32:
10997 case AMDGPU::S_CMP_EQ_I32:
10998 case AMDGPU::S_CMPK_EQ_U32:
10999 case AMDGPU::S_CMPK_EQ_I32:
11000 return optimizeCmpAnd(1, 32, true, false);
11001 case AMDGPU::S_CMP_GE_U32:
11002 case AMDGPU::S_CMPK_GE_U32:
11003 return optimizeCmpAnd(1, 32, false, false);
11004 case AMDGPU::S_CMP_GE_I32:
11005 case AMDGPU::S_CMPK_GE_I32:
11006 return optimizeCmpAnd(1, 32, false, true);
11007 case AMDGPU::S_CMP_EQ_U64:
11008 return optimizeCmpAnd(1, 64, true, false);
11009 case AMDGPU::S_CMP_LG_U32:
11010 case AMDGPU::S_CMP_LG_I32:
11011 case AMDGPU::S_CMPK_LG_U32:
11012 case AMDGPU::S_CMPK_LG_I32:
11013 return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect();
11014 case AMDGPU::S_CMP_GT_U32:
11015 case AMDGPU::S_CMPK_GT_U32:
11016 return optimizeCmpAnd(0, 32, false, false);
11017 case AMDGPU::S_CMP_GT_I32:
11018 case AMDGPU::S_CMPK_GT_I32:
11019 return optimizeCmpAnd(0, 32, false, true);
11020 case AMDGPU::S_CMP_LG_U64:
11021 return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect();
11022 }
11023
11024 return false;
11025}
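// [Editorial sketch - not part of SIInstrInfo.cpp] The mask bookkeeping used by
// optimizeCmpAnd above, with assumed values: an AND with 0x20 compared against
// 0 is the reversed form of "== 0x20", and bit 5 is what S_BITCMP would test.
#include <cassert>
#include <cstdint>
static void cmpAndFoldExample() {
  const int64_t Mask = 0x20;   // s_and_b32 $src, 1 << 5 (a power of two)
  const unsigned BitNo = 5;    // = llvm::countr_zero(Mask)
  int64_t ExpectedValue = 1;   // pattern "s_cmp_eq (s_and $src, 1 << n), 1 << n"
  ExpectedValue <<= BitNo;     // 0x20
  const int64_t CmpValue = 0;  // but the compare seen here is against 0
  const bool IsReversedCC = (CmpValue == (ExpectedValue ^ Mask));
  assert(IsReversedCC);        // so the fold would emit S_BITCMP0 on bit 5
}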
11026
11027 void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
11028 AMDGPU::OpName OpName) const {
11029 if (!ST.needsAlignedVGPRs())
11030 return;
11031
11032 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
11033 if (OpNo < 0)
11034 return;
11035 MachineOperand &Op = MI.getOperand(OpNo);
11036 if (getOpSize(MI, OpNo) > 4)
11037 return;
11038
11039 // Add implicit aligned super-reg to force alignment on the data operand.
11040 const DebugLoc &DL = MI.getDebugLoc();
11041 MachineBasicBlock *BB = MI.getParent();
11042 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11043 Register DataReg = Op.getReg();
11044 bool IsAGPR = RI.isAGPR(MRI, DataReg);
11045 Register Undef = MRI.createVirtualRegister(
11046 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
11047 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
11048 Register NewVR =
11049 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
11050 : &AMDGPU::VReg_64_Align2RegClass);
11051 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
11052 .addReg(DataReg, 0, Op.getSubReg())
11053 .addImm(AMDGPU::sub0)
11054 .addReg(Undef)
11055 .addImm(AMDGPU::sub1);
11056 Op.setReg(NewVR);
11057 Op.setSubReg(AMDGPU::sub0);
11058 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
11059}
11060
11061 bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
11062 if (isIGLP(*MI))
11063 return false;
11064
11065 return TargetInstrInfo::isGlobalMemoryObject(MI);
11066}
11067
11068 bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
11069 if (!isWMMA(MI) && !isSWMMAC(MI))
11070 return false;
11071
11072 if (AMDGPU::isGFX1250(ST))
11073 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
11074
11075 return true;
11076}
11077
11078 bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
11079 unsigned Opcode = MI.getOpcode();
11080
11081 if (AMDGPU::isGFX12Plus(ST))
11082 return isDOT(MI) || isXDLWMMA(MI);
11083
11084 if (!isMAI(MI) || isDGEMM(Opcode) ||
11085 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
11086 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
11087 return false;
11088
11089 if (!ST.hasGFX940Insts())
11090 return true;
11091
11092 return AMDGPU::getMAIIsGFX940XDL(Opcode);
11093}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine, const SIRegisterInfo &RI)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool foldableSelect(const MachineInstr &Def)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
#define LLVM_DEBUG(...)
Definition Debug.h:114
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:221
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1563
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasAddNoCarry() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:92
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool isBundle() const
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
LLVM_ABI void clearRegisterKills(Register Reg, const TargetRegisterInfo *RegInfo)
Clear all kill flags affecting Reg.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given instruction opcode.
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool mayAccessScratch(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
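A rough usage sketch (not taken from this file; TII, MBB, I, DL and the registers are illustrative assumptions): the builder returned by getAddNoCarry still expects its two source operands to be appended.

  #include "SIInstrInfo.h"
  using namespace llvm;
  // Hypothetical helper: append the two sources to the partially built add.
  void emitAddNoCarry(const SIInstrInfo &TII, MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator I, const DebugLoc &DL,
                      Register DestReg, Register Src0, Register Src1) {
    TII.getAddNoCarry(MBB, I, DL, DestReg).addReg(Src0).addReg(Src1);
  }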
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instructions opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const final
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIdx operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
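A minimal consumption sketch, under the assumption that a global FLAT access is being legalized and that the names below are in scope: the first element of the returned pair fits the instruction's immediate offset field, the remainder has to be folded into the base address.

  #include "AMDGPU.h"
  #include "SIInstrInfo.h"
  #include <utility>
  using namespace llvm;
  // Hypothetical: split a global FLAT offset into {encodable imm, leftover}.
  std::pair<int64_t, int64_t> splitExample(const SIInstrInfo &TII,
                                           int64_t COffsetVal) {
    return TII.splitFlatOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
                               SIInstrFlags::FlatGlobal);
  }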
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
static bool isBUF(const MachineInstr &MI)
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig) const override
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
void mutateAndCleanupImplicit(MachineInstr &MI, const MCInstrDesc &NewDesc) const
bool isAlwaysGDS(uint16_t Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
static bool usesLGKM_CNT(const MachineInstr &MI)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst to handle 16-bit SALU to VALU lowering.
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
static bool setsSCCifResultIsNonZero(const MachineInstr &MI)
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (PostRASchedulerList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
static bool isWWMRegSpillOpcode(uint16_t Opcode)
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
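For illustration only (the helper name and the choice of src0 are assumptions): named lookup is the usual way to reach AMDGPU operands, with nullptr meaning the instruction has no such operand.

  #include "SIInstrInfo.h"
  using namespace llvm;
  // Hypothetical: return the register in src0, or an invalid Register if absent.
  Register getSrc0Reg(const SIInstrInfo &TII, MachineInstr &MI) {
    if (const MachineOperand *Src0 =
            TII.getNamedOperand(MI, AMDGPU::OpName::src0))
      if (Src0->isReg())
        return Src0->getReg();
    return Register();
  }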
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were the OpIdx operand of MI.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destination...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destination...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:231
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:202
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:209
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:219
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:224
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:206
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:201
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:218
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:216
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:205
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:236
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:237
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:212
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:248
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:204
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:217
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:223
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:242
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:213
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:238
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:220
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:203
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:228
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:587
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:589
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:586
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:588
@ TI_CONSTDATA_START
Definition AMDGPU.h:585
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool isGFX1250(const MCSubtargetInfo &STI)
int getMCOpcode(uint16_t Opcode, unsigned Gen)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:61
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:72
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:532
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:207
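Worked values, checked at compile time, just to pin down the semantics:

  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::maxUIntN(8) == 255, "2^8 - 1");
  static_assert(llvm::maxUIntN(16) == 65535, "2^16 - 1");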
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
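A minimal builder sketch; the opcode, registers and insertion point are illustrative assumptions, not taken from this file.

  #include "SIInstrInfo.h"
  using namespace llvm;
  // Hypothetical: materialize the constant 0 into DestReg before iterator I.
  void emitZero(const SIInstrInfo &TII, MachineBasicBlock &MBB,
                MachineBasicBlock::iterator I, const DebugLoc &DL,
                Register DestReg) {
    BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), DestReg).addImm(0);
  }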
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
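Compile-time worked values for the signed check and its unsigned counterpart isUInt (listed further below):

  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::isInt<16>(32767) && !llvm::isInt<16>(32768), "i16 range");
  static_assert(llvm::isUInt<8>(255) && !llvm::isUInt<8>(256), "u8 range");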
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2484
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is Skew mod Align.
Definition MathExtras.h:546
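Two worked values, the second exercising the optional Skew parameter:

  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::alignDown(13u, 4u) == 12u, "largest multiple of 4 <= 13");
  static_assert(llvm::alignDown(13u, 4u, 1u) == 13u,
                "largest x <= 13 with x % 4 == 1");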
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
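A worked value (the argument must be an unsigned integer type):

  #include "llvm/ADT/bit.h"
  static_assert(llvm::popcount(0xF0u) == 4, "four bits set in 0b11110000");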
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:202
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
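A small illustrative check; since the declaration above is not constexpr, a hypothetical helper with runtime asserts is used instead of static_assert.

  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  // floor(log2(32)) == 5, floor(log2(1)) == 0.
  inline void log2Examples() {
    assert(llvm::Log2_32(32) == 5);
    assert(llvm::Log2_32(1) == 0);
  }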
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, const MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair, skipping copy-like instructions and subre...
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
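Compile-time worked values for Hi_32 and its counterpart Lo_32 (listed further below):

  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::Hi_32(0x0123456789ABCDEFULL) == 0x01234567u, "upper half");
  static_assert(llvm::Lo_32(0x0123456789ABCDEFULL) == 0x89ABCDEFu, "lower half");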
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI VirtRegInfo AnalyzeVirtRegInBundle(MachineInstr &MI, Register Reg, SmallVectorImpl< std::pair< MachineInstr *, unsigned > > *Ops=nullptr)
AnalyzeVirtRegInBundle - Analyze how the current instruction or bundle uses a virtual register.
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
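Worked values (unsigned arguments), checked at compile time:

  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::divideCeil(7u, 4u) == 2u, "7/4 rounded up");
  static_assert(llvm::divideCeil(8u, 4u) == 2u, "exact division is unchanged");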
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:118
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1909
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
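A compile-time worked value: the 16-bit pattern 0x8000 has its sign bit set, so it extends to -32768.

  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::SignExtend64<16>(0x8000) == -32768, "bit 15 is the sign bit");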
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
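A compile-time worked value:

  #include "llvm/Support/MathExtras.h"
  #include <cstdint>
  static_assert(llvm::maskTrailingOnes<uint32_t>(4) == 0xFu, "four low bits set");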
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
GenericCycleInfo< MachineSSAContext > MachineCycleInfo
MachineCycleInfo::CycleT MachineCycle
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Helper struct for the implementation of 3-address conversion to communicate updates made to instructi...
MachineInstr * RemoveMIUse
Other instruction whose def is no longer used by the converted instruction.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:56
MachineInstr * top() const
Definition SIInstrInfo.h:61
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:80
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.
VirtRegInfo - Information about a virtual register used by a set of operands.
bool Reads
Reads - One of the operands read the virtual register.
bool Writes
Writes - One of the operands writes the virtual register.