LLVM 22.0.0git
SIInstrInfo.cpp
1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "GCNHazardRecognizer.h"
18#include "GCNSubtarget.h"
21#include "llvm/ADT/STLExtras.h"
32#include "llvm/IR/IntrinsicsAMDGPU.h"
33#include "llvm/MC/MCContext.h"
36
37using namespace llvm;
38
39#define DEBUG_TYPE "si-instr-info"
40
41#define GET_INSTRINFO_CTOR_DTOR
42#include "AMDGPUGenInstrInfo.inc"
43
44namespace llvm::AMDGPU {
45#define GET_D16ImageDimIntrinsics_IMPL
46#define GET_ImageDimIntrinsicTable_IMPL
47#define GET_RsrcIntrinsics_IMPL
48#include "AMDGPUGenSearchableTables.inc"
49} // namespace llvm::AMDGPU
50
51// Must be at least 4 to be able to branch over minimum unconditional branch
52// code. This is only for making it possible to write reasonably small tests for
53// long branches.
54static cl::opt<unsigned>
55BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
56 cl::desc("Restrict range of branch instructions (DEBUG)"));
57
58static cl::opt<bool> Fix16BitCopies(
59 "amdgpu-fix-16-bit-physreg-copies",
60 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
61 cl::init(true),
62 cl::ReallyHidden);
63
64SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
65 : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
66 RI(ST), ST(ST) {
67 SchedModel.init(&ST);
68}
69
70//===----------------------------------------------------------------------===//
71// TargetInstrInfo callbacks
72//===----------------------------------------------------------------------===//
73
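// Count the operands of \p Node, ignoring any trailing glue operands.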
74static unsigned getNumOperandsNoGlue(SDNode *Node) {
75 unsigned N = Node->getNumOperands();
76 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
77 --N;
78 return N;
79}
80
81/// Returns true if both nodes have the same value for the given
82/// operand \p Op, or if both nodes do not have this operand.
83static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
84 AMDGPU::OpName OpName) {
85 unsigned Opc0 = N0->getMachineOpcode();
86 unsigned Opc1 = N1->getMachineOpcode();
87
88 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
89 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
90
91 if (Op0Idx == -1 && Op1Idx == -1)
92 return true;
93
94
95 if ((Op0Idx == -1 && Op1Idx != -1) ||
96 (Op1Idx == -1 && Op0Idx != -1))
97 return false;
98
99 // getNamedOperandIdx returns the index for the MachineInstr's operands,
100 // which includes the result as the first operand. We are indexing into the
101 // MachineSDNode's operands, so we need to skip the result operand to get
102 // the real index.
103 --Op0Idx;
104 --Op1Idx;
105
106 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
107}
108
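// Whether \p MI belongs to an instruction class that is generally safe to
// rematerialize; SMRD loads are only safe when every memory operand is an
// invariant load.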
109static bool canRemat(const MachineInstr &MI) {
110
111 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
112 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
113 SIInstrInfo::isSALU(MI))
114 return true;
115
116 if (SIInstrInfo::isSMRD(MI)) {
117 return !MI.memoperands_empty() &&
118 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
119 return MMO->isLoad() && MMO->isInvariant();
120 });
121 }
122
123 return false;
124}
125
126bool SIInstrInfo::isReallyTriviallyReMaterializable(
127 const MachineInstr &MI) const {
128
129 if (canRemat(MI)) {
130 // Normally a VALU use of exec would block rematerialization, but an
131 // implicit exec read is OK in this case since all VALU instructions have
132 // one. Apart from that, we want all of the generic logic for this.
133
134 // Another potential implicit use is mode register. The core logic of
135 // the RA will not attempt rematerialization if mode is set anywhere
136 // in the function, otherwise it is safe since mode is not changed.
137
138 // This differs from the generic method, which does not allow
139 // rematerialization if there are virtual register uses. We allow this,
140 // therefore this method includes SOP instructions as well.
141 if (!MI.hasImplicitDef() &&
142 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
143 !MI.mayRaiseFPException())
144 return true;
145 }
146
147 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
148}
149
150// Returns true if the scalar result of a VALU instruction depends on exec.
151bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
152 // Ignore comparisons which are only used masked with exec.
153 // This allows some hoisting/sinking of VALU comparisons.
154 if (MI.isCompare()) {
155 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
156 if (!Dst)
157 return true;
158
159 Register DstReg = Dst->getReg();
160 if (!DstReg.isVirtual())
161 return true;
162
163 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
164 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
165 switch (Use.getOpcode()) {
166 case AMDGPU::S_AND_SAVEEXEC_B32:
167 case AMDGPU::S_AND_SAVEEXEC_B64:
168 break;
169 case AMDGPU::S_AND_B32:
170 case AMDGPU::S_AND_B64:
171 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
172 return true;
173 break;
174 default:
175 return true;
176 }
177 }
178 return false;
179 }
180
181 switch (MI.getOpcode()) {
182 default:
183 break;
184 case AMDGPU::V_READFIRSTLANE_B32:
185 return true;
186 }
187
188 return false;
189}
190
191bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
192 // Any implicit use of exec by VALU is not a real register read.
193 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
194 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
195}
196
197bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
198 MachineBasicBlock *SuccToSinkTo,
199 MachineCycleInfo *CI) const {
200 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
201 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
202 return true;
203
204 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
205 // Check if sinking of MI would create temporal divergent use.
206 for (auto Op : MI.uses()) {
207 if (Op.isReg() && Op.getReg().isVirtual() &&
208 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
209 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
210
211 // SgprDef defined inside cycle
212 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
213 if (FromCycle == nullptr)
214 continue;
215
216 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
217 // Check if there is a FromCycle that contains SgprDef's basic block but
218 // does not contain SuccToSinkTo and also has divergent exit condition.
219 while (FromCycle && !FromCycle->contains(ToCycle)) {
220 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
221 FromCycle->getExitingBlocks(ExitingBlocks);
222
223 // FromCycle has divergent exit condition.
224 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
225 if (hasDivergentBranch(ExitingBlock))
226 return false;
227 }
228
229 FromCycle = FromCycle->getParentCycle();
230 }
231 }
232 }
233
234 return true;
235}
236
237bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
238 int64_t &Offset0,
239 int64_t &Offset1) const {
240 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
241 return false;
242
243 unsigned Opc0 = Load0->getMachineOpcode();
244 unsigned Opc1 = Load1->getMachineOpcode();
245
246 // Make sure both are actually loads.
247 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
248 return false;
249
250 // A mayLoad instruction without a def is not a load. Likely a prefetch.
251 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
252 return false;
253
254 if (isDS(Opc0) && isDS(Opc1)) {
255
256 // FIXME: Handle this case:
257 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
258 return false;
259
260 // Check base reg.
261 if (Load0->getOperand(0) != Load1->getOperand(0))
262 return false;
263
264 // Skip read2 / write2 variants for simplicity.
265 // TODO: We should report true if the used offsets are adjacent (excluded
266 // st64 versions).
267 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
268 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
269 if (Offset0Idx == -1 || Offset1Idx == -1)
270 return false;
271
272 // XXX - be careful of dataless loads
273 // getNamedOperandIdx returns the index for MachineInstrs. Since they
274 // include the output in the operand list, but SDNodes don't, we need to
275 // subtract the index by one.
276 Offset0Idx -= get(Opc0).NumDefs;
277 Offset1Idx -= get(Opc1).NumDefs;
278 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
279 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
280 return true;
281 }
282
283 if (isSMRD(Opc0) && isSMRD(Opc1)) {
284 // Skip time and cache invalidation instructions.
285 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
286 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
287 return false;
288
289 unsigned NumOps = getNumOperandsNoGlue(Load0);
290 if (NumOps != getNumOperandsNoGlue(Load1))
291 return false;
292
293 // Check base reg.
294 if (Load0->getOperand(0) != Load1->getOperand(0))
295 return false;
296
297 // Match register offsets, if both register and immediate offsets present.
298 assert(NumOps == 4 || NumOps == 5);
299 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
300 return false;
301
302 const ConstantSDNode *Load0Offset =
303 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
304 const ConstantSDNode *Load1Offset =
305 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
306
307 if (!Load0Offset || !Load1Offset)
308 return false;
309
310 Offset0 = Load0Offset->getZExtValue();
311 Offset1 = Load1Offset->getZExtValue();
312 return true;
313 }
314
315 // MUBUF and MTBUF can access the same addresses.
316 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
317
318 // MUBUF and MTBUF have vaddr at different indices.
319 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
320 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
321 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
322 return false;
323
324 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
325 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
326
327 if (OffIdx0 == -1 || OffIdx1 == -1)
328 return false;
329
330 // getNamedOperandIdx returns the index for MachineInstrs. Since they
331 // include the output in the operand list, but SDNodes don't, we need to
332 // subtract the index by one.
333 OffIdx0 -= get(Opc0).NumDefs;
334 OffIdx1 -= get(Opc1).NumDefs;
335
336 SDValue Off0 = Load0->getOperand(OffIdx0);
337 SDValue Off1 = Load1->getOperand(OffIdx1);
338
339 // The offset might be a FrameIndexSDNode.
340 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
341 return false;
342
343 Offset0 = Off0->getAsZExtVal();
344 Offset1 = Off1->getAsZExtVal();
345 return true;
346 }
347
348 return false;
349}
350
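// The ST64 forms of ds_read2/ds_write2 scale offset0/offset1 by 64 elements.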
351static bool isStride64(unsigned Opc) {
352 switch (Opc) {
353 case AMDGPU::DS_READ2ST64_B32:
354 case AMDGPU::DS_READ2ST64_B64:
355 case AMDGPU::DS_WRITE2ST64_B32:
356 case AMDGPU::DS_WRITE2ST64_B64:
357 return true;
358 default:
359 return false;
360 }
361}
362
363bool SIInstrInfo::getMemOperandsWithOffsetWidth(
364 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
365 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
366 const TargetRegisterInfo *TRI) const {
367 if (!LdSt.mayLoadOrStore())
368 return false;
369
370 unsigned Opc = LdSt.getOpcode();
371 OffsetIsScalable = false;
372 const MachineOperand *BaseOp, *OffsetOp;
373 int DataOpIdx;
374
375 if (isDS(LdSt)) {
376 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
377 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
378 if (OffsetOp) {
379 // Normal, single offset LDS instruction.
380 if (!BaseOp) {
381 // DS_CONSUME/DS_APPEND use M0 for the base address.
382 // TODO: find the implicit use operand for M0 and use that as BaseOp?
383 return false;
384 }
385 BaseOps.push_back(BaseOp);
386 Offset = OffsetOp->getImm();
387 // Get appropriate operand, and compute width accordingly.
388 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
389 if (DataOpIdx == -1)
390 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
391 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
392 Width = LocationSize::precise(64);
393 else
394 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
395 } else {
396 // The 2 offset instructions use offset0 and offset1 instead. We can treat
397 // these as a load with a single offset if the 2 offsets are consecutive.
398 // We will use this for some partially aligned loads.
399 const MachineOperand *Offset0Op =
400 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
401 const MachineOperand *Offset1Op =
402 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
403
404 unsigned Offset0 = Offset0Op->getImm() & 0xff;
405 unsigned Offset1 = Offset1Op->getImm() & 0xff;
406 if (Offset0 + 1 != Offset1)
407 return false;
408
409 // Each of these offsets is in element sized units, so we need to convert
410 // to bytes of the individual reads.
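  // For example, a ds_read2_b32 writes a 64-bit destination register, so
  // EltSize is 64 / 16 = 4 bytes and offset0 = 4 corresponds to a byte offset
  // of 16 (scaled by a further 64 for the st64 variants).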
411
412 unsigned EltSize;
413 if (LdSt.mayLoad())
414 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
415 else {
416 assert(LdSt.mayStore());
417 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
418 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
419 }
420
421 if (isStride64(Opc))
422 EltSize *= 64;
423
424 BaseOps.push_back(BaseOp);
425 Offset = EltSize * Offset0;
426 // Get appropriate operand(s), and compute width accordingly.
427 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
428 if (DataOpIdx == -1) {
429 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
430 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
431 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
432 Width = LocationSize::precise(
433 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
434 } else {
435 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
436 }
437 }
438 return true;
439 }
440
441 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
442 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
443 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
444 return false;
445 BaseOps.push_back(RSrc);
446 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
447 if (BaseOp && !BaseOp->isFI())
448 BaseOps.push_back(BaseOp);
449 const MachineOperand *OffsetImm =
450 getNamedOperand(LdSt, AMDGPU::OpName::offset);
451 Offset = OffsetImm->getImm();
452 const MachineOperand *SOffset =
453 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
454 if (SOffset) {
455 if (SOffset->isReg())
456 BaseOps.push_back(SOffset);
457 else
458 Offset += SOffset->getImm();
459 }
460 // Get appropriate operand, and compute width accordingly.
461 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
462 if (DataOpIdx == -1)
463 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
464 if (DataOpIdx == -1) // LDS DMA
465 return false;
466 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
467 return true;
468 }
469
470 if (isImage(LdSt)) {
471 auto RsrcOpName =
472 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
473 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
474 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
475 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
476 if (VAddr0Idx >= 0) {
477 // GFX10 possible NSA encoding.
478 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
479 BaseOps.push_back(&LdSt.getOperand(I));
480 } else {
481 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
482 }
483 Offset = 0;
484 // Get appropriate operand, and compute width accordingly.
485 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
486 if (DataOpIdx == -1)
487 return false; // no return sampler
488 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
489 return true;
490 }
491
492 if (isSMRD(LdSt)) {
493 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
494 if (!BaseOp) // e.g. S_MEMTIME
495 return false;
496 BaseOps.push_back(BaseOp);
497 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
498 Offset = OffsetOp ? OffsetOp->getImm() : 0;
499 // Get appropriate operand, and compute width accordingly.
500 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
501 if (DataOpIdx == -1)
502 return false;
503 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
504 return true;
505 }
506
507 if (isFLAT(LdSt)) {
508 // Instructions have either vaddr or saddr or both or none.
509 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
510 if (BaseOp)
511 BaseOps.push_back(BaseOp);
512 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
513 if (BaseOp)
514 BaseOps.push_back(BaseOp);
515 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
516 // Get appropriate operand, and compute width accordingly.
517 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
518 if (DataOpIdx == -1)
519 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
520 if (DataOpIdx == -1) // LDS DMA
521 return false;
522 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
523 return true;
524 }
525
526 return false;
527}
528
529static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
530 ArrayRef<const MachineOperand *> BaseOps1,
531 const MachineInstr &MI2,
532 ArrayRef<const MachineOperand *> BaseOps2) {
533 // Only examine the first "base" operand of each instruction, on the
534 // assumption that it represents the real base address of the memory access.
535 // Other operands are typically offsets or indices from this base address.
536 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
537 return true;
538
539 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
540 return false;
541
542 auto *MO1 = *MI1.memoperands_begin();
543 auto *MO2 = *MI2.memoperands_begin();
544 if (MO1->getAddrSpace() != MO2->getAddrSpace())
545 return false;
546
547 const auto *Base1 = MO1->getValue();
548 const auto *Base2 = MO2->getValue();
549 if (!Base1 || !Base2)
550 return false;
551 Base1 = getUnderlyingObject(Base1);
552 Base2 = getUnderlyingObject(Base2);
553
554 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
555 return false;
556
557 return Base1 == Base2;
558}
559
560bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
561 int64_t Offset1, bool OffsetIsScalable1,
562 ArrayRef<const MachineOperand *> BaseOps2,
563 int64_t Offset2, bool OffsetIsScalable2,
564 unsigned ClusterSize,
565 unsigned NumBytes) const {
566 // If the mem ops (to be clustered) do not have the same base ptr, then they
567 // should not be clustered
568 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
569 if (!BaseOps1.empty() && !BaseOps2.empty()) {
570 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
571 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
572 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
573 return false;
574
575 const SIMachineFunctionInfo *MFI =
576 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
577 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
578 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
579 // If only one base op is empty, they do not have the same base ptr
580 return false;
581 }
582
583 // In order to avoid register pressure, on average, the number of DWORDS
584 // loaded together by all clustered mem ops should not exceed
585 // MaxMemoryClusterDWords. This is an empirical value based on certain
586 // observations and performance related experiments.
587 // The good thing about this heuristic is that it avoids clustering of too many
588 // sub-word loads, and also avoids clustering of wide loads. Below is the
589 // brief summary of how the heuristic behaves for various `LoadSize` when
590 // MaxMemoryClusterDWords is 8.
591 //
592 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
593 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
594 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
595 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
596 // (5) LoadSize >= 17: do not cluster
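  // For example, with ClusterSize = 4 and NumBytes = 32 each load is 8 bytes
  // (2 DWORDS), so the cluster covers 8 DWORDS and is still allowed when
  // MaxMemoryClusterDWords is 8.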
597 const unsigned LoadSize = NumBytes / ClusterSize;
598 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
599 return NumDWords <= MaxMemoryClusterDWords;
600}
601
602// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
603// the first 16 loads will be interleaved with the stores, and the next 16 will
604// be clustered as expected. It should really split into 2 16 store batches.
605//
606// Loads are clustered until this returns false, rather than trying to schedule
607// groups of stores. This also means we have to deal with saying different
608// address space loads should be clustered, and ones which might cause bank
609// conflicts.
610//
611// This might be deprecated so it might not be worth that much effort to fix.
612bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
613 int64_t Offset0, int64_t Offset1,
614 unsigned NumLoads) const {
615 assert(Offset1 > Offset0 &&
616 "Second offset should be larger than first offset!");
617 // If we have less than 16 loads in a row, and the offsets are within 64
618 // bytes, then schedule together.
619
620 // A cacheline is 64 bytes (for global memory).
621 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
622}
623
624static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
625 MachineBasicBlock::iterator MI,
626 const DebugLoc &DL, MCRegister DestReg,
627 MCRegister SrcReg, bool KillSrc,
628 const char *Msg = "illegal VGPR to SGPR copy") {
629 MachineFunction *MF = MBB.getParent();
630
631 LLVMContext &C = MF->getFunction().getContext();
632 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
633
634 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
635 .addReg(SrcReg, getKillRegState(KillSrc));
636}
637
638/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
639/// possible to have a direct copy in these cases on GFX908, so an intermediate
640/// VGPR copy is required.
641static void indirectCopyToAGPR(const SIInstrInfo &TII,
642 MachineBasicBlock &MBB,
643 MachineBasicBlock::iterator MI,
644 const DebugLoc &DL, MCRegister DestReg,
645 MCRegister SrcReg, bool KillSrc,
646 RegScavenger &RS, bool RegsOverlap,
647 Register ImpDefSuperReg = Register(),
648 Register ImpUseSuperReg = Register()) {
649 assert((TII.getSubtarget().hasMAIInsts() &&
650 !TII.getSubtarget().hasGFX90AInsts()) &&
651 "Expected GFX908 subtarget.");
652
653 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
654 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
655 "Source register of the copy should be either an SGPR or an AGPR.");
656
657 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
658 "Destination register of the copy should be an AGPR.");
659
660 const SIRegisterInfo &RI = TII.getRegisterInfo();
661
662 // First try to find defining accvgpr_write to avoid temporary registers.
663 // In the case of copies of overlapping AGPRs, we conservatively do not
664 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
665 // an accvgpr_write used for this same copy due to implicit-defs
666 if (!RegsOverlap) {
667 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
668 --Def;
669
670 if (!Def->modifiesRegister(SrcReg, &RI))
671 continue;
672
673 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
674 Def->getOperand(0).getReg() != SrcReg)
675 break;
676
677 MachineOperand &DefOp = Def->getOperand(1);
678 assert(DefOp.isReg() || DefOp.isImm());
679
680 if (DefOp.isReg()) {
681 bool SafeToPropagate = true;
682 // Check that register source operand is not clobbered before MI.
683 // Immediate operands are always safe to propagate.
684 for (auto I = Def; I != MI && SafeToPropagate; ++I)
685 if (I->modifiesRegister(DefOp.getReg(), &RI))
686 SafeToPropagate = false;
687
688 if (!SafeToPropagate)
689 break;
690
691 for (auto I = Def; I != MI; ++I)
692 I->clearRegisterKills(DefOp.getReg(), &RI);
693 }
694
695 MachineInstrBuilder Builder =
696 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
697 .add(DefOp);
698 if (ImpDefSuperReg)
699 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
700
701 if (ImpUseSuperReg) {
702 Builder.addReg(ImpUseSuperReg,
703 getKillRegState(KillSrc) | RegState::Implicit);
704 }
705
706 return;
707 }
708 }
709
710 RS.enterBasicBlockEnd(MBB);
711 RS.backward(std::next(MI));
712
713 // Ideally we want to have three registers for a long reg_sequence copy
714 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
715 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
716 *MBB.getParent());
717
718 // Registers in the sequence are allocated contiguously so we can just
719 // use register number to pick one of three round-robin temps.
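  // For example, a copy to AGPR5 gives RegNo = 5 % 3 = 2, so the loop below
  // may scavenge up to two extra VGPRs on top of the reserved copy VGPR.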
720 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
721 Register Tmp =
722 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
723 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
724 "VGPR used for an intermediate copy should have been reserved.");
725
726 // Only loop through if there are any free registers left. We don't want to
727 // spill.
728 while (RegNo--) {
729 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
730 /* RestoreAfter */ false, 0,
731 /* AllowSpill */ false);
732 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
733 break;
734 Tmp = Tmp2;
735 RS.setRegUsed(Tmp);
736 }
737
738 // Insert copy to temporary VGPR.
739 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
740 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
741 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
742 } else {
743 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
744 }
745
746 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
747 .addReg(SrcReg, getKillRegState(KillSrc));
748 if (ImpUseSuperReg) {
749 UseBuilder.addReg(ImpUseSuperReg,
750 getKillRegState(KillSrc) | RegState::Implicit);
751 }
752
753 MachineInstrBuilder DefBuilder
754 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
755 .addReg(Tmp, RegState::Kill);
756
757 if (ImpDefSuperReg)
758 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
759}
760
761static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
762 MachineBasicBlock::iterator MI, const DebugLoc &DL,
763 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
764 const TargetRegisterClass *RC, bool Forward) {
765 const SIRegisterInfo &RI = TII.getRegisterInfo();
766 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
768 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
769
770 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
771 int16_t SubIdx = BaseIndices[Idx];
772 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
773 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
774 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
775 unsigned Opcode = AMDGPU::S_MOV_B32;
776
777 // Is SGPR aligned? If so try to combine with next.
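    // For example, a copy from s[4:7] to s[12:15] is emitted as two s_mov_b64
    // instead of four s_mov_b32, since all sub-registers are even-aligned.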
778 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
779 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
780 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
781 // Can use SGPR64 copy
782 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
783 SubIdx = RI.getSubRegFromChannel(Channel, 2);
784 DestSubReg = RI.getSubReg(DestReg, SubIdx);
785 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
786 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
787 Opcode = AMDGPU::S_MOV_B64;
788 Idx++;
789 }
790
791 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
792 .addReg(SrcSubReg)
793 .addReg(SrcReg, RegState::Implicit);
794
795 if (!FirstMI)
796 FirstMI = LastMI;
797
798 if (!Forward)
799 I--;
800 }
801
802 assert(FirstMI && LastMI);
803 if (!Forward)
804 std::swap(FirstMI, LastMI);
805
806 FirstMI->addOperand(
807 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
808
809 if (KillSrc)
810 LastMI->addRegisterKilled(SrcReg, &RI);
811}
812
813void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
814 MachineBasicBlock::iterator MI,
815 const DebugLoc &DL, Register DestReg,
816 Register SrcReg, bool KillSrc, bool RenamableDest,
817 bool RenamableSrc) const {
818 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
819 unsigned Size = RI.getRegSizeInBits(*RC);
820 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
821 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
822
823 // The rest of copyPhysReg assumes Src and Dst size are the same size.
824 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
825 // we remove Fix16BitCopies and this code block?
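  // For example, a copy between a 32-bit SGPR and a 16-bit VGPR half is
  // rewritten here to use the lo16 subregister of the 32-bit operand so both
  // sides have the same size.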
826 if (Fix16BitCopies) {
827 if (((Size == 16) != (SrcSize == 16))) {
828 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
830 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
831 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
832 RegToFix = SubReg;
833
834 if (DestReg == SrcReg) {
835 // Identity copy. Insert empty bundle since ExpandPostRA expects an
836 // instruction here.
837 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
838 return;
839 }
840 RC = RI.getPhysRegBaseClass(DestReg);
841 Size = RI.getRegSizeInBits(*RC);
842 SrcRC = RI.getPhysRegBaseClass(SrcReg);
843 SrcSize = RI.getRegSizeInBits(*SrcRC);
844 }
845 }
846
847 if (RC == &AMDGPU::VGPR_32RegClass) {
848 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
849 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
850 AMDGPU::AGPR_32RegClass.contains(SrcReg));
851 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
852 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
853 BuildMI(MBB, MI, DL, get(Opc), DestReg)
854 .addReg(SrcReg, getKillRegState(KillSrc));
855 return;
856 }
857
858 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
859 RC == &AMDGPU::SReg_32RegClass) {
860 if (SrcReg == AMDGPU::SCC) {
861 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
862 .addImm(1)
863 .addImm(0);
864 return;
865 }
866
867 if (DestReg == AMDGPU::VCC_LO) {
868 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
869 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
870 .addReg(SrcReg, getKillRegState(KillSrc));
871 } else {
872 // FIXME: Hack until VReg_1 removed.
873 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
874 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
875 .addImm(0)
876 .addReg(SrcReg, getKillRegState(KillSrc));
877 }
878
879 return;
880 }
881
882 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
883 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
884 return;
885 }
886
887 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
888 .addReg(SrcReg, getKillRegState(KillSrc));
889 return;
890 }
891
892 if (RC == &AMDGPU::SReg_64RegClass) {
893 if (SrcReg == AMDGPU::SCC) {
894 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
895 .addImm(1)
896 .addImm(0);
897 return;
898 }
899
900 if (DestReg == AMDGPU::VCC) {
901 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
902 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
903 .addReg(SrcReg, getKillRegState(KillSrc));
904 } else {
905 // FIXME: Hack until VReg_1 removed.
906 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
907 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
908 .addImm(0)
909 .addReg(SrcReg, getKillRegState(KillSrc));
910 }
911
912 return;
913 }
914
915 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
916 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
917 return;
918 }
919
920 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
921 .addReg(SrcReg, getKillRegState(KillSrc));
922 return;
923 }
924
925 if (DestReg == AMDGPU::SCC) {
926 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
927 // but SelectionDAG emits such copies for i1 sources.
928 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
929 // This copy can only be produced by patterns
930 // with explicit SCC, which are known to be enabled
931 // only for subtargets with S_CMP_LG_U64 present.
932 assert(ST.hasScalarCompareEq64());
933 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
934 .addReg(SrcReg, getKillRegState(KillSrc))
935 .addImm(0);
936 } else {
937 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
938 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
939 .addReg(SrcReg, getKillRegState(KillSrc))
940 .addImm(0);
941 }
942
943 return;
944 }
945
946 if (RC == &AMDGPU::AGPR_32RegClass) {
947 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
948 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
949 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
950 .addReg(SrcReg, getKillRegState(KillSrc));
951 return;
952 }
953
954 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
955 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
956 .addReg(SrcReg, getKillRegState(KillSrc));
957 return;
958 }
959
960 // FIXME: Pass should maintain scavenger to avoid scan through the block on
961 // every AGPR spill.
962 RegScavenger RS;
963 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
964 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
965 return;
966 }
967
968 if (Size == 16) {
969 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
970 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
971 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
972
973 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
974 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
975 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
976 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
977 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
978 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
979 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
980 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
981
982 if (IsSGPRDst) {
983 if (!IsSGPRSrc) {
984 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
985 return;
986 }
987
988 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
989 .addReg(NewSrcReg, getKillRegState(KillSrc));
990 return;
991 }
992
993 if (IsAGPRDst || IsAGPRSrc) {
994 if (!DstLow || !SrcLow) {
995 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
996 "Cannot use hi16 subreg with an AGPR!");
997 }
998
999 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
1000 return;
1001 }
1002
1003 if (ST.useRealTrue16Insts()) {
1004 if (IsSGPRSrc) {
1005 assert(SrcLow);
1006 SrcReg = NewSrcReg;
1007 }
1008 // Use the smaller instruction encoding if possible.
1009 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
1010 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1011 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1012 .addReg(SrcReg);
1013 } else {
1014 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1015 .addImm(0) // src0_modifiers
1016 .addReg(SrcReg)
1017 .addImm(0); // op_sel
1018 }
1019 return;
1020 }
1021
1022 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1023 if (!DstLow || !SrcLow) {
1024 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1025 "Cannot use hi16 subreg on VI!");
1026 }
1027
1028 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1029 .addReg(NewSrcReg, getKillRegState(KillSrc));
1030 return;
1031 }
1032
1033 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1034 .addImm(0) // src0_modifiers
1035 .addReg(NewSrcReg)
1036 .addImm(0) // clamp
1037 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1038 : AMDGPU::SDWA::SdwaSel::WORD_1)
1039 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1040 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1041 : AMDGPU::SDWA::SdwaSel::WORD_1)
1042 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1043 // First implicit operand is $exec.
1044 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1045 return;
1046 }
1047
1048 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1049 if (ST.hasMovB64()) {
1050 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1051 .addReg(SrcReg, getKillRegState(KillSrc));
1052 return;
1053 }
1054 if (ST.hasPkMovB32()) {
1055 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1056 .addImm(SISrcMods::OP_SEL_1)
1057 .addReg(SrcReg)
1058 .addImm(SISrcMods::OP_SEL_1)
1059 .addReg(SrcReg)
1060 .addImm(0) // op_sel_lo
1061 .addImm(0) // op_sel_hi
1062 .addImm(0) // neg_lo
1063 .addImm(0) // neg_hi
1064 .addImm(0) // clamp
1065 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1066 return;
1067 }
1068 }
1069
1070 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1071 if (RI.isSGPRClass(RC)) {
1072 if (!RI.isSGPRClass(SrcRC)) {
1073 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1074 return;
1075 }
1076 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1077 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1078 Forward);
1079 return;
1080 }
1081
1082 unsigned EltSize = 4;
1083 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1084 if (RI.isAGPRClass(RC)) {
1085 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1086 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1087 else if (RI.hasVGPRs(SrcRC) ||
1088 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1089 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1090 else
1091 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1092 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1093 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1094 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1095 (RI.isProperlyAlignedRC(*RC) &&
1096 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1097 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1098 if (ST.hasMovB64()) {
1099 Opcode = AMDGPU::V_MOV_B64_e32;
1100 EltSize = 8;
1101 } else if (ST.hasPkMovB32()) {
1102 Opcode = AMDGPU::V_PK_MOV_B32;
1103 EltSize = 8;
1104 }
1105 }
1106
1107 // For the cases where we need an intermediate instruction/temporary register
1108 // (destination is an AGPR), we need a scavenger.
1109 //
1110 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1111 // whole block for every handled copy.
1112 std::unique_ptr<RegScavenger> RS;
1113 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1114 RS = std::make_unique<RegScavenger>();
1115
1116 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1117
1118 // If there is an overlap, we can't kill the super-register on the last
1119 // instruction, since it will also kill the components made live by this def.
1120 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1121 const bool CanKillSuperReg = KillSrc && !Overlap;
1122
1123 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1124 unsigned SubIdx;
1125 if (Forward)
1126 SubIdx = SubIndices[Idx];
1127 else
1128 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1129 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1130 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1131 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1132
1133 bool IsFirstSubreg = Idx == 0;
1134 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1135
1136 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1137 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1138 Register ImpUseSuper = SrcReg;
1139 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1140 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1141 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1142 MachineInstrBuilder MIB =
1143 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1144 .addImm(SISrcMods::OP_SEL_1)
1145 .addReg(SrcSubReg)
1146 .addImm(SISrcMods::OP_SEL_1)
1147 .addReg(SrcSubReg)
1148 .addImm(0) // op_sel_lo
1149 .addImm(0) // op_sel_hi
1150 .addImm(0) // neg_lo
1151 .addImm(0) // neg_hi
1152 .addImm(0) // clamp
1153 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1154 if (IsFirstSubreg)
1155 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1156 } else {
1157 MachineInstrBuilder Builder =
1158 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1159 if (IsFirstSubreg)
1160 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1161
1162 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1163 }
1164 }
1165}
1166
1167int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1168 int NewOpc;
1169
1170 // Try to map original to commuted opcode
1171 NewOpc = AMDGPU::getCommuteRev(Opcode);
1172 if (NewOpc != -1)
1173 // Check if the commuted (REV) opcode exists on the target.
1174 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1175
1176 // Try to map commuted to original opcode
1177 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1178 if (NewOpc != -1)
1179 // Check if the original (non-REV) opcode exists on the target.
1180 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1181
1182 return Opcode;
1183}
1184
1185const TargetRegisterClass *
1186SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1187 return &AMDGPU::VGPR_32RegClass;
1188}
1189
1190void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1191 MachineBasicBlock::iterator I,
1192 const DebugLoc &DL, Register DstReg,
1193 ArrayRef<MachineOperand> Cond,
1194 Register TrueReg,
1195 Register FalseReg) const {
1196 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1197 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1198 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1199 "Not a VGPR32 reg");
1200
1201 if (Cond.size() == 1) {
1202 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1203 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1204 .add(Cond[0]);
1205 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1206 .addImm(0)
1207 .addReg(FalseReg)
1208 .addImm(0)
1209 .addReg(TrueReg)
1210 .addReg(SReg);
1211 } else if (Cond.size() == 2) {
1212 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1213 switch (Cond[0].getImm()) {
1214 case SIInstrInfo::SCC_TRUE: {
1215 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1216 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1217 : AMDGPU::S_CSELECT_B64), SReg)
1218 .addImm(1)
1219 .addImm(0);
1220 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1221 .addImm(0)
1222 .addReg(FalseReg)
1223 .addImm(0)
1224 .addReg(TrueReg)
1225 .addReg(SReg);
1226 break;
1227 }
1228 case SIInstrInfo::SCC_FALSE: {
1229 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1230 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1231 : AMDGPU::S_CSELECT_B64), SReg)
1232 .addImm(0)
1233 .addImm(1);
1234 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1235 .addImm(0)
1236 .addReg(FalseReg)
1237 .addImm(0)
1238 .addReg(TrueReg)
1239 .addReg(SReg);
1240 break;
1241 }
1242 case SIInstrInfo::VCCNZ: {
1243 MachineOperand RegOp = Cond[1];
1244 RegOp.setImplicit(false);
1245 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1246 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1247 .add(RegOp);
1248 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1249 .addImm(0)
1250 .addReg(FalseReg)
1251 .addImm(0)
1252 .addReg(TrueReg)
1253 .addReg(SReg);
1254 break;
1255 }
1256 case SIInstrInfo::VCCZ: {
1257 MachineOperand RegOp = Cond[1];
1258 RegOp.setImplicit(false);
1259 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1260 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1261 .add(RegOp);
1262 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1263 .addImm(0)
1264 .addReg(TrueReg)
1265 .addImm(0)
1266 .addReg(FalseReg)
1267 .addReg(SReg);
1268 break;
1269 }
1270 case SIInstrInfo::EXECNZ: {
1271 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1272 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1273 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1274 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1275 .addImm(0);
1276 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1277 : AMDGPU::S_CSELECT_B64), SReg)
1278 .addImm(1)
1279 .addImm(0);
1280 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1281 .addImm(0)
1282 .addReg(FalseReg)
1283 .addImm(0)
1284 .addReg(TrueReg)
1285 .addReg(SReg);
1286 break;
1287 }
1288 case SIInstrInfo::EXECZ: {
1289 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1290 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1291 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1292 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1293 .addImm(0);
1294 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1295 : AMDGPU::S_CSELECT_B64), SReg)
1296 .addImm(0)
1297 .addImm(1);
1298 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1299 .addImm(0)
1300 .addReg(FalseReg)
1301 .addImm(0)
1302 .addReg(TrueReg)
1303 .addReg(SReg);
1304 llvm_unreachable("Unhandled branch predicate EXECZ");
1305 break;
1306 }
1307 default:
1308 llvm_unreachable("invalid branch predicate");
1309 }
1310 } else {
1311 llvm_unreachable("Can only handle Cond size 1 or 2");
1312 }
1313}
1314
1315Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1316 MachineBasicBlock::iterator I,
1317 const DebugLoc &DL,
1318 Register SrcReg, int Value) const {
1319 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1320 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1321 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1322 .addImm(Value)
1323 .addReg(SrcReg);
1324
1325 return Reg;
1326}
1327
1328Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1329 MachineBasicBlock::iterator I,
1330 const DebugLoc &DL,
1331 Register SrcReg, int Value) const {
1332 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1333 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1334 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1335 .addImm(Value)
1336 .addReg(SrcReg);
1337
1338 return Reg;
1339}
1340
1341bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1342 const Register Reg,
1343 int64_t &ImmVal) const {
1344 switch (MI.getOpcode()) {
1345 case AMDGPU::V_MOV_B32_e32:
1346 case AMDGPU::S_MOV_B32:
1347 case AMDGPU::S_MOVK_I32:
1348 case AMDGPU::S_MOV_B64:
1349 case AMDGPU::V_MOV_B64_e32:
1350 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1351 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1352 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1353 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1354 case AMDGPU::V_MOV_B64_PSEUDO: {
1355 const MachineOperand &Src0 = MI.getOperand(1);
1356 if (Src0.isImm()) {
1357 ImmVal = Src0.getImm();
1358 return MI.getOperand(0).getReg() == Reg;
1359 }
1360
1361 return false;
1362 }
1363 case AMDGPU::S_BREV_B32:
1364 case AMDGPU::V_BFREV_B32_e32:
1365 case AMDGPU::V_BFREV_B32_e64: {
1366 const MachineOperand &Src0 = MI.getOperand(1);
1367 if (Src0.isImm()) {
1368 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1369 return MI.getOperand(0).getReg() == Reg;
1370 }
1371
1372 return false;
1373 }
1374 case AMDGPU::S_NOT_B32:
1375 case AMDGPU::V_NOT_B32_e32:
1376 case AMDGPU::V_NOT_B32_e64: {
1377 const MachineOperand &Src0 = MI.getOperand(1);
1378 if (Src0.isImm()) {
1379 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1380 return MI.getOperand(0).getReg() == Reg;
1381 }
1382
1383 return false;
1384 }
1385 default:
1386 return false;
1387 }
1388}
1389
1390unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1391
1392 if (RI.isAGPRClass(DstRC))
1393 return AMDGPU::COPY;
1394 if (RI.getRegSizeInBits(*DstRC) == 16) {
1395 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1396 // before RA.
1397 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1398 }
1399 if (RI.getRegSizeInBits(*DstRC) == 32)
1400 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1401 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1402 return AMDGPU::S_MOV_B64;
1403 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1404 return AMDGPU::V_MOV_B64_PSEUDO;
1405 return AMDGPU::COPY;
1406}
1407
1408const MCInstrDesc &
1409SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1410 bool IsIndirectSrc) const {
1411 if (IsIndirectSrc) {
1412 if (VecSize <= 32) // 4 bytes
1413 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1414 if (VecSize <= 64) // 8 bytes
1415 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1416 if (VecSize <= 96) // 12 bytes
1417 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1418 if (VecSize <= 128) // 16 bytes
1419 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1420 if (VecSize <= 160) // 20 bytes
1421 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1422 if (VecSize <= 256) // 32 bytes
1423 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1424 if (VecSize <= 288) // 36 bytes
1425 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1426 if (VecSize <= 320) // 40 bytes
1427 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1428 if (VecSize <= 352) // 44 bytes
1429 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1430 if (VecSize <= 384) // 48 bytes
1431 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1432 if (VecSize <= 512) // 64 bytes
1433 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1434 if (VecSize <= 1024) // 128 bytes
1435 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1436
1437 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1438 }
1439
1440 if (VecSize <= 32) // 4 bytes
1441 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1442 if (VecSize <= 64) // 8 bytes
1443 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1444 if (VecSize <= 96) // 12 bytes
1445 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1446 if (VecSize <= 128) // 16 bytes
1447 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1448 if (VecSize <= 160) // 20 bytes
1449 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1450 if (VecSize <= 256) // 32 bytes
1451 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1452 if (VecSize <= 288) // 36 bytes
1453 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1454 if (VecSize <= 320) // 40 bytes
1455 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1456 if (VecSize <= 352) // 44 bytes
1457 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1458 if (VecSize <= 384) // 48 bytes
1459 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1460 if (VecSize <= 512) // 64 bytes
1461 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1462 if (VecSize <= 1024) // 128 bytes
1463 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1464
1465 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1466}
1467
1468static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1469 if (VecSize <= 32) // 4 bytes
1470 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1471 if (VecSize <= 64) // 8 bytes
1472 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1473 if (VecSize <= 96) // 12 bytes
1474 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1475 if (VecSize <= 128) // 16 bytes
1476 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1477 if (VecSize <= 160) // 20 bytes
1478 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1479 if (VecSize <= 256) // 32 bytes
1480 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1481 if (VecSize <= 288) // 36 bytes
1482 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1483 if (VecSize <= 320) // 40 bytes
1484 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1485 if (VecSize <= 352) // 44 bytes
1486 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1487 if (VecSize <= 384) // 48 bytes
1488 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1489 if (VecSize <= 512) // 64 bytes
1490 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1491 if (VecSize <= 1024) // 128 bytes
1492 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1493
1494 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1495}
1496
1497static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1498 if (VecSize <= 32) // 4 bytes
1499 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1500 if (VecSize <= 64) // 8 bytes
1501 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1502 if (VecSize <= 96) // 12 bytes
1503 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1504 if (VecSize <= 128) // 16 bytes
1505 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1506 if (VecSize <= 160) // 20 bytes
1507 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1508 if (VecSize <= 256) // 32 bytes
1509 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1510 if (VecSize <= 288) // 36 bytes
1511 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1512 if (VecSize <= 320) // 40 bytes
1513 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1514 if (VecSize <= 352) // 44 bytes
1515 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1516 if (VecSize <= 384) // 48 bytes
1517 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1518 if (VecSize <= 512) // 64 bytes
1519 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1520 if (VecSize <= 1024) // 128 bytes
1521 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1522
1523 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1524}
1525
1526static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1527 if (VecSize <= 64) // 8 bytes
1528 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1529 if (VecSize <= 128) // 16 bytes
1530 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1531 if (VecSize <= 256) // 32 bytes
1532 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1533 if (VecSize <= 512) // 64 bytes
1534 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1535 if (VecSize <= 1024) // 128 bytes
1536 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1537
1538 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1539}
1540
1541const MCInstrDesc &
1542SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1543 bool IsSGPR) const {
1544 if (IsSGPR) {
1545 switch (EltSize) {
1546 case 32:
1547 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1548 case 64:
1549 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1550 default:
1551 llvm_unreachable("invalid reg indexing elt size");
1552 }
1553 }
1554
1555 assert(EltSize == 32 && "invalid reg indexing elt size");
1556 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1557}
1558
1559static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1560 switch (Size) {
1561 case 4:
1562 return AMDGPU::SI_SPILL_S32_SAVE;
1563 case 8:
1564 return AMDGPU::SI_SPILL_S64_SAVE;
1565 case 12:
1566 return AMDGPU::SI_SPILL_S96_SAVE;
1567 case 16:
1568 return AMDGPU::SI_SPILL_S128_SAVE;
1569 case 20:
1570 return AMDGPU::SI_SPILL_S160_SAVE;
1571 case 24:
1572 return AMDGPU::SI_SPILL_S192_SAVE;
1573 case 28:
1574 return AMDGPU::SI_SPILL_S224_SAVE;
1575 case 32:
1576 return AMDGPU::SI_SPILL_S256_SAVE;
1577 case 36:
1578 return AMDGPU::SI_SPILL_S288_SAVE;
1579 case 40:
1580 return AMDGPU::SI_SPILL_S320_SAVE;
1581 case 44:
1582 return AMDGPU::SI_SPILL_S352_SAVE;
1583 case 48:
1584 return AMDGPU::SI_SPILL_S384_SAVE;
1585 case 64:
1586 return AMDGPU::SI_SPILL_S512_SAVE;
1587 case 128:
1588 return AMDGPU::SI_SPILL_S1024_SAVE;
1589 default:
1590 llvm_unreachable("unknown register size");
1591 }
1592}
1593
1594static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1595 switch (Size) {
1596 case 2:
1597 return AMDGPU::SI_SPILL_V16_SAVE;
1598 case 4:
1599 return AMDGPU::SI_SPILL_V32_SAVE;
1600 case 8:
1601 return AMDGPU::SI_SPILL_V64_SAVE;
1602 case 12:
1603 return AMDGPU::SI_SPILL_V96_SAVE;
1604 case 16:
1605 return AMDGPU::SI_SPILL_V128_SAVE;
1606 case 20:
1607 return AMDGPU::SI_SPILL_V160_SAVE;
1608 case 24:
1609 return AMDGPU::SI_SPILL_V192_SAVE;
1610 case 28:
1611 return AMDGPU::SI_SPILL_V224_SAVE;
1612 case 32:
1613 return AMDGPU::SI_SPILL_V256_SAVE;
1614 case 36:
1615 return AMDGPU::SI_SPILL_V288_SAVE;
1616 case 40:
1617 return AMDGPU::SI_SPILL_V320_SAVE;
1618 case 44:
1619 return AMDGPU::SI_SPILL_V352_SAVE;
1620 case 48:
1621 return AMDGPU::SI_SPILL_V384_SAVE;
1622 case 64:
1623 return AMDGPU::SI_SPILL_V512_SAVE;
1624 case 128:
1625 return AMDGPU::SI_SPILL_V1024_SAVE;
1626 default:
1627 llvm_unreachable("unknown register size");
1628 }
1629}
1630
1631static unsigned getAVSpillSaveOpcode(unsigned Size) {
1632 switch (Size) {
1633 case 4:
1634 return AMDGPU::SI_SPILL_AV32_SAVE;
1635 case 8:
1636 return AMDGPU::SI_SPILL_AV64_SAVE;
1637 case 12:
1638 return AMDGPU::SI_SPILL_AV96_SAVE;
1639 case 16:
1640 return AMDGPU::SI_SPILL_AV128_SAVE;
1641 case 20:
1642 return AMDGPU::SI_SPILL_AV160_SAVE;
1643 case 24:
1644 return AMDGPU::SI_SPILL_AV192_SAVE;
1645 case 28:
1646 return AMDGPU::SI_SPILL_AV224_SAVE;
1647 case 32:
1648 return AMDGPU::SI_SPILL_AV256_SAVE;
1649 case 36:
1650 return AMDGPU::SI_SPILL_AV288_SAVE;
1651 case 40:
1652 return AMDGPU::SI_SPILL_AV320_SAVE;
1653 case 44:
1654 return AMDGPU::SI_SPILL_AV352_SAVE;
1655 case 48:
1656 return AMDGPU::SI_SPILL_AV384_SAVE;
1657 case 64:
1658 return AMDGPU::SI_SPILL_AV512_SAVE;
1659 case 128:
1660 return AMDGPU::SI_SPILL_AV1024_SAVE;
1661 default:
1662 llvm_unreachable("unknown register size");
1663 }
1664}
1665
1666static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1667 bool IsVectorSuperClass) {
1668 // Currently, only 32-bit WWM register spills are needed.
1669 if (Size != 4)
1670 llvm_unreachable("unknown wwm register spill size");
1671
1672 if (IsVectorSuperClass)
1673 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1674
1675 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1676}
1677
1678unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1679 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1680 const SIMachineFunctionInfo &MFI) const {
1681 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1682
1683 // Choose the right opcode if spilling a WWM register.
1684 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1685 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1686
1687 // TODO: Check if AGPRs are available
1688 if (ST.hasMAIInsts())
1689 return getAVSpillSaveOpcode(Size);
1690
1691 return getVGPRSpillSaveOpcode(Size);
1692}
1693
1694void SIInstrInfo::storeRegToStackSlot(
1695 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1696 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1697 const TargetRegisterInfo *TRI, Register VReg,
1698 MachineInstr::MIFlag Flags) const {
1699 MachineFunction *MF = MBB.getParent();
1700 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1701 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1702 const DebugLoc &DL = MBB.findDebugLoc(MI);
1703
1704 MachinePointerInfo PtrInfo
1705 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1706 MachineMemOperand *MMO = MF->getMachineMemOperand(
1707 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1708 FrameInfo.getObjectAlign(FrameIndex));
1709 unsigned SpillSize = TRI->getSpillSize(*RC);
1710
1711 MachineRegisterInfo &MRI = MF->getRegInfo();
1712 if (RI.isSGPRClass(RC)) {
1713 MFI->setHasSpilledSGPRs();
1714 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1715 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1716 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1717
1718 // We are only allowed to create one new instruction when spilling
1719 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1720 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1721
1722 // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
1723 // to make sure we are using the correct register class.
1724 if (SrcReg.isVirtual() && SpillSize == 4) {
1725 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1726 }
1727
1728 BuildMI(MBB, MI, DL, OpDesc)
1729 .addReg(SrcReg, getKillRegState(isKill)) // data
1730 .addFrameIndex(FrameIndex) // addr
1731 .addMemOperand(MMO)
1733
1734 if (RI.spillSGPRToVGPR())
1735 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1736 return;
1737 }
1738
1739 unsigned Opcode =
1740 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1741 MFI->setHasSpilledVGPRs();
1742
1743 BuildMI(MBB, MI, DL, get(Opcode))
1744 .addReg(SrcReg, getKillRegState(isKill)) // data
1745 .addFrameIndex(FrameIndex) // addr
1746 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1747 .addImm(0) // offset
1748 .addMemOperand(MMO);
1749}
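// As a sketch (registers, frame index, and MIR syntax are illustrative), a
// 32-bit VGPR spill ends up as something like:
//   SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0 :: (store (s32) into %stack.0)
// whereas an SGPR spill emits the SI_SPILL_S*_SAVE pseudo and, when spilling to
// VGPR lanes, retags the frame index with the SGPRSpill stack ID.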
1750
1751static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1752 switch (Size) {
1753 case 4:
1754 return AMDGPU::SI_SPILL_S32_RESTORE;
1755 case 8:
1756 return AMDGPU::SI_SPILL_S64_RESTORE;
1757 case 12:
1758 return AMDGPU::SI_SPILL_S96_RESTORE;
1759 case 16:
1760 return AMDGPU::SI_SPILL_S128_RESTORE;
1761 case 20:
1762 return AMDGPU::SI_SPILL_S160_RESTORE;
1763 case 24:
1764 return AMDGPU::SI_SPILL_S192_RESTORE;
1765 case 28:
1766 return AMDGPU::SI_SPILL_S224_RESTORE;
1767 case 32:
1768 return AMDGPU::SI_SPILL_S256_RESTORE;
1769 case 36:
1770 return AMDGPU::SI_SPILL_S288_RESTORE;
1771 case 40:
1772 return AMDGPU::SI_SPILL_S320_RESTORE;
1773 case 44:
1774 return AMDGPU::SI_SPILL_S352_RESTORE;
1775 case 48:
1776 return AMDGPU::SI_SPILL_S384_RESTORE;
1777 case 64:
1778 return AMDGPU::SI_SPILL_S512_RESTORE;
1779 case 128:
1780 return AMDGPU::SI_SPILL_S1024_RESTORE;
1781 default:
1782 llvm_unreachable("unknown register size");
1783 }
1784}
1785
1786static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1787 switch (Size) {
1788 case 2:
1789 return AMDGPU::SI_SPILL_V16_RESTORE;
1790 case 4:
1791 return AMDGPU::SI_SPILL_V32_RESTORE;
1792 case 8:
1793 return AMDGPU::SI_SPILL_V64_RESTORE;
1794 case 12:
1795 return AMDGPU::SI_SPILL_V96_RESTORE;
1796 case 16:
1797 return AMDGPU::SI_SPILL_V128_RESTORE;
1798 case 20:
1799 return AMDGPU::SI_SPILL_V160_RESTORE;
1800 case 24:
1801 return AMDGPU::SI_SPILL_V192_RESTORE;
1802 case 28:
1803 return AMDGPU::SI_SPILL_V224_RESTORE;
1804 case 32:
1805 return AMDGPU::SI_SPILL_V256_RESTORE;
1806 case 36:
1807 return AMDGPU::SI_SPILL_V288_RESTORE;
1808 case 40:
1809 return AMDGPU::SI_SPILL_V320_RESTORE;
1810 case 44:
1811 return AMDGPU::SI_SPILL_V352_RESTORE;
1812 case 48:
1813 return AMDGPU::SI_SPILL_V384_RESTORE;
1814 case 64:
1815 return AMDGPU::SI_SPILL_V512_RESTORE;
1816 case 128:
1817 return AMDGPU::SI_SPILL_V1024_RESTORE;
1818 default:
1819 llvm_unreachable("unknown register size");
1820 }
1821}
1822
1823static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1824 switch (Size) {
1825 case 4:
1826 return AMDGPU::SI_SPILL_AV32_RESTORE;
1827 case 8:
1828 return AMDGPU::SI_SPILL_AV64_RESTORE;
1829 case 12:
1830 return AMDGPU::SI_SPILL_AV96_RESTORE;
1831 case 16:
1832 return AMDGPU::SI_SPILL_AV128_RESTORE;
1833 case 20:
1834 return AMDGPU::SI_SPILL_AV160_RESTORE;
1835 case 24:
1836 return AMDGPU::SI_SPILL_AV192_RESTORE;
1837 case 28:
1838 return AMDGPU::SI_SPILL_AV224_RESTORE;
1839 case 32:
1840 return AMDGPU::SI_SPILL_AV256_RESTORE;
1841 case 36:
1842 return AMDGPU::SI_SPILL_AV288_RESTORE;
1843 case 40:
1844 return AMDGPU::SI_SPILL_AV320_RESTORE;
1845 case 44:
1846 return AMDGPU::SI_SPILL_AV352_RESTORE;
1847 case 48:
1848 return AMDGPU::SI_SPILL_AV384_RESTORE;
1849 case 64:
1850 return AMDGPU::SI_SPILL_AV512_RESTORE;
1851 case 128:
1852 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1853 default:
1854 llvm_unreachable("unknown register size");
1855 }
1856}
1857
1858static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1859 bool IsVectorSuperClass) {
1860 // Currently, only 32-bit WWM register spills are needed.
1861 if (Size != 4)
1862 llvm_unreachable("unknown wwm register spill size");
1863
1864 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1865 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1866
1867 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1868}
1869
1870 unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
1871 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1872 const SIMachineFunctionInfo &MFI) const {
1873 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1874
1875 // Choose the right opcode if restoring a WWM register.
1876 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1877 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1878
1879 // TODO: Check if AGPRs are available
1880 if (ST.hasMAIInsts())
1881 return getAVSpillRestoreOpcode(Size);
1882
1883 assert(!RI.isAGPRClass(RC));
1884 return getVGPRSpillRestoreOpcode(Size);
1885}
1886
1889 Register DestReg, int FrameIndex,
1890 const TargetRegisterClass *RC,
1891 const TargetRegisterInfo *TRI,
1892 Register VReg,
1893 MachineInstr::MIFlag Flags) const {
1896 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1897 const DebugLoc &DL = MBB.findDebugLoc(MI);
1898 unsigned SpillSize = TRI->getSpillSize(*RC);
1899
1900 MachinePointerInfo PtrInfo
1901 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1902
1904 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1905 FrameInfo.getObjectAlign(FrameIndex));
1906
1907 if (RI.isSGPRClass(RC)) {
1908 MFI->setHasSpilledSGPRs();
1909 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1910 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1911 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1912
1913 // FIXME: Maybe this should not include a memoperand because it will be
1914 // lowered to non-memory instructions.
1915 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1916 if (DestReg.isVirtual() && SpillSize == 4) {
1918 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1919 }
1920
1921 if (RI.spillSGPRToVGPR())
1922 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1923 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1924 .addFrameIndex(FrameIndex) // addr
1925 .addMemOperand(MMO)
1927
1928 return;
1929 }
1930
1931 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1932 SpillSize, *MFI);
1933 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1934 .addFrameIndex(FrameIndex) // vaddr
1935 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1936 .addImm(0) // offset
1937 .addMemOperand(MMO);
1938}
1939
1942 insertNoops(MBB, MI, 1);
1943}
1944
1947 unsigned Quantity) const {
1949 while (Quantity > 0) {
1950 unsigned Arg = std::min(Quantity, 8u);
1951 Quantity -= Arg;
1952 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1953 }
1954}
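// Each S_NOP encodes up to 8 wait states (immediate N means N + 1 no-ops), so
// for example insertNoops(MBB, MI, 10) emits "s_nop 7" followed by "s_nop 1".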
1955
1957 auto *MF = MBB.getParent();
1959
1960 assert(Info->isEntryFunction());
1961
1962 if (MBB.succ_empty()) {
1963 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1964 if (HasNoTerminator) {
1965 if (Info->returnsVoid()) {
1966 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1967 } else {
1968 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1969 }
1970 }
1971 }
1972}
1973
1977 const DebugLoc &DL) const {
1979 constexpr unsigned DoorbellIDMask = 0x3ff;
1980 constexpr unsigned ECQueueWaveAbort = 0x400;
1981
1982 MachineBasicBlock *TrapBB = &MBB;
1983 MachineBasicBlock *ContBB = &MBB;
1984 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1985
1986 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1987 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1988 TrapBB = MF->CreateMachineBasicBlock();
1989 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1990 MF->push_back(TrapBB);
1991 MBB.addSuccessor(TrapBB);
1992 }
1993
1994 // Start with an `s_trap 2`; if we're in PRIV=1 and need the workaround, this
1995 // will be a nop.
1996 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
1997 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
1998 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1999 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2000 DoorbellReg)
2002 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2003 .addUse(AMDGPU::M0);
2004 Register DoorbellRegMasked =
2005 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2006 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2007 .addUse(DoorbellReg)
2008 .addImm(DoorbellIDMask);
2009 Register SetWaveAbortBit =
2010 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2011 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2012 .addUse(DoorbellRegMasked)
2013 .addImm(ECQueueWaveAbort);
2014 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2015 .addUse(SetWaveAbortBit);
2016 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2018 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2019 .addUse(AMDGPU::TTMP2);
2020 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2021 TrapBB->addSuccessor(HaltLoopBB);
2022
2023 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2024 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2025 .addMBB(HaltLoopBB);
2026 MF->push_back(HaltLoopBB);
2027 HaltLoopBB->addSuccessor(HaltLoopBB);
2028
2029 return ContBB;
2030}
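// Schematically, the control flow produced above looks like (block names are
// illustrative):
//   split point:   s_cbranch_execnz %trap_bb   ; otherwise continue in %cont_bb
//   %trap_bb:      s_trap 2; read the doorbell ID, set the queue-wave-abort
//                  bit, send the message, then s_branch %halt_loop_bb
//   %halt_loop_bb: s_sethalt 5; s_branch %halt_loop_bb   ; spin forever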
2031
2033 switch (MI.getOpcode()) {
2034 default:
2035 if (MI.isMetaInstruction())
2036 return 0;
2037 return 1; // FIXME: Do wait states equal cycles?
2038
2039 case AMDGPU::S_NOP:
2040 return MI.getOperand(0).getImm() + 1;
2041 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2042 // hazard, even if one exists, won't really be visible. Should we handle it?
2043 }
2044}
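// For example, "s_nop 3" reports 4 wait states here, while meta instructions
// (e.g. debug values) report 0 and everything else is counted as 1.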
2045
2047 MachineBasicBlock &MBB = *MI.getParent();
2049 switch (MI.getOpcode()) {
2050 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2051 case AMDGPU::S_MOV_B64_term:
2052 // This is only a terminator to get the correct spill code placement during
2053 // register allocation.
2054 MI.setDesc(get(AMDGPU::S_MOV_B64));
2055 break;
2056
2057 case AMDGPU::S_MOV_B32_term:
2058 // This is only a terminator to get the correct spill code placement during
2059 // register allocation.
2060 MI.setDesc(get(AMDGPU::S_MOV_B32));
2061 break;
2062
2063 case AMDGPU::S_XOR_B64_term:
2064 // This is only a terminator to get the correct spill code placement during
2065 // register allocation.
2066 MI.setDesc(get(AMDGPU::S_XOR_B64));
2067 break;
2068
2069 case AMDGPU::S_XOR_B32_term:
2070 // This is only a terminator to get the correct spill code placement during
2071 // register allocation.
2072 MI.setDesc(get(AMDGPU::S_XOR_B32));
2073 break;
2074 case AMDGPU::S_OR_B64_term:
2075 // This is only a terminator to get the correct spill code placement during
2076 // register allocation.
2077 MI.setDesc(get(AMDGPU::S_OR_B64));
2078 break;
2079 case AMDGPU::S_OR_B32_term:
2080 // This is only a terminator to get the correct spill code placement during
2081 // register allocation.
2082 MI.setDesc(get(AMDGPU::S_OR_B32));
2083 break;
2084
2085 case AMDGPU::S_ANDN2_B64_term:
2086 // This is only a terminator to get the correct spill code placement during
2087 // register allocation.
2088 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2089 break;
2090
2091 case AMDGPU::S_ANDN2_B32_term:
2092 // This is only a terminator to get the correct spill code placement during
2093 // register allocation.
2094 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2095 break;
2096
2097 case AMDGPU::S_AND_B64_term:
2098 // This is only a terminator to get the correct spill code placement during
2099 // register allocation.
2100 MI.setDesc(get(AMDGPU::S_AND_B64));
2101 break;
2102
2103 case AMDGPU::S_AND_B32_term:
2104 // This is only a terminator to get the correct spill code placement during
2105 // register allocation.
2106 MI.setDesc(get(AMDGPU::S_AND_B32));
2107 break;
2108
2109 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2110 // This is only a terminator to get the correct spill code placement during
2111 // register allocation.
2112 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2113 break;
2114
2115 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2116 // This is only a terminator to get the correct spill code placement during
2117 // register allocation.
2118 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2119 break;
2120
2121 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2122 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2123 break;
2124
2125 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2126 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2127 MI.getMF()->getRegInfo().constrainRegClass(MI.getOperand(0).getReg(),
2128 &AMDGPU::SReg_32_XM0RegClass);
2129 break;
2130 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2131 Register Dst = MI.getOperand(0).getReg();
2132 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2133 MI.setDesc(
2134 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2135 break;
2136 }
2137 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2138 Register Dst = MI.getOperand(0).getReg();
2139 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2140 int64_t Imm = MI.getOperand(1).getImm();
2141
2142 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2143 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2144 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2145 .addImm(SignExtend64<32>(Imm))
2147 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2148 .addImm(SignExtend64<32>(Imm >> 32))
2150 MI.eraseFromParent();
2151 break;
2152 }
2153
2154 [[fallthrough]];
2155 }
2156 case AMDGPU::V_MOV_B64_PSEUDO: {
2157 Register Dst = MI.getOperand(0).getReg();
2158 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2159 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2160
2161 const MachineOperand &SrcOp = MI.getOperand(1);
2162 // FIXME: Will this work for 64-bit floating point immediates?
2163 assert(!SrcOp.isFPImm());
2164 if (ST.hasMovB64()) {
2165 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2166 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2167 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2168 break;
2169 }
2170 if (SrcOp.isImm()) {
2171 APInt Imm(64, SrcOp.getImm());
2172 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2173 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2174 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2175 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2177 .addImm(Lo.getSExtValue())
2179 .addImm(Lo.getSExtValue())
2180 .addImm(0) // op_sel_lo
2181 .addImm(0) // op_sel_hi
2182 .addImm(0) // neg_lo
2183 .addImm(0) // neg_hi
2184 .addImm(0); // clamp
2185 } else {
2186 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2187 .addImm(Lo.getSExtValue())
2189 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2190 .addImm(Hi.getSExtValue())
2192 }
2193 } else {
2194 assert(SrcOp.isReg());
2195 if (ST.hasPkMovB32() &&
2196 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2197 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2198 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2199 .addReg(SrcOp.getReg())
2201 .addReg(SrcOp.getReg())
2202 .addImm(0) // op_sel_lo
2203 .addImm(0) // op_sel_hi
2204 .addImm(0) // neg_lo
2205 .addImm(0) // neg_hi
2206 .addImm(0); // clamp
2207 } else {
2208 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2209 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2211 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2212 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2214 }
2215 }
2216 MI.eraseFromParent();
2217 break;
2218 }
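// A sketch of the immediate path above: on a subtarget without v_mov_b64 (and
// without a usable v_pk_mov_b32), a pseudo such as
//   $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 0x1234567800000000
// becomes two V_MOV_B32_e32 instructions writing the low half to $vgpr0 and the
// high half to $vgpr1.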
2219 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2221 break;
2222 }
2223 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2224 const MachineOperand &SrcOp = MI.getOperand(1);
2225 assert(!SrcOp.isFPImm());
2226
2227 if (ST.has64BitLiterals()) {
2228 MI.setDesc(get(AMDGPU::S_MOV_B64));
2229 break;
2230 }
2231
2232 APInt Imm(64, SrcOp.getImm());
2233 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2234 MI.setDesc(get(AMDGPU::S_MOV_B64));
2235 break;
2236 }
2237
2238 Register Dst = MI.getOperand(0).getReg();
2239 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2240 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2241
2242 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2243 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2244 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2245 .addImm(Lo.getSExtValue())
2247 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2248 .addImm(Hi.getSExtValue())
2250 MI.eraseFromParent();
2251 break;
2252 }
2253 case AMDGPU::V_SET_INACTIVE_B32: {
2254 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2255 Register DstReg = MI.getOperand(0).getReg();
2256 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2257 .add(MI.getOperand(3))
2258 .add(MI.getOperand(4))
2259 .add(MI.getOperand(1))
2260 .add(MI.getOperand(2))
2261 .add(MI.getOperand(5));
2262 MI.eraseFromParent();
2263 break;
2264 }
2265 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2266 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2267 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2268 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2269 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2270 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2271 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2272 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2273 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2274 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2275 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2276 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2277 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2278 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2279 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2280 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2281 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2282 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2283 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2284 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2285 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2286 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2287 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2288 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2289 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2290 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2291 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2292 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2293 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2294 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2295
2296 unsigned Opc;
2297 if (RI.hasVGPRs(EltRC)) {
2298 Opc = AMDGPU::V_MOVRELD_B32_e32;
2299 } else {
2300 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2301 : AMDGPU::S_MOVRELD_B32;
2302 }
2303
2304 const MCInstrDesc &OpDesc = get(Opc);
2305 Register VecReg = MI.getOperand(0).getReg();
2306 bool IsUndef = MI.getOperand(1).isUndef();
2307 unsigned SubReg = MI.getOperand(3).getImm();
2308 assert(VecReg == MI.getOperand(1).getReg());
2309
2311 BuildMI(MBB, MI, DL, OpDesc)
2312 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2313 .add(MI.getOperand(2))
2315 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2316
2317 const int ImpDefIdx =
2318 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2319 const int ImpUseIdx = ImpDefIdx + 1;
2320 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2321 MI.eraseFromParent();
2322 break;
2323 }
2324 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2325 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2326 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2327 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2328 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2329 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2330 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2331 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2332 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2333 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2334 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2335 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2337 Register VecReg = MI.getOperand(0).getReg();
2338 bool IsUndef = MI.getOperand(1).isUndef();
2339 MachineOperand &Idx = MI.getOperand(3);
2340 Register SubReg = MI.getOperand(4).getImm();
2341
2342 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2343 .add(Idx)
2345 SetOn->getOperand(3).setIsUndef();
2346
2347 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2349 BuildMI(MBB, MI, DL, OpDesc)
2350 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2351 .add(MI.getOperand(2))
2353 .addReg(VecReg,
2354 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2355
2356 const int ImpDefIdx =
2357 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2358 const int ImpUseIdx = ImpDefIdx + 1;
2359 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2360
2361 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2362
2363 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2364
2365 MI.eraseFromParent();
2366 break;
2367 }
2368 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2369 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2370 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2371 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2372 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2373 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2374 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2375 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2376 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2377 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2378 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2379 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2381 Register Dst = MI.getOperand(0).getReg();
2382 Register VecReg = MI.getOperand(1).getReg();
2383 bool IsUndef = MI.getOperand(1).isUndef();
2384 Register Idx = MI.getOperand(2).getReg();
2385 Register SubReg = MI.getOperand(3).getImm();
2386
2387 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2388 .addReg(Idx)
2390 SetOn->getOperand(3).setIsUndef();
2391
2392 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2393 .addDef(Dst)
2394 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2395 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2396
2397 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2398
2399 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2400
2401 MI.eraseFromParent();
2402 break;
2403 }
2404 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2405 MachineFunction &MF = *MBB.getParent();
2406 Register Reg = MI.getOperand(0).getReg();
2407 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2408 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2409 MachineOperand OpLo = MI.getOperand(1);
2410 MachineOperand OpHi = MI.getOperand(2);
2411
2412 // Create a bundle so these instructions won't be re-ordered by the
2413 // post-RA scheduler.
2414 MIBundleBuilder Bundler(MBB, MI);
2415 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2416
2417 // What we want here is an offset from the value returned by s_getpc (which
2418 // is the address of the s_add_u32 instruction) to the global variable, but
2419 // since the encoding of $symbol starts 4 bytes after the start of the
2420 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2421 // small. This requires us to add 4 to the global variable offset in order
2422 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2423 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2424 // instruction.
2425
2426 int64_t Adjust = 0;
2427 if (ST.hasGetPCZeroExtension()) {
2428 // Fix up hardware that does not sign-extend the 48-bit PC value by
2429 // inserting: s_sext_i32_i16 reghi, reghi
2430 Bundler.append(
2431 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2432 Adjust += 4;
2433 }
2434
2435 if (OpLo.isGlobal())
2436 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2437 Bundler.append(
2438 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2439
2440 if (OpHi.isGlobal())
2441 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2442 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2443 .addReg(RegHi)
2444 .add(OpHi));
2445
2446 finalizeBundle(MBB, Bundler.begin());
2447
2448 MI.eraseFromParent();
2449 break;
2450 }
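// The resulting bundle is, schematically (register numbers illustrative):
//   s_getpc_b64  s[4:5]
//   s_add_u32    s4, s4, gv@rel32@lo+4
//   s_addc_u32   s5, s5, gv@rel32@hi+12
// where the +4/+12 biases account for where each literal is encoded, as the
// comment above explains; the bias grows when the s_sext_i32_i16 fixup is
// inserted.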
2451 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2452 MachineFunction &MF = *MBB.getParent();
2453 Register Reg = MI.getOperand(0).getReg();
2454 MachineOperand Op = MI.getOperand(1);
2455
2456 // Create a bundle so these instructions won't be re-ordered by the
2457 // post-RA scheduler.
2458 MIBundleBuilder Bundler(MBB, MI);
2459 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2460 if (Op.isGlobal())
2461 Op.setOffset(Op.getOffset() + 4);
2462 Bundler.append(
2463 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2464
2465 finalizeBundle(MBB, Bundler.begin());
2466
2467 MI.eraseFromParent();
2468 break;
2469 }
2470 case AMDGPU::ENTER_STRICT_WWM: {
2471 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2472 // Whole Wave Mode is entered.
2473 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2474 : AMDGPU::S_OR_SAVEEXEC_B64));
2475 break;
2476 }
2477 case AMDGPU::ENTER_STRICT_WQM: {
2478 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2479 // STRICT_WQM is entered.
2480 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2481 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2482 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2483 BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2484 BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2485
2486 MI.eraseFromParent();
2487 break;
2488 }
2489 case AMDGPU::EXIT_STRICT_WWM:
2490 case AMDGPU::EXIT_STRICT_WQM: {
2491 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2492 // WWM/STRICT_WQM is exited.
2493 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2494 break;
2495 }
2496 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2497 case AMDGPU::SI_RETURN: {
2498 const MachineFunction *MF = MBB.getParent();
2499 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2500 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2501 // Hiding the return address use with SI_RETURN may lead to extra kills in
2502 // the function and missing live-ins. We are fine in practice because callee
2503 // saved register handling ensures the register value is restored before
2504 // RET, but we need the undef flag here to appease the MachineVerifier
2505 // liveness checks.
2507 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2508 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2509
2510 MIB.copyImplicitOps(MI);
2511 MI.eraseFromParent();
2512 break;
2513 }
2514
2515 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2516 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2517 MI.setDesc(get(AMDGPU::S_MUL_U64));
2518 break;
2519
2520 case AMDGPU::S_GETPC_B64_pseudo:
2521 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2522 if (ST.hasGetPCZeroExtension()) {
2523 Register Dst = MI.getOperand(0).getReg();
2524 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2525 // Fix up hardware that does not sign-extend the 48-bit PC value by
2526 // inserting: s_sext_i32_i16 dsthi, dsthi
2527 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2528 DstHi)
2529 .addReg(DstHi);
2530 }
2531 break;
2532
2533 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2535 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2536 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2537 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2538 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2539 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2540 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2541 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2542 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2543 break;
2544 }
2545
2546 return true;
2547}
2548
2551 unsigned SubIdx, const MachineInstr &Orig,
2552 const TargetRegisterInfo &RI) const {
2553
2554 // Try shrinking the instruction to rematerialize only the part needed in the
2555 // current context.
2556 // TODO: Handle more cases.
2557 unsigned Opcode = Orig.getOpcode();
2558 switch (Opcode) {
2559 case AMDGPU::S_LOAD_DWORDX16_IMM:
2560 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2561 if (SubIdx != 0)
2562 break;
2563
2564 if (I == MBB.end())
2565 break;
2566
2567 if (I->isBundled())
2568 break;
2569
2570 // Look for a single use of the register that is also a subreg.
2571 Register RegToFind = Orig.getOperand(0).getReg();
2572 MachineOperand *UseMO = nullptr;
2573 for (auto &CandMO : I->operands()) {
2574 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2575 continue;
2576 if (UseMO) {
2577 UseMO = nullptr;
2578 break;
2579 }
2580 UseMO = &CandMO;
2581 }
2582 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2583 break;
2584
2585 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2586 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2587
2590 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2591
2592 unsigned NewOpcode = -1;
2593 if (SubregSize == 256)
2594 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2595 else if (SubregSize == 128)
2596 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2597 else
2598 break;
2599
2600 const MCInstrDesc &TID = get(NewOpcode);
2601 const TargetRegisterClass *NewRC =
2602 RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
2603 MRI.setRegClass(DestReg, NewRC);
2604
2605 UseMO->setReg(DestReg);
2606 UseMO->setSubReg(AMDGPU::NoSubRegister);
2607
2608 // Use a smaller load with the desired size, possibly with updated offset.
2609 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2610 MI->setDesc(TID);
2611 MI->getOperand(0).setReg(DestReg);
2612 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2613 if (Offset) {
2614 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2615 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2616 OffsetMO->setImm(FinalOffset);
2617 }
2619 for (const MachineMemOperand *MemOp : Orig.memoperands())
2620 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2621 SubregSize / 8));
2622 MI->setMemRefs(*MF, NewMMOs);
2623
2624 MBB.insert(I, MI);
2625 return;
2626 }
2627
2628 default:
2629 break;
2630 }
2631
2632 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2633}
2634
2635std::pair<MachineInstr*, MachineInstr*>
2637 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2638
2639 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2641 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2642 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2643 return std::pair(&MI, nullptr);
2644 }
2645
2646 MachineBasicBlock &MBB = *MI.getParent();
2650 Register Dst = MI.getOperand(0).getReg();
2651 unsigned Part = 0;
2652 MachineInstr *Split[2];
2653
2654 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2655 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2656 if (Dst.isPhysical()) {
2657 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2658 } else {
2659 assert(MRI.isSSA());
2660 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2661 MovDPP.addDef(Tmp);
2662 }
2663
2664 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2665 const MachineOperand &SrcOp = MI.getOperand(I);
2666 assert(!SrcOp.isFPImm());
2667 if (SrcOp.isImm()) {
2668 APInt Imm(64, SrcOp.getImm());
2669 Imm.ashrInPlace(Part * 32);
2670 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2671 } else {
2672 assert(SrcOp.isReg());
2673 Register Src = SrcOp.getReg();
2674 if (Src.isPhysical())
2675 MovDPP.addReg(RI.getSubReg(Src, Sub));
2676 else
2677 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2678 }
2679 }
2680
2681 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2682 MovDPP.addImm(MO.getImm());
2683
2684 Split[Part] = MovDPP;
2685 ++Part;
2686 }
2687
2688 if (Dst.isVirtual())
2689 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2690 .addReg(Split[0]->getOperand(0).getReg())
2691 .addImm(AMDGPU::sub0)
2692 .addReg(Split[1]->getOperand(0).getReg())
2693 .addImm(AMDGPU::sub1);
2694
2695 MI.eraseFromParent();
2696 return std::pair(Split[0], Split[1]);
2697}
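// When the subtarget lacks a 64-bit DPP mov, or the dpp_ctrl value is not legal
// for 64-bit DPP ALU ops, the pseudo is split as above: one V_MOV_B32_dpp per
// 32-bit half, followed by a REG_SEQUENCE that reassembles the 64-bit result
// when the destination is still virtual.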
2698
2699std::optional<DestSourcePair>
2701 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2702 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2703
2704 return std::nullopt;
2705}
2706
2708 AMDGPU::OpName Src0OpName,
2709 MachineOperand &Src1,
2710 AMDGPU::OpName Src1OpName) const {
2711 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2712 if (!Src0Mods)
2713 return false;
2714
2715 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2716 assert(Src1Mods &&
2717 "All commutable instructions have both src0 and src1 modifiers");
2718
2719 int Src0ModsVal = Src0Mods->getImm();
2720 int Src1ModsVal = Src1Mods->getImm();
2721
2722 Src1Mods->setImm(Src0ModsVal);
2723 Src0Mods->setImm(Src1ModsVal);
2724 return true;
2725}
2726
2728 MachineOperand &RegOp,
2729 MachineOperand &NonRegOp) {
2730 Register Reg = RegOp.getReg();
2731 unsigned SubReg = RegOp.getSubReg();
2732 bool IsKill = RegOp.isKill();
2733 bool IsDead = RegOp.isDead();
2734 bool IsUndef = RegOp.isUndef();
2735 bool IsDebug = RegOp.isDebug();
2736
2737 if (NonRegOp.isImm())
2738 RegOp.ChangeToImmediate(NonRegOp.getImm());
2739 else if (NonRegOp.isFI())
2740 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2741 else if (NonRegOp.isGlobal()) {
2742 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2743 NonRegOp.getTargetFlags());
2744 } else
2745 return nullptr;
2746
2747 // Make sure we don't reinterpret a subreg index in the target flags.
2748 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2749
2750 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2751 NonRegOp.setSubReg(SubReg);
2752
2753 return &MI;
2754}
2755
2757 MachineOperand &NonRegOp1,
2758 MachineOperand &NonRegOp2) {
2759 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2760 int64_t NonRegVal = NonRegOp1.getImm();
2761
2762 NonRegOp1.setImm(NonRegOp2.getImm());
2763 NonRegOp2.setImm(NonRegVal);
2764 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2765 NonRegOp2.setTargetFlags(TargetFlags);
2766 return &MI;
2767}
2768
2769bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2770 unsigned OpIdx1) const {
2771 const MCInstrDesc &InstDesc = MI.getDesc();
2772 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2773 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2774
2775 unsigned Opc = MI.getOpcode();
2776 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2777
2778 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2779 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2780
2781 // Swapping must not breach the constant bus or literal limits.
2782 // It may move a literal to a position other than src0, which is not allowed
2783 // pre-gfx10. However, most test cases need literals in Src0 for VOP.
2784 // FIXME: After gfx9, a literal can be in a position other than Src0.
2785 if (isVALU(MI)) {
2786 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2787 !isInlineConstant(MO0, OpInfo1))
2788 return false;
2789 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2790 !isInlineConstant(MO1, OpInfo0))
2791 return false;
2792 }
2793
2794 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2795 if (OpInfo1.RegClass == -1)
2796 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2797 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2798 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2799 }
2800 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2801 if (OpInfo0.RegClass == -1)
2802 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2803 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2804 isLegalRegOperand(MI, OpIdx0, MO1);
2805 }
2806
2807 // No need to check 64-bit literals, since swapping does not bring new
2808 // 64-bit literals into the current instruction to fold to 32-bit.
2809
2810 return isImmOperandLegal(MI, OpIdx1, MO0);
2811}
2812
2814 unsigned Src0Idx,
2815 unsigned Src1Idx) const {
2816 assert(!NewMI && "this should never be used");
2817
2818 unsigned Opc = MI.getOpcode();
2819 int CommutedOpcode = commuteOpcode(Opc);
2820 if (CommutedOpcode == -1)
2821 return nullptr;
2822
2823 if (Src0Idx > Src1Idx)
2824 std::swap(Src0Idx, Src1Idx);
2825
2826 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2827 static_cast<int>(Src0Idx) &&
2828 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2829 static_cast<int>(Src1Idx) &&
2830 "inconsistency with findCommutedOpIndices");
2831
2832 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2833 return nullptr;
2834
2835 MachineInstr *CommutedMI = nullptr;
2836 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2837 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2838 if (Src0.isReg() && Src1.isReg()) {
2839 // Be sure to copy the source modifiers to the right place.
2840 CommutedMI =
2841 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2842 } else if (Src0.isReg() && !Src1.isReg()) {
2843 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2844 } else if (!Src0.isReg() && Src1.isReg()) {
2845 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2846 } else if (Src0.isImm() && Src1.isImm()) {
2847 CommutedMI = swapImmOperands(MI, Src0, Src1);
2848 } else {
2849 // FIXME: Found two non-register operands to commute. This does happen.
2850 return nullptr;
2851 }
2852
2853 if (CommutedMI) {
2854 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2855 Src1, AMDGPU::OpName::src1_modifiers);
2856
2857 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2858 AMDGPU::OpName::src1_sel);
2859
2860 CommutedMI->setDesc(get(CommutedOpcode));
2861 }
2862
2863 return CommutedMI;
2864}
2865
2866// This needs to be implemented because the source modifiers may be inserted
2867// between the true commutable operands, and the base
2868// TargetInstrInfo::commuteInstruction uses it.
2870 unsigned &SrcOpIdx0,
2871 unsigned &SrcOpIdx1) const {
2872 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2873}
2874
2876 unsigned &SrcOpIdx0,
2877 unsigned &SrcOpIdx1) const {
2878 if (!Desc.isCommutable())
2879 return false;
2880
2881 unsigned Opc = Desc.getOpcode();
2882 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2883 if (Src0Idx == -1)
2884 return false;
2885
2886 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2887 if (Src1Idx == -1)
2888 return false;
2889
2890 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2891}
2892
2894 int64_t BrOffset) const {
2895 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2896 // because its dest block is unanalyzable.
2897 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2898
2899 // Convert to dwords.
2900 BrOffset /= 4;
2901
2902 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2903 // from the next instruction.
2904 BrOffset -= 1;
2905
2906 return isIntN(BranchOffsetBits, BrOffset);
2907}
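// Worked example, assuming the full 16 branch-offset bits: the dword offset
// must fit in a signed 16-bit field, so destinations within roughly +/-128 KiB
// of the instruction following the branch are considered in range.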
2908
2911 return MI.getOperand(0).getMBB();
2912}
2913
2915 for (const MachineInstr &MI : MBB->terminators()) {
2916 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2917 MI.getOpcode() == AMDGPU::SI_LOOP)
2918 return true;
2919 }
2920 return false;
2921}
2922
2924 MachineBasicBlock &DestBB,
2925 MachineBasicBlock &RestoreBB,
2926 const DebugLoc &DL, int64_t BrOffset,
2927 RegScavenger *RS) const {
2928 assert(MBB.empty() &&
2929 "new block should be inserted for expanding unconditional branch");
2930 assert(MBB.pred_size() == 1);
2931 assert(RestoreBB.empty() &&
2932 "restore block should be inserted for restoring clobbered registers");
2933
2937 auto I = MBB.end();
2938 auto &MCCtx = MF->getContext();
2939
2940 if (ST.hasAddPC64Inst()) {
2941 MCSymbol *Offset =
2942 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
2943 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
2945 MCSymbol *PostAddPCLabel =
2946 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
2947 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
2948 auto *OffsetExpr = MCBinaryExpr::createSub(
2949 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2950 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
2951 Offset->setVariableValue(OffsetExpr);
2952 return;
2953 }
2954
2955 assert(RS && "RegScavenger required for long branching");
2956
2957 // FIXME: Virtual register workaround for RegScavenger not working with empty
2958 // blocks.
2959 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2960
2961 // Note: as this is used after hazard recognizer we need to apply some hazard
2962 // workarounds directly.
2963 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2965 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2966 if (FlushSGPRWrites)
2967 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2969 };
2970
2971 // We need to compute the offset relative to the instruction immediately after
2972 // s_getpc_b64. Insert the pc arithmetic code before the last terminator.
2973 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2974 ApplyHazardWorkarounds();
2975
2976 MCSymbol *PostGetPCLabel =
2977 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2978 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2979
2980 MCSymbol *OffsetLo =
2981 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2982 MCSymbol *OffsetHi =
2983 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2984 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2985 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2986 .addReg(PCReg, 0, AMDGPU::sub0)
2987 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2988 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2989 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2990 .addReg(PCReg, 0, AMDGPU::sub1)
2991 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2992 ApplyHazardWorkarounds();
2993
2994 // Insert the indirect branch after the other terminator.
2995 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2996 .addReg(PCReg);
2997
2998 // If a spill is needed for the pc register pair, we need to insert a spill
2999 // restore block right before the destination block, and insert a short branch
3000 // into the old destination block's fallthrough predecessor.
3001 // e.g.:
3002 //
3003 // s_cbranch_scc0 skip_long_branch:
3004 //
3005 // long_branch_bb:
3006 // spill s[8:9]
3007 // s_getpc_b64 s[8:9]
3008 // s_add_u32 s8, s8, restore_bb
3009 // s_addc_u32 s9, s9, 0
3010 // s_setpc_b64 s[8:9]
3011 //
3012 // skip_long_branch:
3013 // foo;
3014 //
3015 // .....
3016 //
3017 // dest_bb_fallthrough_predecessor:
3018 // bar;
3019 // s_branch dest_bb
3020 //
3021 // restore_bb:
3022 // restore s[8:9]
3023 // fallthrough dest_bb
3024 //
3025 // dest_bb:
3026 // buzz;
3027
3028 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3029 Register Scav;
3030
3031 // If we've previously reserved a register for long branches,
3032 // avoid running the scavenger and just use that register.
3033 if (LongBranchReservedReg) {
3034 RS->enterBasicBlock(MBB);
3035 Scav = LongBranchReservedReg;
3036 } else {
3038 Scav = RS->scavengeRegisterBackwards(
3039 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3040 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3041 }
3042 if (Scav) {
3043 RS->setRegUsed(Scav);
3044 MRI.replaceRegWith(PCReg, Scav);
3045 MRI.clearVirtRegs();
3046 } else {
3047 // As an SGPR needs a VGPR to be spilled, we reuse the slot of the temporary
3048 // VGPR for the SGPR spill.
3049 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3050 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3051 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3052 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3053 MRI.clearVirtRegs();
3054 }
3055
3056 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3057 // Now the distance can be defined.
3059 MCSymbolRefExpr::create(DestLabel, MCCtx),
3060 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3061 // Add offset assignments.
3062 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3063 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3064 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3065 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3066}
3067
3068unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3069 switch (Cond) {
3070 case SIInstrInfo::SCC_TRUE:
3071 return AMDGPU::S_CBRANCH_SCC1;
3072 case SIInstrInfo::SCC_FALSE:
3073 return AMDGPU::S_CBRANCH_SCC0;
3074 case SIInstrInfo::VCCNZ:
3075 return AMDGPU::S_CBRANCH_VCCNZ;
3076 case SIInstrInfo::VCCZ:
3077 return AMDGPU::S_CBRANCH_VCCZ;
3078 case SIInstrInfo::EXECNZ:
3079 return AMDGPU::S_CBRANCH_EXECNZ;
3080 case SIInstrInfo::EXECZ:
3081 return AMDGPU::S_CBRANCH_EXECZ;
3082 default:
3083 llvm_unreachable("invalid branch predicate");
3084 }
3085}
3086
3087SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3088 switch (Opcode) {
3089 case AMDGPU::S_CBRANCH_SCC0:
3090 return SCC_FALSE;
3091 case AMDGPU::S_CBRANCH_SCC1:
3092 return SCC_TRUE;
3093 case AMDGPU::S_CBRANCH_VCCNZ:
3094 return VCCNZ;
3095 case AMDGPU::S_CBRANCH_VCCZ:
3096 return VCCZ;
3097 case AMDGPU::S_CBRANCH_EXECNZ:
3098 return EXECNZ;
3099 case AMDGPU::S_CBRANCH_EXECZ:
3100 return EXECZ;
3101 default:
3102 return INVALID_BR;
3103 }
3104}
3105
3109 MachineBasicBlock *&FBB,
3111 bool AllowModify) const {
3112 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3113 // Unconditional Branch
3114 TBB = I->getOperand(0).getMBB();
3115 return false;
3116 }
3117
3118 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3119 if (Pred == INVALID_BR)
3120 return true;
3121
3122 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3123 Cond.push_back(MachineOperand::CreateImm(Pred));
3124 Cond.push_back(I->getOperand(1)); // Save the branch register.
3125
3126 ++I;
3127
3128 if (I == MBB.end()) {
3129 // Conditional branch followed by fall-through.
3130 TBB = CondBB;
3131 return false;
3132 }
3133
3134 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3135 TBB = CondBB;
3136 FBB = I->getOperand(0).getMBB();
3137 return false;
3138 }
3139
3140 return true;
3141}
3142
3144 MachineBasicBlock *&FBB,
3146 bool AllowModify) const {
3148 auto E = MBB.end();
3149 if (I == E)
3150 return false;
3151
3152 // Skip over the instructions that are artificial terminators for special
3153 // exec management.
3154 while (I != E && !I->isBranch() && !I->isReturn()) {
3155 switch (I->getOpcode()) {
3156 case AMDGPU::S_MOV_B64_term:
3157 case AMDGPU::S_XOR_B64_term:
3158 case AMDGPU::S_OR_B64_term:
3159 case AMDGPU::S_ANDN2_B64_term:
3160 case AMDGPU::S_AND_B64_term:
3161 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3162 case AMDGPU::S_MOV_B32_term:
3163 case AMDGPU::S_XOR_B32_term:
3164 case AMDGPU::S_OR_B32_term:
3165 case AMDGPU::S_ANDN2_B32_term:
3166 case AMDGPU::S_AND_B32_term:
3167 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3168 break;
3169 case AMDGPU::SI_IF:
3170 case AMDGPU::SI_ELSE:
3171 case AMDGPU::SI_KILL_I1_TERMINATOR:
3172 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3173 // FIXME: It's messy that these need to be considered here at all.
3174 return true;
3175 default:
3176 llvm_unreachable("unexpected non-branch terminator inst");
3177 }
3178
3179 ++I;
3180 }
3181
3182 if (I == E)
3183 return false;
3184
3185 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3186}
3187
3189 int *BytesRemoved) const {
3190 unsigned Count = 0;
3191 unsigned RemovedSize = 0;
3193 // Skip over artificial terminators when removing instructions.
3194 if (MI.isBranch() || MI.isReturn()) {
3195 RemovedSize += getInstSizeInBytes(MI);
3196 MI.eraseFromParent();
3197 ++Count;
3198 }
3199 }
3200
3201 if (BytesRemoved)
3202 *BytesRemoved = RemovedSize;
3203
3204 return Count;
3205}
3206
3207// Copy the flags onto the implicit condition register operand.
3209 const MachineOperand &OrigCond) {
3210 CondReg.setIsUndef(OrigCond.isUndef());
3211 CondReg.setIsKill(OrigCond.isKill());
3212}
3213
3216 MachineBasicBlock *FBB,
3218 const DebugLoc &DL,
3219 int *BytesAdded) const {
3220 if (!FBB && Cond.empty()) {
3221 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3222 .addMBB(TBB);
3223 if (BytesAdded)
3224 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3225 return 1;
3226 }
3227
3228 assert(TBB && Cond[0].isImm());
3229
3230 unsigned Opcode
3231 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3232
3233 if (!FBB) {
3234 MachineInstr *CondBr =
3235 BuildMI(&MBB, DL, get(Opcode))
3236 .addMBB(TBB);
3237
3238 // Copy the flags onto the implicit condition register operand.
3239 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3240 fixImplicitOperands(*CondBr);
3241
3242 if (BytesAdded)
3243 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3244 return 1;
3245 }
3246
3247 assert(TBB && FBB);
3248
3249 MachineInstr *CondBr =
3250 BuildMI(&MBB, DL, get(Opcode))
3251 .addMBB(TBB);
3252 fixImplicitOperands(*CondBr);
3253 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3254 .addMBB(FBB);
3255
3256 MachineOperand &CondReg = CondBr->getOperand(1);
3257 CondReg.setIsUndef(Cond[1].isUndef());
3258 CondReg.setIsKill(Cond[1].isKill());
3259
3260 if (BytesAdded)
3261 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3262
3263 return 2;
3264}
3265
3268 if (Cond.size() != 2) {
3269 return true;
3270 }
3271
3272 if (Cond[0].isImm()) {
3273 Cond[0].setImm(-Cond[0].getImm());
3274 return false;
3275 }
3276
3277 return true;
3278}
3279
3282 Register DstReg, Register TrueReg,
3283 Register FalseReg, int &CondCycles,
3284 int &TrueCycles, int &FalseCycles) const {
3285 switch (Cond[0].getImm()) {
3286 case VCCNZ:
3287 case VCCZ: {
3289 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3290 if (MRI.getRegClass(FalseReg) != RC)
3291 return false;
3292
3293 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3294 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3295
3296 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3297 return RI.hasVGPRs(RC) && NumInsts <= 6;
3298 }
3299 case SCC_TRUE:
3300 case SCC_FALSE: {
3301 // FIXME: We could insert for VGPRs if we could replace the original compare
3302 // with a vector one.
3304 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3305 if (MRI.getRegClass(FalseReg) != RC)
3306 return false;
3307
3308 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3309
3310 // Multiples of 64 bits (8 bytes) can use s_cselect_b64.
3311 if (NumInsts % 2 == 0)
3312 NumInsts /= 2;
3313
3314 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3315 return RI.isSGPRClass(RC);
3316 }
3317 default:
3318 return false;
3319 }
3320}
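// For instance, selecting between two 128-bit SGPR values under an SCC
// condition needs two s_cselect_b64 instructions, so CondCycles, TrueCycles,
// and FalseCycles are all reported as 2; the same width in VGPRs under VCC is
// reported as four v_cndmask_b32 instructions.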
3321
3325 Register TrueReg, Register FalseReg) const {
3326 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3327 if (Pred == VCCZ || Pred == SCC_FALSE) {
3328 Pred = static_cast<BranchPredicate>(-Pred);
3329 std::swap(TrueReg, FalseReg);
3330 }
3331
3333 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3334 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3335
3336 if (DstSize == 32) {
3338 if (Pred == SCC_TRUE) {
3339 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3340 .addReg(TrueReg)
3341 .addReg(FalseReg);
3342 } else {
3343 // Instruction's operands are backwards from what is expected.
3344 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3345 .addReg(FalseReg)
3346 .addReg(TrueReg);
3347 }
3348
3349 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3350 return;
3351 }
3352
3353 if (DstSize == 64 && Pred == SCC_TRUE) {
3355 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3356 .addReg(TrueReg)
3357 .addReg(FalseReg);
3358
3359 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3360 return;
3361 }
3362
3363 static const int16_t Sub0_15[] = {
3364 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3365 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3366 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3367 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3368 };
3369
3370 static const int16_t Sub0_15_64[] = {
3371 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3372 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3373 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3374 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3375 };
3376
3377 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3378 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3379 const int16_t *SubIndices = Sub0_15;
3380 int NElts = DstSize / 32;
3381
3382 // 64-bit select is only available for SALU.
3383 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3384 if (Pred == SCC_TRUE) {
3385 if (NElts % 2) {
3386 SelOp = AMDGPU::S_CSELECT_B32;
3387 EltRC = &AMDGPU::SGPR_32RegClass;
3388 } else {
3389 SelOp = AMDGPU::S_CSELECT_B64;
3390 EltRC = &AMDGPU::SGPR_64RegClass;
3391 SubIndices = Sub0_15_64;
3392 NElts /= 2;
3393 }
3394 }
3395
3397 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3398
3399 I = MIB->getIterator();
3400
3402 for (int Idx = 0; Idx != NElts; ++Idx) {
3403 Register DstElt = MRI.createVirtualRegister(EltRC);
3404 Regs.push_back(DstElt);
3405
3406 unsigned SubIdx = SubIndices[Idx];
3407
3409 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3410 Select =
3411 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3412 .addReg(FalseReg, 0, SubIdx)
3413 .addReg(TrueReg, 0, SubIdx);
3414 } else {
3415 Select =
3416 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3417 .addReg(TrueReg, 0, SubIdx)
3418 .addReg(FalseReg, 0, SubIdx);
3419 }
3420
3421 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3423
3424 MIB.addReg(DstElt)
3425 .addImm(SubIdx);
3426 }
3427}
3428
3430 switch (MI.getOpcode()) {
3431 case AMDGPU::V_MOV_B16_t16_e32:
3432 case AMDGPU::V_MOV_B16_t16_e64:
3433 case AMDGPU::V_MOV_B32_e32:
3434 case AMDGPU::V_MOV_B32_e64:
3435 case AMDGPU::V_MOV_B64_PSEUDO:
3436 case AMDGPU::V_MOV_B64_e32:
3437 case AMDGPU::V_MOV_B64_e64:
3438 case AMDGPU::S_MOV_B32:
3439 case AMDGPU::S_MOV_B64:
3440 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3441 case AMDGPU::COPY:
3442 case AMDGPU::WWM_COPY:
3443 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3444 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3445 case AMDGPU::V_ACCVGPR_MOV_B32:
3446 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3447 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3448 return true;
3449 default:
3450 return false;
3451 }
3452}
3453
3454static constexpr AMDGPU::OpName ModifierOpNames[] = {
3455 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3456 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3457 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3458
3460 unsigned Opc = MI.getOpcode();
3461 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3462 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3463 if (Idx >= 0)
3464 MI.removeOperand(Idx);
3465 }
3466}
3467
3468std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3469 unsigned SubRegIndex) {
3470 switch (SubRegIndex) {
3471 case AMDGPU::NoSubRegister:
3472 return Imm;
3473 case AMDGPU::sub0:
3474 return SignExtend64<32>(Imm);
3475 case AMDGPU::sub1:
3476 return SignExtend64<32>(Imm >> 32);
3477 case AMDGPU::lo16:
3478 return SignExtend64<16>(Imm);
3479 case AMDGPU::hi16:
3480 return SignExtend64<16>(Imm >> 16);
3481 case AMDGPU::sub1_lo16:
3482 return SignExtend64<16>(Imm >> 32);
3483 case AMDGPU::sub1_hi16:
3484 return SignExtend64<16>(Imm >> 48);
3485 default:
3486 return std::nullopt;
3487 }
3488
3489 llvm_unreachable("covered subregister switch");
3490}
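// Worked example: for Imm = 0x1111222233334444,
//   sub0 -> 0x33334444 and sub1 -> 0x11112222 (each sign-extended from 32 bits),
//   lo16 -> 0x4444 and hi16 -> 0x3333 (each sign-extended from 16 bits).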
3491
3492static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3493 switch (Opc) {
3494 case AMDGPU::V_MAC_F16_e32:
3495 case AMDGPU::V_MAC_F16_e64:
3496 case AMDGPU::V_MAD_F16_e64:
3497 return AMDGPU::V_MADAK_F16;
3498 case AMDGPU::V_MAC_F32_e32:
3499 case AMDGPU::V_MAC_F32_e64:
3500 case AMDGPU::V_MAD_F32_e64:
3501 return AMDGPU::V_MADAK_F32;
3502 case AMDGPU::V_FMAC_F32_e32:
3503 case AMDGPU::V_FMAC_F32_e64:
3504 case AMDGPU::V_FMA_F32_e64:
3505 return AMDGPU::V_FMAAK_F32;
3506 case AMDGPU::V_FMAC_F16_e32:
3507 case AMDGPU::V_FMAC_F16_e64:
3508 case AMDGPU::V_FMAC_F16_t16_e64:
3509 case AMDGPU::V_FMAC_F16_fake16_e64:
3510 case AMDGPU::V_FMA_F16_e64:
3511 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3512 ? AMDGPU::V_FMAAK_F16_t16
3513 : AMDGPU::V_FMAAK_F16_fake16
3514 : AMDGPU::V_FMAAK_F16;
3515 case AMDGPU::V_FMAC_F64_e32:
3516 case AMDGPU::V_FMAC_F64_e64:
3517 case AMDGPU::V_FMA_F64_e64:
3518 return AMDGPU::V_FMAAK_F64;
3519 default:
3520 llvm_unreachable("invalid instruction");
3521 }
3522}
3523
3524static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3525 switch (Opc) {
3526 case AMDGPU::V_MAC_F16_e32:
3527 case AMDGPU::V_MAC_F16_e64:
3528 case AMDGPU::V_MAD_F16_e64:
3529 return AMDGPU::V_MADMK_F16;
3530 case AMDGPU::V_MAC_F32_e32:
3531 case AMDGPU::V_MAC_F32_e64:
3532 case AMDGPU::V_MAD_F32_e64:
3533 return AMDGPU::V_MADMK_F32;
3534 case AMDGPU::V_FMAC_F32_e32:
3535 case AMDGPU::V_FMAC_F32_e64:
3536 case AMDGPU::V_FMA_F32_e64:
3537 return AMDGPU::V_FMAMK_F32;
3538 case AMDGPU::V_FMAC_F16_e32:
3539 case AMDGPU::V_FMAC_F16_e64:
3540 case AMDGPU::V_FMAC_F16_t16_e64:
3541 case AMDGPU::V_FMAC_F16_fake16_e64:
3542 case AMDGPU::V_FMA_F16_e64:
3543 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3544 ? AMDGPU::V_FMAMK_F16_t16
3545 : AMDGPU::V_FMAMK_F16_fake16
3546 : AMDGPU::V_FMAMK_F16;
3547 case AMDGPU::V_FMAC_F64_e32:
3548 case AMDGPU::V_FMAC_F64_e64:
3549 case AMDGPU::V_FMA_F64_e64:
3550 return AMDGPU::V_FMAMK_F64;
3551 default:
3552 llvm_unreachable("invalid instruction");
3553 }
3554}
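// For reference, the two helpers differ in where the literal K ends up:
//   v_fmamk_f32 d, s0, K, s1   ; d = s0 * K + s1  (K replaces a multiplicand)
//   v_fmaak_f32 d, s0, s1, K   ; d = s0 * s1 + K  (K replaces the addend)
// so getNewFMAMKInst is chosen when the folded constant feeds the multiply
// and getNewFMAAKInst when it feeds the addition.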
3555
3556bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3557 Register Reg, MachineRegisterInfo *MRI) const {
3558 if (!MRI->hasOneNonDBGUse(Reg))
3559 return false;
3560
3561 int64_t Imm;
3562 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3563 return false;
3564
3565 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3566
3567 unsigned Opc = UseMI.getOpcode();
3568 if (Opc == AMDGPU::COPY) {
3569 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3570
3571 Register DstReg = UseMI.getOperand(0).getReg();
3572 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3573
3574 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3575
3576 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3577 RI.getSubRegIdxSize(UseSubReg) == 16;
3578
3579 if (Is16Bit) {
3580 if (RI.hasVGPRs(DstRC))
3581 return false; // Do not clobber vgpr_hi16
3582
3583 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3584 return false;
3585 }
3586
3587 MachineFunction *MF = UseMI.getMF();
3588
3589 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3590 MCRegister MovDstPhysReg =
3591 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3592
3593 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3594
3595 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3596 for (unsigned MovOp :
3597 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3598 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3599 const MCInstrDesc &MovDesc = get(MovOp);
3600
3601 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI, *MF);
3602 if (Is16Bit) {
3603 // We just need to find a correctly sized register class, so the
3604 // subregister index compatibility doesn't matter since we're statically
3605 // extracting the immediate value.
3606 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3607 if (!MovDstRC)
3608 continue;
3609
3610 if (MovDstPhysReg) {
3611 // FIXME: We probably should not do this. If there is a live value in
3612 // the high half of the register, it will be corrupted.
3613 MovDstPhysReg =
3614 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3615 if (!MovDstPhysReg)
3616 continue;
3617 }
3618 }
3619
3620 // Result class isn't the right size, try the next instruction.
3621 if (MovDstPhysReg) {
3622 if (!MovDstRC->contains(MovDstPhysReg))
3623 return false;
3624 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3625 // TODO: This will be overly conservative in the case of 16-bit virtual
3626 // SGPRs. We could hack up the virtual register uses to use a compatible
3627 // 32-bit class.
3628 continue;
3629 }
3630
3631 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3632
3633 // Ensure the interpreted immediate value is a valid operand in the new
3634 // mov.
3635 //
3636 // FIXME: isImmOperandLegal should have form that doesn't require existing
3637 // MachineInstr or MachineOperand
3638 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3639 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3640 break;
3641
3642 NewOpc = MovOp;
3643 break;
3644 }
3645
3646 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3647 return false;
3648
3649 if (Is16Bit) {
3650 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3651 if (MovDstPhysReg)
3652 UseMI.getOperand(0).setReg(MovDstPhysReg);
3653 assert(UseMI.getOperand(1).getReg().isVirtual());
3654 }
3655
3656 const MCInstrDesc &NewMCID = get(NewOpc);
3657 UseMI.setDesc(NewMCID);
3658 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3659 UseMI.addImplicitDefUseOperands(*MF);
3660 return true;
3661 }
3662
3663 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3664 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3665 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3666 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3667 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3668 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3669 Opc == AMDGPU::V_FMAC_F64_e64) {
3670 // Don't fold if we are using source or output modifiers. The new VOP2
3671 // instructions don't have them.
3672 if (hasAnyModifiersSet(UseMI))
3673 return false;
3674
3675 // If this is a free constant, there's no reason to do this.
3676 // TODO: We could fold this here instead of letting SIFoldOperands do it
3677 // later.
3678 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3679
3680 // Any src operand can be used for the legality check.
3681 if (isInlineConstant(UseMI, Src0Idx, Imm))
3682 return false;
3683
3684 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3685
3686 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3687 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3688
3689 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3690 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3691 (Src1->isReg() && Src1->getReg() == Reg)) {
3692 MachineOperand *RegSrc =
3693 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3694 if (!RegSrc->isReg())
3695 return false;
3696 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3697 ST.getConstantBusLimit(Opc) < 2)
3698 return false;
3699
3700 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3701 return false;
3702
3703 // If src2 is also a literal constant then we have to choose which one to
3704 // fold. In general it is better to choose madak so that the other literal
3705 // can be materialized in an sgpr instead of a vgpr:
3706 // s_mov_b32 s0, literal
3707 // v_madak_f32 v0, s0, v0, literal
3708 // Instead of:
3709 // v_mov_b32 v1, literal
3710 // v_madmk_f32 v0, v0, literal, v1
3711 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3712 if (Def && Def->isMoveImmediate() &&
3713 !isInlineConstant(Def->getOperand(1)))
3714 return false;
3715
3716 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3717 if (pseudoToMCOpcode(NewOpc) == -1)
3718 return false;
3719
3720 // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3721 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3722 // restricting their register classes. For now just bail out.
3723 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3724 NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3725 return false;
3726
3727 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3728 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3729
3730 // FIXME: This would be a lot easier if we could return a new instruction
3731 // instead of having to modify in place.
3732
3733 Register SrcReg = RegSrc->getReg();
3734 unsigned SrcSubReg = RegSrc->getSubReg();
3735 Src0->setReg(SrcReg);
3736 Src0->setSubReg(SrcSubReg);
3737 Src0->setIsKill(RegSrc->isKill());
3738
3739 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3740 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3741 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3742 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3743 UseMI.untieRegOperand(
3744 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3745
3746 Src1->ChangeToImmediate(*SubRegImm);
3747
3749 UseMI.setDesc(get(NewOpc));
3750
3751 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3752 if (DeleteDef)
3753 DefMI.eraseFromParent();
3754
3755 return true;
3756 }
3757
3758 // Added part is the constant: Use v_madak_{f16, f32}.
3759 if (Src2->isReg() && Src2->getReg() == Reg) {
3760 if (ST.getConstantBusLimit(Opc) < 2) {
3761 // Not allowed to use constant bus for another operand.
3762 // We can however allow an inline immediate as src0.
3763 bool Src0Inlined = false;
3764 if (Src0->isReg()) {
3765 // Try to inline the constant if possible.
3766 // If the def is a move of an immediate and this is its only use,
3767 // we save a VGPR here.
3768 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3769 if (Def && Def->isMoveImmediate() &&
3770 isInlineConstant(Def->getOperand(1)) &&
3771 MRI->hasOneUse(Src0->getReg())) {
3772 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3773 Src0Inlined = true;
3774 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3775 RI.isSGPRReg(*MRI, Src0->getReg())) {
3776 return false;
3777 }
3778 // VGPR is okay as Src0 - fallthrough
3779 }
3780
3781 if (Src1->isReg() && !Src0Inlined) {
3782 // We have one slot for inlinable constant so far - try to fill it
3783 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3784 if (Def && Def->isMoveImmediate() &&
3785 isInlineConstant(Def->getOperand(1)) &&
3786 MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3787 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3788 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3789 return false;
3790 // VGPR is okay as Src1 - fallthrough
3791 }
3792 }
3793
3794 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3795 if (pseudoToMCOpcode(NewOpc) == -1)
3796 return false;
3797
3798 // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3799 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3800 // restricting their register classes. For now just bail out.
3801 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3802 NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3803 return false;
3804
3805 // FIXME: This would be a lot easier if we could return a new instruction
3806 // instead of having to modify in place.
3807
3808 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3809 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3810 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3811 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3812 UseMI.untieRegOperand(
3813 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3814
3815 const std::optional<int64_t> SubRegImm =
3816 extractSubregFromImm(Imm, Src2->getSubReg());
3817
3818 // ChangingToImmediate adds Src2 back to the instruction.
3819 Src2->ChangeToImmediate(*SubRegImm);
3820
3821 // These come before src2.
3822 removeModOperands(UseMI);
3823 UseMI.setDesc(get(NewOpc));
3824 // It might happen that UseMI was commuted and we now have an SGPR as
3825 // src1. If so, having both the inlined literal constant and an SGPR is
3826 // illegal, so legalize the operands.
3827 legalizeOperands(UseMI);
3828
3829 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3830 if (DeleteDef)
3831 DefMI.eraseFromParent();
3832
3833 return true;
3834 }
3835 }
3836
3837 return false;
3838}
3839
3840static bool
3841memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3842 ArrayRef<const MachineOperand *> BaseOps2) {
3843 if (BaseOps1.size() != BaseOps2.size())
3844 return false;
3845 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3846 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3847 return false;
3848 }
3849 return true;
3850}
3851
3852static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3853 LocationSize WidthB, int OffsetB) {
3854 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3855 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3856 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3857 return LowWidth.hasValue() &&
3858 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3859}
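// Worked example: WidthA = 4 at OffsetA = 0 against WidthB = 4 at
// OffsetB = 4 gives LowOffset + LowWidth == 4 <= HighOffset, so the ranges
// are disjoint; with WidthA = 8 the sum becomes 8 > 4 and the accesses may
// overlap, so the function returns false.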
3860
3861bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3862 const MachineInstr &MIb) const {
3863 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3864 int64_t Offset0, Offset1;
3865 LocationSize Dummy0 = LocationSize::precise(0);
3866 LocationSize Dummy1 = LocationSize::precise(0);
3867 bool Offset0IsScalable, Offset1IsScalable;
3868 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3869 Dummy0, &RI) ||
3870 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3871 Dummy1, &RI))
3872 return false;
3873
3874 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3875 return false;
3876
3877 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3878 // FIXME: Handle ds_read2 / ds_write2.
3879 return false;
3880 }
3881 LocationSize Width0 = MIa.memoperands().front()->getSize();
3882 LocationSize Width1 = MIb.memoperands().front()->getSize();
3883 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3884}
3885
3886bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3887 const MachineInstr &MIb) const {
3888 assert(MIa.mayLoadOrStore() &&
3889 "MIa must load from or modify a memory location");
3890 assert(MIb.mayLoadOrStore() &&
3891 "MIb must load from or modify a memory location");
3892
3893 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3894 return false;
3895
3896 // XXX - Can we relax this between address spaces?
3897 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3898 return false;
3899
3900 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3901 return false;
3902
3903 // TODO: Should we check the address space from the MachineMemOperand? That
3904 // would allow us to distinguish objects we know don't alias based on the
3905 // underlying address space, even if it was lowered to a different one,
3906 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3907 // buffer.
3908 if (isDS(MIa)) {
3909 if (isDS(MIb))
3910 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3911
3912 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3913 }
3914
3915 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3916 if (isMUBUF(MIb) || isMTBUF(MIb))
3917 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3918
3919 if (isFLAT(MIb))
3920 return isFLATScratch(MIb);
3921
3922 return !isSMRD(MIb);
3923 }
3924
3925 if (isSMRD(MIa)) {
3926 if (isSMRD(MIb))
3927 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3928
3929 if (isFLAT(MIb))
3930 return isFLATScratch(MIb);
3931
3932 return !isMUBUF(MIb) && !isMTBUF(MIb);
3933 }
3934
3935 if (isFLAT(MIa)) {
3936 if (isFLAT(MIb)) {
3937 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3938 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3939 return true;
3940
3941 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3942 }
3943
3944 return false;
3945 }
3946
3947 return false;
3948}
3949
3950static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3951 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3952 if (Reg.isPhysical())
3953 return false;
3954 auto *Def = MRI.getUniqueVRegDef(Reg);
3955 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3956 Imm = Def->getOperand(1).getImm();
3957 if (DefMI)
3958 *DefMI = Def;
3959 return true;
3960 }
3961 return false;
3962}
3963
3964static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3965 MachineInstr **DefMI = nullptr) {
3966 if (!MO->isReg())
3967 return false;
3968 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3969 const MachineRegisterInfo &MRI = MF->getRegInfo();
3970 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3971}
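// Example (sketch, MIR-like): with a definition such as
//   %imm:vgpr_32 = V_MOV_B32_e32 42, implicit $exec
// getFoldableImm on an operand reading %imm returns true, sets Imm to 42 and,
// if requested, points *DefMI at the V_MOV so the caller can later kill it.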
3972
3973static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3974 MachineInstr &NewMI) {
3975 if (LV) {
3976 unsigned NumOps = MI.getNumOperands();
3977 for (unsigned I = 1; I < NumOps; ++I) {
3978 MachineOperand &Op = MI.getOperand(I);
3979 if (Op.isReg() && Op.isKill())
3980 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3981 }
3982 }
3983}
3984
3985static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
3986 switch (Opc) {
3987 case AMDGPU::V_MAC_F16_e32:
3988 case AMDGPU::V_MAC_F16_e64:
3989 return AMDGPU::V_MAD_F16_e64;
3990 case AMDGPU::V_MAC_F32_e32:
3991 case AMDGPU::V_MAC_F32_e64:
3992 return AMDGPU::V_MAD_F32_e64;
3993 case AMDGPU::V_MAC_LEGACY_F32_e32:
3994 case AMDGPU::V_MAC_LEGACY_F32_e64:
3995 return AMDGPU::V_MAD_LEGACY_F32_e64;
3996 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3997 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3998 return AMDGPU::V_FMA_LEGACY_F32_e64;
3999 case AMDGPU::V_FMAC_F16_e32:
4000 case AMDGPU::V_FMAC_F16_e64:
4001 case AMDGPU::V_FMAC_F16_t16_e64:
4002 case AMDGPU::V_FMAC_F16_fake16_e64:
4003 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4004 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4005 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4006 : AMDGPU::V_FMA_F16_gfx9_e64;
4007 case AMDGPU::V_FMAC_F32_e32:
4008 case AMDGPU::V_FMAC_F32_e64:
4009 return AMDGPU::V_FMA_F32_e64;
4010 case AMDGPU::V_FMAC_F64_e32:
4011 case AMDGPU::V_FMAC_F64_e64:
4012 return AMDGPU::V_FMA_F64_e64;
4013 default:
4014 llvm_unreachable("invalid instruction");
4015 }
4016}
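// Sketch of the rewrite this mapping enables: the MAC/FMAC forms tie the
// accumulator to the destination, e.g.
//   v_fmac_f32 v0, v1, v2          ; v0 = v1 * v2 + v0
// while the mapped MAD/FMA form is a true three-address instruction,
//   v_fma_f32 v3, v1, v2, v0       ; v3 = v1 * v2 + v0
// so the result no longer has to overwrite the accumulator register.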
4017
4018MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4019 LiveVariables *LV,
4020 LiveIntervals *LIS) const {
4021 MachineBasicBlock &MBB = *MI.getParent();
4022 unsigned Opc = MI.getOpcode();
4023
4024 // Handle MFMA.
4025 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4026 if (NewMFMAOpc != -1) {
4027 MachineInstrBuilder MIB =
4028 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4029 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4030 MIB.add(MI.getOperand(I));
4031 updateLiveVariables(LV, MI, *MIB);
4032 if (LIS) {
4033 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4034 // SlotIndex of defs needs to be updated when converting to early-clobber
4035 MachineOperand &Def = MIB->getOperand(0);
4036 if (Def.isEarlyClobber() && Def.isReg() &&
4037 LIS->hasInterval(Def.getReg())) {
4038 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
4039 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
4040 auto &LI = LIS->getInterval(Def.getReg());
4041 auto UpdateDefIndex = [&](LiveRange &LR) {
4042 auto *S = LR.find(OldIndex);
4043 if (S != LR.end() && S->start == OldIndex) {
4044 assert(S->valno && S->valno->def == OldIndex);
4045 S->start = NewIndex;
4046 S->valno->def = NewIndex;
4047 }
4048 };
4049 UpdateDefIndex(LI);
4050 for (auto &SR : LI.subranges())
4051 UpdateDefIndex(SR);
4052 }
4053 }
4054 return MIB;
4055 }
4056
4057 if (SIInstrInfo::isWMMA(MI)) {
4058 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4059 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4060 .setMIFlags(MI.getFlags());
4061 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4062 MIB->addOperand(MI.getOperand(I));
4063
4064 updateLiveVariables(LV, MI, *MIB);
4065 if (LIS)
4066 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4067
4068 return MIB;
4069 }
4070
4071 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4072 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4073 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4074 "present pre-RA");
4075
4076 // Handle MAC/FMAC.
4077 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4078 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4079 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4080 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4081 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4082 bool Src0Literal = false;
4083
4084 switch (Opc) {
4085 default:
4086 return nullptr;
4087 case AMDGPU::V_MAC_F16_e64:
4088 case AMDGPU::V_FMAC_F16_e64:
4089 case AMDGPU::V_FMAC_F16_t16_e64:
4090 case AMDGPU::V_FMAC_F16_fake16_e64:
4091 case AMDGPU::V_MAC_F32_e64:
4092 case AMDGPU::V_MAC_LEGACY_F32_e64:
4093 case AMDGPU::V_FMAC_F32_e64:
4094 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4095 case AMDGPU::V_FMAC_F64_e64:
4096 break;
4097 case AMDGPU::V_MAC_F16_e32:
4098 case AMDGPU::V_FMAC_F16_e32:
4099 case AMDGPU::V_MAC_F32_e32:
4100 case AMDGPU::V_MAC_LEGACY_F32_e32:
4101 case AMDGPU::V_FMAC_F32_e32:
4102 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4103 case AMDGPU::V_FMAC_F64_e32: {
4104 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4105 AMDGPU::OpName::src0);
4106 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4107 if (!Src0->isReg() && !Src0->isImm())
4108 return nullptr;
4109
4110 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4111 Src0Literal = true;
4112
4113 break;
4114 }
4115 }
4116
4117 MachineInstrBuilder MIB;
4118 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4119 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4120 const MachineOperand *Src0Mods =
4121 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4122 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4123 const MachineOperand *Src1Mods =
4124 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4125 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4126 const MachineOperand *Src2Mods =
4127 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4128 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4129 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4130 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4131
4132 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4133 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4134 // If we have an SGPR input, we will violate the constant bus restriction.
4135 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4136 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4137 MachineInstr *DefMI;
4138 const auto killDef = [&]() -> void {
4139 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4140 // The only user is the instruction which will be killed.
4141 Register DefReg = DefMI->getOperand(0).getReg();
4142
4143 if (MRI.hasOneNonDBGUse(DefReg)) {
4144 // We cannot just remove DefMI here; the calling pass would crash.
4145 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
4146 DefMI->getOperand(0).setIsDead(true);
4147 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
4148 DefMI->removeOperand(I);
4149 if (LV)
4150 LV->getVarInfo(DefReg).AliveBlocks.clear();
4151 }
4152
4153 if (LIS) {
4154 LiveInterval &DefLI = LIS->getInterval(DefReg);
4155
4156 // We cannot delete the original instruction here, so hack out the use
4157 // in the original instruction with a dummy register so we can use
4158 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4159 // not have the complexity of deleting a use to consider here.
4160 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4161 for (MachineOperand &MIOp : MI.uses()) {
4162 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4163 MIOp.setIsUndef(true);
4164 MIOp.setReg(DummyReg);
4165 }
4166 }
4167
4168 LIS->shrinkToUses(&DefLI);
4169 }
4170 };
4171
4172 int64_t Imm;
4173 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4174 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4175 if (pseudoToMCOpcode(NewOpc) != -1) {
4176 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4177 .add(*Dst)
4178 .add(*Src0)
4179 .add(*Src1)
4180 .addImm(Imm)
4181 .setMIFlags(MI.getFlags());
4182 updateLiveVariables(LV, MI, *MIB);
4183 if (LIS)
4184 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4185 killDef();
4186 return MIB;
4187 }
4188 }
4189 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4190 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4191 if (pseudoToMCOpcode(NewOpc) != -1) {
4192 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4193 .add(*Dst)
4194 .add(*Src0)
4195 .addImm(Imm)
4196 .add(*Src2)
4197 .setMIFlags(MI.getFlags());
4198 updateLiveVariables(LV, MI, *MIB);
4199
4200 if (LIS)
4201 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4202 killDef();
4203 return MIB;
4204 }
4205 }
4206 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4207 if (Src0Literal) {
4208 Imm = Src0->getImm();
4209 DefMI = nullptr;
4210 }
4211 if (pseudoToMCOpcode(NewOpc) != -1 &&
4212 isOperandLegal(
4213 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4214 Src1)) {
4215 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4216 .add(*Dst)
4217 .add(*Src1)
4218 .addImm(Imm)
4219 .add(*Src2)
4220 .setMIFlags(MI.getFlags());
4221 updateLiveVariables(LV, MI, *MIB);
4222
4223 if (LIS)
4224 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4225 if (DefMI)
4226 killDef();
4227 return MIB;
4228 }
4229 }
4230 }
4231
4232 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4233 // if VOP3 does not allow a literal operand.
4234 if (Src0Literal && !ST.hasVOP3Literal())
4235 return nullptr;
4236
4237 unsigned NewOpc = getNewFMAInst(ST, Opc);
4238
4239 if (pseudoToMCOpcode(NewOpc) == -1)
4240 return nullptr;
4241
4242 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4243 .add(*Dst)
4244 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4245 .add(*Src0)
4246 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4247 .add(*Src1)
4248 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4249 .add(*Src2)
4250 .addImm(Clamp ? Clamp->getImm() : 0)
4251 .addImm(Omod ? Omod->getImm() : 0)
4252 .setMIFlags(MI.getFlags());
4253 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4254 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4255 updateLiveVariables(LV, MI, *MIB);
4256 if (LIS)
4257 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4258 return MIB;
4259}
4260
4261// It's not generally safe to move VALU instructions across these since it will
4262// start using the register as a base index rather than directly.
4263// XXX - Why isn't hasSideEffects sufficient for these?
4264static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4265 switch (MI.getOpcode()) {
4266 case AMDGPU::S_SET_GPR_IDX_ON:
4267 case AMDGPU::S_SET_GPR_IDX_MODE:
4268 case AMDGPU::S_SET_GPR_IDX_OFF:
4269 return true;
4270 default:
4271 return false;
4272 }
4273}
4274
4275bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4276 const MachineBasicBlock *MBB,
4277 const MachineFunction &MF) const {
4278 // Skipping the check for SP writes in the base implementation. The reason it
4279 // was added was apparently due to compile time concerns.
4280 //
4281 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4282 // but is probably avoidable.
4283
4284 // Copied from base implementation.
4285 // Terminators and labels can't be scheduled around.
4286 if (MI.isTerminator() || MI.isPosition())
4287 return true;
4288
4289 // INLINEASM_BR can jump to another block
4290 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4291 return true;
4292
4293 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4294 return true;
4295
4296 // Target-independent instructions do not have an implicit-use of EXEC, even
4297 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4298 // boundaries prevents incorrect movements of such instructions.
4299 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4300 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4301 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4302 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4303 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4304 changesVGPRIndexingMode(MI);
4305}
4306
4307bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4308 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4309 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4310 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4311}
4312
4313bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
4314 if (!isFLAT(MI) || isFLATGlobal(MI))
4315 return false;
4316
4317 // If scratch is not initialized, we can never access it.
4318 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4319 return false;
4320
4321 // SCRATCH instructions always access scratch.
4322 if (isFLATScratch(MI))
4323 return true;
4324
4325 // If there are no memory operands then conservatively assume the flat
4326 // operation may access scratch.
4327 if (MI.memoperands_empty())
4328 return true;
4329
4330 // See if any memory operand specifies an address space that involves scratch.
4331 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4332 unsigned AS = Memop->getAddrSpace();
4333 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4334 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4335 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4336 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4337 }
4338 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4339 });
4340}
4341
4342bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4343 // Skip the full operand and register alias search modifiesRegister
4344 // does. There's only a handful of instructions that touch this, it's only an
4345 // implicit def, and doesn't alias any other registers.
4346 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4347}
4348
4349bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4350 unsigned Opcode = MI.getOpcode();
4351
4352 if (MI.mayStore() && isSMRD(MI))
4353 return true; // scalar store or atomic
4354
4355 // This will terminate the function when other lanes may need to continue.
4356 if (MI.isReturn())
4357 return true;
4358
4359 // These instructions cause shader I/O that may cause hardware lockups
4360 // when executed with an empty EXEC mask.
4361 //
4362 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4363 // EXEC = 0, but checking for that case here seems not worth it
4364 // given the typical code patterns.
4365 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4366 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4367 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4368 return true;
4369
4370 if (MI.isCall() || MI.isInlineAsm())
4371 return true; // conservative assumption
4372
4373 // Assume that barrier interactions are only intended with active lanes.
4374 if (isBarrier(Opcode))
4375 return true;
4376
4377 // A mode change is a scalar operation that influences vector instructions.
4378 if (modifiesModeRegister(MI))
4379 return true;
4380
4381 // These are like SALU instructions in terms of effects, so it's questionable
4382 // whether we should return true for those.
4383 //
4384 // However, executing them with EXEC = 0 causes them to operate on undefined
4385 // data, which we avoid by returning true here.
4386 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4387 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4388 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4389 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4390 return true;
4391
4392 return false;
4393}
4394
4395bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4396 const MachineInstr &MI) const {
4397 if (MI.isMetaInstruction())
4398 return false;
4399
4400 // This won't read exec if this is an SGPR->SGPR copy.
4401 if (MI.isCopyLike()) {
4402 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4403 return true;
4404
4405 // Make sure this isn't copying exec as a normal operand
4406 return MI.readsRegister(AMDGPU::EXEC, &RI);
4407 }
4408
4409 // Make a conservative assumption about the callee.
4410 if (MI.isCall())
4411 return true;
4412
4413 // Be conservative with any unhandled generic opcodes.
4414 if (!isTargetSpecificOpcode(MI.getOpcode()))
4415 return true;
4416
4417 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4418}
4419
4420bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4421 switch (Imm.getBitWidth()) {
4422 case 1: // This likely will be a condition code mask.
4423 return true;
4424
4425 case 32:
4426 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4427 ST.hasInv2PiInlineImm());
4428 case 64:
4429 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4430 ST.hasInv2PiInlineImm());
4431 case 16:
4432 return ST.has16BitInsts() &&
4433 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4434 ST.hasInv2PiInlineImm());
4435 default:
4436 llvm_unreachable("invalid bitwidth");
4437 }
4438}
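// For reference, the 32-bit inline constants accepted here are the integers
// -16..64, +/-0.5, +/-1.0, +/-2.0, +/-4.0 and, when the subtarget supports
// it, 1/(2*pi); e.g. APInt(32, 64) is inlinable while APInt(32, 65) would
// need a literal slot.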
4439
4440bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4441 APInt IntImm = Imm.bitcastToAPInt();
4442 int64_t IntImmVal = IntImm.getSExtValue();
4443 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4444 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4445 default:
4446 llvm_unreachable("invalid fltSemantics");
4449 return isInlineConstant(IntImm);
4451 return ST.has16BitInsts() &&
4452 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4454 return ST.has16BitInsts() &&
4455 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4456 }
4457}
4458
4459bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4460 // MachineOperand provides no way to tell the true operand size, since it only
4461 // records a 64-bit value. We need to know the size to determine if a 32-bit
4462 // floating point immediate bit pattern is legal for an integer immediate. It
4463 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4464 switch (OperandType) {
4474 int32_t Trunc = static_cast<int32_t>(Imm);
4475 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4476 }
4485 // We would expect inline immediates to not be concerned with an integer/fp
4486 // distinction. However, in the case of 16-bit integer operations, the
4487 // "floating point" values appear to not work. It seems read the low 16-bits
4488 // of 32-bit immediates, which happens to always work for the integer
4489 // values.
4490 //
4491 // See llvm bugzilla 46302.
4492 //
4493 // TODO: Theoretically we could use op-sel to use the high bits of the
4494 // 32-bit FP values.
4506 return false;
4509 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4510 // A few special case instructions have 16-bit operands on subtargets
4511 // where 16-bit instructions are not legal.
4512 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4513 // constants in these cases
4514 int16_t Trunc = static_cast<int16_t>(Imm);
4515 return ST.has16BitInsts() &&
4517 }
4518
4519 return false;
4520 }
4523 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4524 int16_t Trunc = static_cast<int16_t>(Imm);
4525 return ST.has16BitInsts() &&
4527 }
4528 return false;
4529 }
4533 return false;
4535 return isLegalAV64PseudoImm(Imm);
4538 // Always embedded in the instruction for free.
4539 return true;
4549 // Just ignore anything else.
4550 return true;
4551 default:
4552 llvm_unreachable("invalid operand type");
4553 }
4554}
4555
4556static bool compareMachineOp(const MachineOperand &Op0,
4557 const MachineOperand &Op1) {
4558 if (Op0.getType() != Op1.getType())
4559 return false;
4560
4561 switch (Op0.getType()) {
4563 return Op0.getReg() == Op1.getReg();
4565 return Op0.getImm() == Op1.getImm();
4566 default:
4567 llvm_unreachable("Didn't expect to be comparing these operand types");
4568 }
4569}
4570
4571bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4572 const MCOperandInfo &OpInfo) const {
4574 return true;
4575
4576 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4577 return false;
4578
4579 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4580 return true;
4581
4582 return ST.hasVOP3Literal();
4583}
4584
4585bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4586 int64_t ImmVal) const {
4587 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4588 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4589 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4590 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4591 AMDGPU::OpName::src2))
4592 return false;
4593 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4594 }
4595
4596 return isLiteralOperandLegal(InstDesc, OpInfo);
4597}
4598
4599bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4600 const MachineOperand &MO) const {
4601 if (MO.isImm())
4602 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4603
4604 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4605 "unexpected imm-like operand kind");
4606 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4607 return isLiteralOperandLegal(InstDesc, OpInfo);
4608}
4609
4611 // 2 32-bit inline constants packed into one.
4614}
4615
4616bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4617 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4618 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4619 return false;
4620
4621 int Op32 = AMDGPU::getVOPe32(Opcode);
4622 if (Op32 == -1)
4623 return false;
4624
4625 return pseudoToMCOpcode(Op32) != -1;
4626}
4627
4628bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4629 // The src0_modifier operand is present on all instructions
4630 // that have modifiers.
4631
4632 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4633}
4634
4635bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4636 AMDGPU::OpName OpName) const {
4637 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4638 return Mods && Mods->getImm();
4639}
4640
4641bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4642 return any_of(ModifierOpNames,
4643 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4644}
4645
4646bool SIInstrInfo::canShrink(const MachineInstr &MI,
4647 const MachineRegisterInfo &MRI) const {
4648 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4649 // Can't shrink instruction with three operands.
4650 if (Src2) {
4651 switch (MI.getOpcode()) {
4652 default: return false;
4653
4654 case AMDGPU::V_ADDC_U32_e64:
4655 case AMDGPU::V_SUBB_U32_e64:
4656 case AMDGPU::V_SUBBREV_U32_e64: {
4657 const MachineOperand *Src1
4658 = getNamedOperand(MI, AMDGPU::OpName::src1);
4659 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4660 return false;
4661 // Additional verification is needed for sdst/src2.
4662 return true;
4663 }
4664 case AMDGPU::V_MAC_F16_e64:
4665 case AMDGPU::V_MAC_F32_e64:
4666 case AMDGPU::V_MAC_LEGACY_F32_e64:
4667 case AMDGPU::V_FMAC_F16_e64:
4668 case AMDGPU::V_FMAC_F16_t16_e64:
4669 case AMDGPU::V_FMAC_F16_fake16_e64:
4670 case AMDGPU::V_FMAC_F32_e64:
4671 case AMDGPU::V_FMAC_F64_e64:
4672 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4673 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4674 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4675 return false;
4676 break;
4677
4678 case AMDGPU::V_CNDMASK_B32_e64:
4679 break;
4680 }
4681 }
4682
4683 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4684 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4685 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4686 return false;
4687
4688 // We don't need to check src0, all input types are legal, so just make sure
4689 // src0 isn't using any modifiers.
4690 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4691 return false;
4692
4693 // Can it be shrunk to a valid 32 bit opcode?
4694 if (!hasVALU32BitEncoding(MI.getOpcode()))
4695 return false;
4696
4697 // Check output modifiers
4698 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4699 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4700 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4701 // TODO: Can we avoid checking bound_ctrl/fi here?
4702 // They are only used by permlane*_swap special case.
4703 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4704 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4705}
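// Example: a plain V_ADD_F32_e64 %d, %a, %b with no modifiers and %b in a
// VGPR satisfies the checks above and can shrink to V_ADD_F32_e32, while the
// same add with clamp/omod set, or with %b held in an SGPR, must stay in the
// VOP3 encoding.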
4706
4707// Set VCC operand with all flags from \p Orig, except for setting it as
4708// implicit.
4709static void copyFlagsToImplicitVCC(MachineInstr &MI,
4710 const MachineOperand &Orig) {
4711
4712 for (MachineOperand &Use : MI.implicit_operands()) {
4713 if (Use.isUse() &&
4714 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4715 Use.setIsUndef(Orig.isUndef());
4716 Use.setIsKill(Orig.isKill());
4717 return;
4718 }
4719 }
4720}
4721
4722MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4723 unsigned Op32) const {
4724 MachineBasicBlock *MBB = MI.getParent();
4725
4726 const MCInstrDesc &Op32Desc = get(Op32);
4727 MachineInstrBuilder Inst32 =
4728 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4729 .setMIFlags(MI.getFlags());
4730
4731 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4732 // For VOPC instructions, this is replaced by an implicit def of vcc.
4733
4734 // We assume the defs of the shrunk opcode are in the same order, and the
4735 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4736 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4737 Inst32.add(MI.getOperand(I));
4738
4739 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4740
4741 int Idx = MI.getNumExplicitDefs();
4742 for (const MachineOperand &Use : MI.explicit_uses()) {
4743 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4744 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4745 continue;
4746
4747 if (&Use == Src2) {
4748 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4749 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4750 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4751 // of vcc was already added during the initial BuildMI, but we
4752 // 1) may need to change vcc to vcc_lo to preserve the original register
4753 // 2) have to preserve the original flags.
4754 copyFlagsToImplicitVCC(*Inst32, *Src2);
4755 continue;
4756 }
4757 }
4758
4759 Inst32.add(Use);
4760 }
4761
4762 // FIXME: Losing implicit operands
4763 fixImplicitOperands(*Inst32);
4764 return Inst32;
4765}
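// Example (sketch): shrinking a V_CNDMASK_B32_e64 drops the explicit carry
// operand; the resulting V_CNDMASK_B32_e32 reads the condition through the
// implicit vcc (or vcc_lo on wave32), and copyFlagsToImplicitVCC carries the
// original kill/undef flags of src2 over to that implicit use.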
4766
4768 // Null is free
4769 Register Reg = RegOp.getReg();
4770 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
4771 return false;
4772
4773 // SGPRs use the constant bus
4774
4775 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
4776 // physical register operands should also count, except for exec.
4777 if (RegOp.isImplicit())
4778 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
4779
4780 // SGPRs use the constant bus
4781 return AMDGPU::SReg_32RegClass.contains(Reg) ||
4782 AMDGPU::SReg_64RegClass.contains(Reg);
4783}
4784
4786 const MachineRegisterInfo &MRI) const {
4787 Register Reg = RegOp.getReg();
4788 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4789 : physRegUsesConstantBus(RegOp);
4790}
4791
4793 const MachineOperand &MO,
4794 const MCOperandInfo &OpInfo) const {
4795 // Literal constants use the constant bus.
4796 if (!MO.isReg())
4797 return !isInlineConstant(MO, OpInfo);
4798
4799 Register Reg = MO.getReg();
4800 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4801 : physRegUsesConstantBus(MO);
4802}
4803
4804static Register findImplicitSGPRRead(const MachineInstr &MI) {
4805 for (const MachineOperand &MO : MI.implicit_operands()) {
4806 // We only care about reads.
4807 if (MO.isDef())
4808 continue;
4809
4810 switch (MO.getReg()) {
4811 case AMDGPU::VCC:
4812 case AMDGPU::VCC_LO:
4813 case AMDGPU::VCC_HI:
4814 case AMDGPU::M0:
4815 case AMDGPU::FLAT_SCR:
4816 return MO.getReg();
4817
4818 default:
4819 break;
4820 }
4821 }
4822
4823 return Register();
4824}
4825
4826static bool shouldReadExec(const MachineInstr &MI) {
4827 if (SIInstrInfo::isVALU(MI)) {
4828 switch (MI.getOpcode()) {
4829 case AMDGPU::V_READLANE_B32:
4830 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4831 case AMDGPU::V_WRITELANE_B32:
4832 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4833 return false;
4834 }
4835
4836 return true;
4837 }
4838
4839 if (MI.isPreISelOpcode() ||
4840 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4841 SIInstrInfo::isSALU(MI) ||
4842 SIInstrInfo::isSMRD(MI))
4843 return false;
4844
4845 return true;
4846}
4847
4848static bool isRegOrFI(const MachineOperand &MO) {
4849 return MO.isReg() || MO.isFI();
4850}
4851
4852static bool isSubRegOf(const SIRegisterInfo &TRI,
4853 const MachineOperand &SuperVec,
4854 const MachineOperand &SubReg) {
4855 if (SubReg.getReg().isPhysical())
4856 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4857
4858 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4859 SubReg.getReg() == SuperVec.getReg();
4860}
4861
4862// Verify the illegal copy from vector register to SGPR for generic opcode COPY
4863bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
4864 const MachineRegisterInfo &MRI,
4865 StringRef &ErrInfo) const {
4866 Register DstReg = MI.getOperand(0).getReg();
4867 Register SrcReg = MI.getOperand(1).getReg();
4868 // This is a check for copy from vector register to SGPR
4869 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
4870 ErrInfo = "illegal copy from vector register to SGPR";
4871 return false;
4872 }
4873 return true;
4874}
4875
4876bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4877 StringRef &ErrInfo) const {
4878 uint16_t Opcode = MI.getOpcode();
4879 const MachineFunction *MF = MI.getParent()->getParent();
4880 const MachineRegisterInfo &MRI = MF->getRegInfo();
4881
4882 // FIXME: At this point the COPY verify is done only for non-ssa forms.
4883 // Find a better property to recognize the point where instruction selection
4884 // is just done.
4885 // We can only enforce this check after SIFixSGPRCopies pass so that the
4886 // illegal copies are legalized and thereafter we don't expect a pass
4887 // inserting similar copies.
4888 if (!MRI.isSSA() && MI.isCopy())
4889 return verifyCopy(MI, MRI, ErrInfo);
4890
4891 if (SIInstrInfo::isGenericOpcode(Opcode))
4892 return true;
4893
4894 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4895 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4896 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4897 int Src3Idx = -1;
4898 if (Src0Idx == -1) {
4899 // VOPD V_DUAL_* instructions use different operand names.
4900 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4901 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4902 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4903 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4904 }
4905
4906 // Make sure the number of operands is correct.
4907 const MCInstrDesc &Desc = get(Opcode);
4908 if (!Desc.isVariadic() &&
4909 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4910 ErrInfo = "Instruction has wrong number of operands.";
4911 return false;
4912 }
4913
4914 if (MI.isInlineAsm()) {
4915 // Verify register classes for inlineasm constraints.
4916 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4917 I != E; ++I) {
4918 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4919 if (!RC)
4920 continue;
4921
4922 const MachineOperand &Op = MI.getOperand(I);
4923 if (!Op.isReg())
4924 continue;
4925
4926 Register Reg = Op.getReg();
4927 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4928 ErrInfo = "inlineasm operand has incorrect register class.";
4929 return false;
4930 }
4931 }
4932
4933 return true;
4934 }
4935
4936 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4937 ErrInfo = "missing memory operand from image instruction.";
4938 return false;
4939 }
4940
4941 // Make sure the register classes are correct.
4942 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4943 const MachineOperand &MO = MI.getOperand(i);
4944 if (MO.isFPImm()) {
4945 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4946 "all fp values to integers.";
4947 return false;
4948 }
4949
4950 int RegClass = Desc.operands()[i].RegClass;
4951
4952 const MCOperandInfo &OpInfo = Desc.operands()[i];
4953 switch (OpInfo.OperandType) {
4954 case MCOI::OPERAND_REGISTER:
4955 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4956 ErrInfo = "Illegal immediate value for operand.";
4957 return false;
4958 }
4959 break;
4972 break;
4974 break;
4975 break;
4989 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
4990 ErrInfo = "Illegal immediate value for operand.";
4991 return false;
4992 }
4993 break;
4994 }
4996 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
4997 ErrInfo = "Expected inline constant for operand.";
4998 return false;
4999 }
5000 break;
5004 break;
5009 // Check if this operand is an immediate.
5010 // FrameIndex operands will be replaced by immediates, so they are
5011 // allowed.
5012 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5013 ErrInfo = "Expected immediate, but got non-immediate";
5014 return false;
5015 }
5016 break;
5020 break;
5021 default:
5022 if (OpInfo.isGenericType())
5023 continue;
5024 break;
5025 }
5026
5027 if (!MO.isReg())
5028 continue;
5029 Register Reg = MO.getReg();
5030 if (!Reg)
5031 continue;
5032
5033 // FIXME: Ideally we would have separate instruction definitions with the
5034 // aligned register constraint.
5035 // FIXME: We do not verify inline asm operands, but custom inline asm
5036 // verification is broken anyway
5037 if (ST.needsAlignedVGPRs()) {
5038 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5039 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5040 if (const TargetRegisterClass *SubRC =
5041 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5042 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5043 if (RC)
5044 RC = SubRC;
5045 }
5046 }
5047
5048 // Check that this is the aligned version of the class.
5049 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5050 ErrInfo = "Subtarget requires even aligned vector registers";
5051 return false;
5052 }
5053 }
5054
5055 if (RegClass != -1) {
5056 if (Reg.isVirtual())
5057 continue;
5058
5059 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5060 if (!RC->contains(Reg)) {
5061 ErrInfo = "Operand has incorrect register class.";
5062 return false;
5063 }
5064 }
5065 }
5066
5067 // Verify SDWA
5068 if (isSDWA(MI)) {
5069 if (!ST.hasSDWA()) {
5070 ErrInfo = "SDWA is not supported on this target";
5071 return false;
5072 }
5073
5074 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5075 AMDGPU::OpName::dst_sel}) {
5076 const MachineOperand *MO = getNamedOperand(MI, Op);
5077 if (!MO)
5078 continue;
5079 int64_t Imm = MO->getImm();
5080 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5081 ErrInfo = "Invalid SDWA selection";
5082 return false;
5083 }
5084 }
5085
5086 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5087
5088 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5089 if (OpIdx == -1)
5090 continue;
5091 const MachineOperand &MO = MI.getOperand(OpIdx);
5092
5093 if (!ST.hasSDWAScalar()) {
5094 // Only VGPRS on VI
5095 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5096 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5097 return false;
5098 }
5099 } else {
5100 // No immediates on GFX9
5101 if (!MO.isReg()) {
5102 ErrInfo =
5103 "Only reg allowed as operands in SDWA instructions on GFX9+";
5104 return false;
5105 }
5106 }
5107 }
5108
5109 if (!ST.hasSDWAOmod()) {
5110 // No omod allowed on VI
5111 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5112 if (OMod != nullptr &&
5113 (!OMod->isImm() || OMod->getImm() != 0)) {
5114 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5115 return false;
5116 }
5117 }
5118
5119 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5120 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5121 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5122 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5123 const MachineOperand *Src0ModsMO =
5124 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5125 unsigned Mods = Src0ModsMO->getImm();
5126 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5127 Mods & SISrcMods::SEXT) {
5128 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5129 return false;
5130 }
5131 }
5132
5133 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5134 if (isVOPC(BasicOpcode)) {
5135 if (!ST.hasSDWASdst() && DstIdx != -1) {
5136 // Only vcc allowed as dst on VI for VOPC
5137 const MachineOperand &Dst = MI.getOperand(DstIdx);
5138 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5139 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5140 return false;
5141 }
5142 } else if (!ST.hasSDWAOutModsVOPC()) {
5143 // No clamp allowed on GFX9 for VOPC
5144 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5145 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5146 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5147 return false;
5148 }
5149
5150 // No omod allowed on GFX9 for VOPC
5151 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5152 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5153 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5154 return false;
5155 }
5156 }
5157 }
5158
5159 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5160 if (DstUnused && DstUnused->isImm() &&
5161 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5162 const MachineOperand &Dst = MI.getOperand(DstIdx);
5163 if (!Dst.isReg() || !Dst.isTied()) {
5164 ErrInfo = "Dst register should have tied register";
5165 return false;
5166 }
5167
5168 const MachineOperand &TiedMO =
5169 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5170 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5171 ErrInfo =
5172 "Dst register should be tied to implicit use of preserved register";
5173 return false;
5174 }
5175 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5176 ErrInfo = "Dst register should use same physical register as preserved";
5177 return false;
5178 }
5179 }
5180 }
5181
5182 // Verify MIMG / VIMAGE / VSAMPLE
5183 if (isImage(Opcode) && !MI.mayStore()) {
5184 // Ensure that the return type used is large enough for all the options
5185 // being used; TFE/LWE require an extra result register.
5186 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5187 if (DMask) {
5188 uint64_t DMaskImm = DMask->getImm();
5189 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5190 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5191 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5192 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5193
5194 // Adjust for packed 16 bit values
5195 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5196 RegCount = divideCeil(RegCount, 2);
5197
5198 // Adjust if using LWE or TFE
5199 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5200 RegCount += 1;
5201
5202 const uint32_t DstIdx =
5203 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5204 const MachineOperand &Dst = MI.getOperand(DstIdx);
5205 if (Dst.isReg()) {
5206 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5207 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5208 if (RegCount > DstSize) {
5209 ErrInfo = "Image instruction returns too many registers for dst "
5210 "register class";
5211 return false;
5212 }
5213 }
5214 }
5215 }
5216
5217 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5218 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5219 unsigned ConstantBusCount = 0;
5220 bool UsesLiteral = false;
5221 const MachineOperand *LiteralVal = nullptr;
5222
5223 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5224 if (ImmIdx != -1) {
5225 ++ConstantBusCount;
5226 UsesLiteral = true;
5227 LiteralVal = &MI.getOperand(ImmIdx);
5228 }
5229
5230 SmallVector<Register, 2> SGPRsUsed;
5231 Register SGPRUsed;
5232
5233 // Only look at the true operands. Only a real operand can use the constant
5234 // bus, and we don't want to check pseudo-operands like the source modifier
5235 // flags.
5236 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5237 if (OpIdx == -1)
5238 continue;
5239 const MachineOperand &MO = MI.getOperand(OpIdx);
5240 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5241 if (MO.isReg()) {
5242 SGPRUsed = MO.getReg();
5243 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5244 ++ConstantBusCount;
5245 SGPRsUsed.push_back(SGPRUsed);
5246 }
5247 } else if (!MO.isFI()) { // Treat FI like a register.
5248 if (!UsesLiteral) {
5249 ++ConstantBusCount;
5250 UsesLiteral = true;
5251 LiteralVal = &MO;
5252 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5253 assert(isVOP2(MI) || isVOP3(MI));
5254 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5255 return false;
5256 }
5257 }
5258 }
5259 }
5260
5261 SGPRUsed = findImplicitSGPRRead(MI);
5262 if (SGPRUsed) {
5263 // Implicit uses may safely overlap true operands
5264 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5265 return !RI.regsOverlap(SGPRUsed, SGPR);
5266 })) {
5267 ++ConstantBusCount;
5268 SGPRsUsed.push_back(SGPRUsed);
5269 }
5270 }
5271
5272 // v_writelane_b32 is an exception from constant bus restriction:
5273 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5274 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5275 Opcode != AMDGPU::V_WRITELANE_B32) {
5276 ErrInfo = "VOP* instruction violates constant bus restriction";
5277 return false;
5278 }
5279
5280 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5281 ErrInfo = "VOP3 instruction uses literal";
5282 return false;
5283 }
5284 }
5285
5286 // Special case for writelane - this can break the multiple constant bus rule,
5287 // but still can't use more than one SGPR register
5288 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5289 unsigned SGPRCount = 0;
5290 Register SGPRUsed;
5291
5292 for (int OpIdx : {Src0Idx, Src1Idx}) {
5293 if (OpIdx == -1)
5294 break;
5295
5296 const MachineOperand &MO = MI.getOperand(OpIdx);
5297
5298 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5299 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5300 if (MO.getReg() != SGPRUsed)
5301 ++SGPRCount;
5302 SGPRUsed = MO.getReg();
5303 }
5304 }
5305 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5306 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5307 return false;
5308 }
5309 }
5310 }
5311
5312 // Verify misc. restrictions on specific instructions.
5313 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5314 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5315 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5316 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5317 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5318 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5319 if (!compareMachineOp(Src0, Src1) &&
5320 !compareMachineOp(Src0, Src2)) {
5321 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5322 return false;
5323 }
5324 }
5325 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5326 SISrcMods::ABS) ||
5327 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5328 SISrcMods::ABS) ||
5329 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5330 SISrcMods::ABS)) {
5331 ErrInfo = "ABS not allowed in VOP3B instructions";
5332 return false;
5333 }
5334 }
5335
5336 if (isSOP2(MI) || isSOPC(MI)) {
5337 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5338 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5339
5340 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5341 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5342 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5343 !Src0.isIdenticalTo(Src1)) {
5344 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5345 return false;
5346 }
5347 }
5348
5349 if (isSOPK(MI)) {
5350 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5351 if (Desc.isBranch()) {
5352 if (!Op->isMBB()) {
5353 ErrInfo = "invalid branch target for SOPK instruction";
5354 return false;
5355 }
5356 } else {
5357 uint64_t Imm = Op->getImm();
5358 if (sopkIsZext(Opcode)) {
5359 if (!isUInt<16>(Imm)) {
5360 ErrInfo = "invalid immediate for SOPK instruction";
5361 return false;
5362 }
5363 } else {
5364 if (!isInt<16>(Imm)) {
5365 ErrInfo = "invalid immediate for SOPK instruction";
5366 return false;
5367 }
5368 }
5369 }
5370 }
5371
5372 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5373 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5374 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5375 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5376 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5377 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5378
5379 const unsigned StaticNumOps =
5380 Desc.getNumOperands() + Desc.implicit_uses().size();
5381 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5382
5383 // Allow additional implicit operands. This allows a fixup done by the post
5384 // RA scheduler where the main implicit operand is killed and implicit-defs
5385 // are added for sub-registers that remain live after this instruction.
5386 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5387 ErrInfo = "missing implicit register operands";
5388 return false;
5389 }
5390
5391 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5392 if (IsDst) {
5393 if (!Dst->isUse()) {
5394 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5395 return false;
5396 }
5397
5398 unsigned UseOpIdx;
5399 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5400 UseOpIdx != StaticNumOps + 1) {
5401 ErrInfo = "movrel implicit operands should be tied";
5402 return false;
5403 }
5404 }
5405
5406 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5407 const MachineOperand &ImpUse
5408 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5409 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5410 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5411 ErrInfo = "src0 should be subreg of implicit vector use";
5412 return false;
5413 }
5414 }
5415
5416 // Make sure we aren't losing exec uses in the td files. This mostly requires
5417 // being careful when using let Uses to try to add other use registers.
5418 if (shouldReadExec(MI)) {
5419 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5420 ErrInfo = "VALU instruction does not implicitly read exec mask";
5421 return false;
5422 }
5423 }
5424
5425 if (isSMRD(MI)) {
5426 if (MI.mayStore() &&
5427 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5428 // The register offset form of scalar stores may only use m0 as the
5429 // soffset register.
5430 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5431 if (Soff && Soff->getReg() != AMDGPU::M0) {
5432 ErrInfo = "scalar stores must use m0 as offset register";
5433 return false;
5434 }
5435 }
5436 }
5437
5438 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5439 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5440 if (Offset->getImm() != 0) {
5441 ErrInfo = "subtarget does not support offsets in flat instructions";
5442 return false;
5443 }
5444 }
5445
5446 if (isDS(MI) && !ST.hasGDS()) {
5447 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5448 if (GDSOp && GDSOp->getImm() != 0) {
5449 ErrInfo = "GDS is not supported on this subtarget";
5450 return false;
5451 }
5452 }
5453
5454 if (isImage(MI)) {
5455 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5456 if (DimOp) {
5457 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5458 AMDGPU::OpName::vaddr0);
5459 AMDGPU::OpName RSrcOpName =
5460 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5461 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5462 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5463 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5464 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5465 const AMDGPU::MIMGDimInfo *Dim =
5466 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5467
5468 if (!Dim) {
5469 ErrInfo = "dim is out of range";
5470 return false;
5471 }
5472
5473 bool IsA16 = false;
5474 if (ST.hasR128A16()) {
5475 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5476 IsA16 = R128A16->getImm() != 0;
5477 } else if (ST.hasA16()) {
5478 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5479 IsA16 = A16->getImm() != 0;
5480 }
5481
5482 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
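// Note: in the NSA (non-sequential address) encoding each address component
// occupies its own vaddr operand, so the operand count between vaddr0 and the
// resource descriptor gives the address size in dwords directly; the non-NSA
// form packs all components into one contiguous VGPR tuple held in vaddr0.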
5483
5484 unsigned AddrWords =
5485 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5486
5487 unsigned VAddrWords;
5488 if (IsNSA) {
5489 VAddrWords = RsrcIdx - VAddr0Idx;
5490 if (ST.hasPartialNSAEncoding() &&
5491 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5492 unsigned LastVAddrIdx = RsrcIdx - 1;
5493 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5494 }
5495 } else {
5496 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5497 if (AddrWords > 12)
5498 AddrWords = 16;
5499 }
5500
5501 if (VAddrWords != AddrWords) {
5502 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5503 << " but got " << VAddrWords << "\n");
5504 ErrInfo = "bad vaddr size";
5505 return false;
5506 }
5507 }
5508 }
5509
5510 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5511 if (DppCt) {
5512 using namespace AMDGPU::DPP;
5513
5514 unsigned DC = DppCt->getImm();
5515 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5516 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5517 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5518 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5519 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5520 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5521 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5522 ErrInfo = "Invalid dpp_ctrl value";
5523 return false;
5524 }
5525 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5526 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5527 ErrInfo = "Invalid dpp_ctrl value: "
5528 "wavefront shifts are not supported on GFX10+";
5529 return false;
5530 }
5531 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5532 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5533 ErrInfo = "Invalid dpp_ctrl value: "
5534 "broadcasts are not supported on GFX10+";
5535 return false;
5536 }
5537 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5538 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5539 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5540 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5541 !ST.hasGFX90AInsts()) {
5542 ErrInfo = "Invalid dpp_ctrl value: "
5543 "row_newbroadcast/row_share is not supported before "
5544 "GFX90A/GFX10";
5545 return false;
5546 }
5547 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5548 ErrInfo = "Invalid dpp_ctrl value: "
5549 "row_share and row_xmask are not supported before GFX10";
5550 return false;
5551 }
5552 }
5553
5554 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5557 ErrInfo = "Invalid dpp_ctrl value: "
5558 "DP ALU dpp only support row_newbcast";
5559 return false;
5560 }
5561 }
5562
5563 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5564 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5565 AMDGPU::OpName DataName =
5566 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5567 const MachineOperand *Data = getNamedOperand(MI, DataName);
5568 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5569 if (Data && !Data->isReg())
5570 Data = nullptr;
5571
5572 if (ST.hasGFX90AInsts()) {
5573 if (Dst && Data &&
5574 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5575 ErrInfo = "Invalid register class: "
5576 "vdata and vdst should be both VGPR or AGPR";
5577 return false;
5578 }
5579 if (Data && Data2 &&
5580 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5581 ErrInfo = "Invalid register class: "
5582 "both data operands should be VGPR or AGPR";
5583 return false;
5584 }
5585 } else {
5586 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5587 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5588 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5589 ErrInfo = "Invalid register class: "
5590 "agpr loads and stores not supported on this GPU";
5591 return false;
5592 }
5593 }
5594 }
5595
5596 if (ST.needsAlignedVGPRs()) {
5597 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5598 const MachineOperand *Op = getNamedOperand(MI, OpName);
5599 if (!Op)
5600 return true;
5601 Register Reg = Op->getReg();
5602 if (Reg.isPhysical())
5603 return !(RI.getHWRegIndex(Reg) & 1);
5604 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5605 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5606 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5607 };
5608
5609 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5610 Opcode == AMDGPU::DS_GWS_BARRIER) {
5611
5612 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5613 ErrInfo = "Subtarget requires even aligned vector registers "
5614 "for DS_GWS instructions";
5615 return false;
5616 }
5617 }
5618
5619 if (isMIMG(MI)) {
5620 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5621 ErrInfo = "Subtarget requires even aligned vector registers "
5622 "for vaddr operand of image instructions";
5623 return false;
5624 }
5625 }
5626 }
5627
5628 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5629 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5630 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5631 ErrInfo = "Invalid register class: "
5632 "v_accvgpr_write with an SGPR is not supported on this GPU";
5633 return false;
5634 }
5635 }
5636
5637 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5638 const MachineOperand &SrcOp = MI.getOperand(1);
5639 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5640 ErrInfo = "pseudo expects only physical SGPRs";
5641 return false;
5642 }
5643 }
5644
5645 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5646 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5647 if (!ST.hasScaleOffset()) {
5648 ErrInfo = "Subtarget does not support offset scaling";
5649 return false;
5650 }
5651 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5652 ErrInfo = "Instruction does not support offset scaling";
5653 return false;
5654 }
5655 }
5656 }
5657
5658 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5659 // information.
5660 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5661 for (unsigned I = 0; I < 3; ++I) {
5662 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
5663 return false;
5664 }
5665 }
5666
5667 return true;
5668}
5669
5670// It is more readable to list mapped opcodes on the same line.
5671// clang-format off
5672
5673 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5674 switch (MI.getOpcode()) {
5675 default: return AMDGPU::INSTRUCTION_LIST_END;
5676 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5677 case AMDGPU::COPY: return AMDGPU::COPY;
5678 case AMDGPU::PHI: return AMDGPU::PHI;
5679 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5680 case AMDGPU::WQM: return AMDGPU::WQM;
5681 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5682 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5683 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5684 case AMDGPU::S_MOV_B32: {
5685 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5686 return MI.getOperand(1).isReg() ||
5687 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5688 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5689 }
5690 case AMDGPU::S_ADD_I32:
5691 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5692 case AMDGPU::S_ADDC_U32:
5693 return AMDGPU::V_ADDC_U32_e32;
5694 case AMDGPU::S_SUB_I32:
5695 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5696 // FIXME: These are not consistently handled, and selected when the carry is
5697 // used.
5698 case AMDGPU::S_ADD_U32:
5699 return AMDGPU::V_ADD_CO_U32_e32;
5700 case AMDGPU::S_SUB_U32:
5701 return AMDGPU::V_SUB_CO_U32_e32;
5702 case AMDGPU::S_ADD_U64_PSEUDO:
5703 return AMDGPU::V_ADD_U64_PSEUDO;
5704 case AMDGPU::S_SUB_U64_PSEUDO:
5705 return AMDGPU::V_SUB_U64_PSEUDO;
5706 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5707 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5708 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5709 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5710 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5711 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5712 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5713 case AMDGPU::S_XNOR_B32:
5714 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5715 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5716 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5717 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5718 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5719 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5720 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5721 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5722 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5723 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5724 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5725 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5726 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5727 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5728 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5729 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5730 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5731 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5732 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5733 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5734 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5735 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5736 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5737 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5738 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5739 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5740 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5741 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5742 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5743 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5744 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5745 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5746 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5747 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5748 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5749 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5750 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5751 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5752 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5753 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5754 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5755 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5756 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5757 case AMDGPU::S_CVT_F32_F16:
5758 case AMDGPU::S_CVT_HI_F32_F16:
5759 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5760 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5761 case AMDGPU::S_CVT_F16_F32:
5762 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5763 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5764 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5765 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5766 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5767 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5768 case AMDGPU::S_CEIL_F16:
5769 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5770 : AMDGPU::V_CEIL_F16_fake16_e64;
5771 case AMDGPU::S_FLOOR_F16:
5772 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5773 : AMDGPU::V_FLOOR_F16_fake16_e64;
5774 case AMDGPU::S_TRUNC_F16:
5775 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
5776 : AMDGPU::V_TRUNC_F16_fake16_e64;
5777 case AMDGPU::S_RNDNE_F16:
5778 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
5779 : AMDGPU::V_RNDNE_F16_fake16_e64;
5780 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5781 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5782 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5783 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5784 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5785 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5786 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5787 case AMDGPU::S_ADD_F16:
5788 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
5789 : AMDGPU::V_ADD_F16_fake16_e64;
5790 case AMDGPU::S_SUB_F16:
5791 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
5792 : AMDGPU::V_SUB_F16_fake16_e64;
5793 case AMDGPU::S_MIN_F16:
5794 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
5795 : AMDGPU::V_MIN_F16_fake16_e64;
5796 case AMDGPU::S_MAX_F16:
5797 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
5798 : AMDGPU::V_MAX_F16_fake16_e64;
5799 case AMDGPU::S_MINIMUM_F16:
5800 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
5801 : AMDGPU::V_MINIMUM_F16_fake16_e64;
5802 case AMDGPU::S_MAXIMUM_F16:
5803 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
5804 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
5805 case AMDGPU::S_MUL_F16:
5806 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
5807 : AMDGPU::V_MUL_F16_fake16_e64;
5808 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5809 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5810 case AMDGPU::S_FMAC_F16:
5811 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
5812 : AMDGPU::V_FMAC_F16_fake16_e64;
5813 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5814 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5815 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5816 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5817 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5818 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5819 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5820 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5821 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5822 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5823 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5824 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5825 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5826 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5827 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5828 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5829 case AMDGPU::S_CMP_LT_F16:
5830 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
5831 : AMDGPU::V_CMP_LT_F16_fake16_e64;
5832 case AMDGPU::S_CMP_EQ_F16:
5833 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
5834 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
5835 case AMDGPU::S_CMP_LE_F16:
5836 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
5837 : AMDGPU::V_CMP_LE_F16_fake16_e64;
5838 case AMDGPU::S_CMP_GT_F16:
5839 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
5840 : AMDGPU::V_CMP_GT_F16_fake16_e64;
5841 case AMDGPU::S_CMP_LG_F16:
5842 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
5843 : AMDGPU::V_CMP_LG_F16_fake16_e64;
5844 case AMDGPU::S_CMP_GE_F16:
5845 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
5846 : AMDGPU::V_CMP_GE_F16_fake16_e64;
5847 case AMDGPU::S_CMP_O_F16:
5848 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
5849 : AMDGPU::V_CMP_O_F16_fake16_e64;
5850 case AMDGPU::S_CMP_U_F16:
5851 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
5852 : AMDGPU::V_CMP_U_F16_fake16_e64;
5853 case AMDGPU::S_CMP_NGE_F16:
5854 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
5855 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
5856 case AMDGPU::S_CMP_NLG_F16:
5857 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
5858 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
5859 case AMDGPU::S_CMP_NGT_F16:
5860 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
5861 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
5862 case AMDGPU::S_CMP_NLE_F16:
5863 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
5864 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
5865 case AMDGPU::S_CMP_NEQ_F16:
5866 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
5867 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
5868 case AMDGPU::S_CMP_NLT_F16:
5869 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
5870 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
5871 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5872 case AMDGPU::V_S_EXP_F16_e64:
5873 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
5874 : AMDGPU::V_EXP_F16_fake16_e64;
5875 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5876 case AMDGPU::V_S_LOG_F16_e64:
5877 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
5878 : AMDGPU::V_LOG_F16_fake16_e64;
5879 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5880 case AMDGPU::V_S_RCP_F16_e64:
5881 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
5882 : AMDGPU::V_RCP_F16_fake16_e64;
5883 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5884 case AMDGPU::V_S_RSQ_F16_e64:
5885 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
5886 : AMDGPU::V_RSQ_F16_fake16_e64;
5887 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5888 case AMDGPU::V_S_SQRT_F16_e64:
5889 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
5890 : AMDGPU::V_SQRT_F16_fake16_e64;
5891 }
5893 "Unexpected scalar opcode without corresponding vector one!");
5894}
5895
5896// clang-format on
5897
5898 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5899 MachineBasicBlock &MBB,
5900 MachineBasicBlock::iterator MBBI,
5901 const DebugLoc &DL, Register Reg,
5902 bool IsSCCLive,
5903 SlotIndexes *Indexes) const {
5904 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5905 const SIInstrInfo *TII = ST.getInstrInfo();
5906 bool IsWave32 = ST.isWave32();
5907 if (IsSCCLive) {
5908 // Insert two move instructions, one to save the original value of EXEC and
5909 // the other to turn on all bits in EXEC. This is required as we can't use
5910 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
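// The emitted sequence is, for wave64:
//   s_mov_b64 <Reg>, exec
//   s_mov_b64 exec, -1
// (s_mov_b32 with exec_lo for wave32).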
5911 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5912 MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5913 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
5914 .addReg(Exec, RegState::Kill);
5915 auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
5916 if (Indexes) {
5917 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5918 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5919 }
5920 } else {
5921 const unsigned OrSaveExec =
5922 IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
5923 auto SaveExec =
5924 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
5925 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5926 if (Indexes)
5927 Indexes->insertMachineInstrInMaps(*SaveExec);
5928 }
5929}
5930
5931 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
5932 MachineBasicBlock::iterator MBBI,
5933 const DebugLoc &DL, Register Reg,
5934 SlotIndexes *Indexes) const {
5935 unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5936 MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5937 auto ExecRestoreMI =
5938 BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
5939 if (Indexes)
5940 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
5941}
5942
5946 "Not a whole wave func");
5947 MachineBasicBlock &MBB = *MF.begin();
5948 for (MachineInstr &MI : MBB)
5949 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
5950 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
5951 return &MI;
5952
5953 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
5954}
5955
5956static const TargetRegisterClass *
5957 adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
5958 const MCInstrDesc &TID, unsigned RCID,
5959 bool IsAllocatable) {
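// Note: the AV_* classes below may be allocated to either AGPRs or VGPRs; for
// the instruction kinds matched here they are narrowed to the equivalent
// VGPR-only class so the operand cannot be allocated to an AGPR where that is
// not supported.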
5960 if ((IsAllocatable || !ST.hasGFX90AInsts()) &&
5961 (((TID.mayLoad() || TID.mayStore()) &&
5962 !(TID.TSFlags & SIInstrFlags::Spill)) ||
5963 (TID.TSFlags & SIInstrFlags::MIMG))) {
5964 switch (RCID) {
5965 case AMDGPU::AV_32RegClassID:
5966 RCID = AMDGPU::VGPR_32RegClassID;
5967 break;
5968 case AMDGPU::AV_64RegClassID:
5969 RCID = AMDGPU::VReg_64RegClassID;
5970 break;
5971 case AMDGPU::AV_96RegClassID:
5972 RCID = AMDGPU::VReg_96RegClassID;
5973 break;
5974 case AMDGPU::AV_128RegClassID:
5975 RCID = AMDGPU::VReg_128RegClassID;
5976 break;
5977 case AMDGPU::AV_160RegClassID:
5978 RCID = AMDGPU::VReg_160RegClassID;
5979 break;
5980 case AMDGPU::AV_512RegClassID:
5981 RCID = AMDGPU::VReg_512RegClassID;
5982 break;
5983 default:
5984 break;
5985 }
5986 }
5987
5988 return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
5989}
5990
5991 const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
5992 unsigned OpNum, const TargetRegisterInfo *TRI,
5993 const MachineFunction &MF)
5994 const {
5995 if (OpNum >= TID.getNumOperands())
5996 return nullptr;
5997 auto RegClass = TID.operands()[OpNum].RegClass;
5998 bool IsAllocatable = false;
5999 if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
6000 // vdst and vdata should be both VGPR or AGPR, same for the DS instructions
6001 // with two data operands. Request a register class constrained to VGPR only
6002 // if both operands are present, as Machine Copy Propagation cannot check this
6003 // constraint (and possibly other passes cannot either).
6004 //
6005 // The check is limited to FLAT and DS because atomics in non-flat encoding
6006 // have their vdst and vdata tied to be the same register.
6007 const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
6008 AMDGPU::OpName::vdst);
6009 const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
6010 (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
6011 : AMDGPU::OpName::vdata);
6012 if (DataIdx != -1) {
6013 IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
6014 TID.Opcode, AMDGPU::OpName::data1);
6015 }
6016 }
6017 return adjustAllocatableRegClass(ST, RI, TID, RegClass, IsAllocatable);
6018}
6019
6020 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6021 unsigned OpNo) const {
6022 const MCInstrDesc &Desc = get(MI.getOpcode());
6023 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6024 Desc.operands()[OpNo].RegClass == -1) {
6025 Register Reg = MI.getOperand(OpNo).getReg();
6026
6027 if (Reg.isVirtual()) {
6028 const MachineRegisterInfo &MRI =
6029 MI.getParent()->getParent()->getRegInfo();
6030 return MRI.getRegClass(Reg);
6031 }
6032 return RI.getPhysRegBaseClass(Reg);
6033 }
6034
6035 unsigned RCID = Desc.operands()[OpNo].RegClass;
6036 return adjustAllocatableRegClass(ST, RI, Desc, RCID, true);
6037}
6038
6039 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
6040 MachineBasicBlock::iterator I = MI;
6041 MachineBasicBlock *MBB = MI.getParent();
6042 MachineOperand &MO = MI.getOperand(OpIdx);
6043 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6044 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
6045 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6046 unsigned Size = RI.getRegSizeInBits(*RC);
6047 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6048 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6049 : AMDGPU::V_MOV_B32_e32;
6050 if (MO.isReg())
6051 Opcode = AMDGPU::COPY;
6052 else if (RI.isSGPRClass(RC))
6053 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6054
6055 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6056 Register Reg = MRI.createVirtualRegister(VRC);
6058 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6059 MO.ChangeToRegister(Reg, false);
6060}
6061
6064 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6065 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6066 if (!SuperReg.getReg().isVirtual())
6067 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6068
6069 MachineBasicBlock *MBB = MI->getParent();
6070 const DebugLoc &DL = MI->getDebugLoc();
6071 Register SubReg = MRI.createVirtualRegister(SubRC);
6072
6073 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6074 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6075 .addReg(SuperReg.getReg(), 0, NewSubIdx);
6076 return SubReg;
6077}
6078
6079 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
6080 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
6081 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6082 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6083 if (Op.isImm()) {
6084 if (SubIdx == AMDGPU::sub0)
6085 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6086 if (SubIdx == AMDGPU::sub1)
6087 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6088
6089 llvm_unreachable("Unhandled register index for immediate");
6090 }
6091
6092 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6093 SubIdx, SubRC);
6094 return MachineOperand::CreateReg(SubReg, false);
6095}
6096
6097// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6098void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6099 assert(Inst.getNumExplicitOperands() == 3);
6100 MachineOperand Op1 = Inst.getOperand(1);
6101 Inst.removeOperand(1);
6102 Inst.addOperand(Op1);
6103}
6104
6105 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6106 const MCOperandInfo &OpInfo,
6107 const MachineOperand &MO) const {
6108 if (!MO.isReg())
6109 return false;
6110
6111 Register Reg = MO.getReg();
6112
6113 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
6114 if (Reg.isPhysical())
6115 return DRC->contains(Reg);
6116
6117 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6118
6119 if (MO.getSubReg()) {
6120 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
6121 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6122 if (!SuperRC)
6123 return false;
6124
6125 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
6126 if (!DRC)
6127 return false;
6128 }
6129 return RC->hasSuperClassEq(DRC);
6130}
6131
6132 bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6133 const MachineOperand &MO) const {
6134 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6135 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6136 unsigned Opc = MI.getOpcode();
6137
6138 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6139 // information.
6140 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6141 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6142 constexpr const AMDGPU::OpName OpNames[] = {
6143 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6144
6145 for (auto [I, OpName] : enumerate(OpNames)) {
6146 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6147 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6148 !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
6149 return false;
6150 }
6151 }
6152
6153 if (!isLegalRegOperand(MRI, OpInfo, MO))
6154 return false;
6155
6156 // check Accumulate GPR operand
6157 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6158 if (IsAGPR && !ST.hasMAIInsts())
6159 return false;
6160 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6161 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6162 return false;
6163 // Atomics should have both vdst and vdata either vgpr or agpr.
6164 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6165 const int DataIdx = AMDGPU::getNamedOperandIdx(
6166 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6167 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6168 MI.getOperand(DataIdx).isReg() &&
6169 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6170 return false;
6171 if ((int)OpIdx == DataIdx) {
6172 if (VDstIdx != -1 &&
6173 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6174 return false;
6175 // DS instructions with 2 src operands also must have tied RC.
6176 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6177 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6178 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6179 return false;
6180 }
6181
6182 // Check V_ACCVGPR_WRITE_B32_e64
6183 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6184 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6185 RI.isSGPRReg(MRI, MO.getReg()))
6186 return false;
6187 return true;
6188}
6189
6190 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6191 const MCOperandInfo &OpInfo,
6192 const MachineOperand &MO) const {
6193 if (MO.isReg())
6194 return isLegalRegOperand(MRI, OpInfo, MO);
6195
6196 // Handle non-register types that are treated like immediates.
6197 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6198 return true;
6199}
6200
6201 bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6202 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6203 const MachineOperand *MO) const {
6204 constexpr const unsigned NumOps = 3;
6205 constexpr const AMDGPU::OpName OpNames[NumOps * 2] = {
6206 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6207 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6208 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6209
6210 assert(SrcN < NumOps);
6211
6212 if (!MO) {
6213 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6214 if (SrcIdx == -1)
6215 return true;
6216 MO = &MI.getOperand(SrcIdx);
6217 }
6218
6219 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6220 return true;
6221
6222 int ModsIdx =
6223 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6224 if (ModsIdx == -1)
6225 return true;
6226
6227 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6228 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6229 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6230
6231 return !OpSel && !OpSelHi;
6232}
6233
6234 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6235 const MachineOperand *MO) const {
6236 const MachineFunction &MF = *MI.getParent()->getParent();
6237 const MachineRegisterInfo &MRI = MF.getRegInfo();
6238 const MCInstrDesc &InstDesc = MI.getDesc();
6239 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6240 const TargetRegisterClass *DefinedRC =
6241 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
6242 if (!MO)
6243 MO = &MI.getOperand(OpIdx);
6244
6245 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6246
6247 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6248 const MachineOperand *UsedLiteral = nullptr;
6249
6250 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6251 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6252
6253 // TODO: Be more permissive with frame indexes.
6254 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6255 if (!LiteralLimit--)
6256 return false;
6257
6258 UsedLiteral = MO;
6259 }
6260
6261 SmallDenseSet<RegSubRegPair> SGPRsUsed;
6262 if (MO->isReg())
6263 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6264
6265 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6266 if (i == OpIdx)
6267 continue;
6268 const MachineOperand &Op = MI.getOperand(i);
6269 if (Op.isReg()) {
6270 if (Op.isUse()) {
6271 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6272 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6273 if (--ConstantBusLimit <= 0)
6274 return false;
6275 }
6276 }
6277 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6278 !isInlineConstant(Op, InstDesc.operands()[i])) {
6279 // The same literal may be used multiple times.
6280 if (!UsedLiteral)
6281 UsedLiteral = &Op;
6282 else if (UsedLiteral->isIdenticalTo(Op))
6283 continue;
6284
6285 if (!LiteralLimit--)
6286 return false;
6287 if (--ConstantBusLimit <= 0)
6288 return false;
6289 }
6290 }
6291 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6292 // There can be at most one literal operand, but it can be repeated.
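// For example, folding a second, different 32-bit literal into a SALU
// instruction that already carries one would require two literal slots, so it
// is reported as illegal below, while re-using the identical literal value
// (Op.isIdenticalTo(*MO)) is accepted.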
6293 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6294 if (i == OpIdx)
6295 continue;
6296 const MachineOperand &Op = MI.getOperand(i);
6297 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6298 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6299 !Op.isIdenticalTo(*MO))
6300 return false;
6301
6302 // Do not fold a non-inlineable and non-register operand into an
6303 // instruction that already has a frame index. The frame index handling
6304 // code could not handle well when a frame index co-exists with another
6305 // non-register operand, unless that operand is an inlineable immediate.
6306 if (Op.isFI())
6307 return false;
6308 }
6309 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6310 isF16PseudoScalarTrans(MI.getOpcode())) {
6311 return false;
6312 }
6313
6314 if (MO->isReg()) {
6315 if (!DefinedRC)
6316 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6317 return isLegalRegOperand(MI, OpIdx, *MO);
6318 }
6319
6320 if (MO->isImm()) {
6321 uint64_t Imm = MO->getImm();
6322 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6323 bool Is64BitOp = Is64BitFPOp ||
6324 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6325 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6326 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6327 if (Is64BitOp &&
6328 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6329 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6330 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6331 return false;
6332
6333 // FIXME: We can use sign extended 64-bit literals, but only for signed
6334 // operands. At the moment we do not know if an operand is signed.
6335 // Such operand will be encoded as its low 32 bits and then either
6336 // correctly sign extended or incorrectly zero extended by HW.
6337 // If 64-bit literals are supported and the literal will be encoded
6338 // as full 64 bit we still can use it.
6339 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6340 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6341 return false;
6342 }
6343 }
6344
6345 // Handle non-register types that are treated like immediates.
6346 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6347
6348 if (!DefinedRC) {
6349 // This operand expects an immediate.
6350 return true;
6351 }
6352
6353 return isImmOperandLegal(MI, OpIdx, *MO);
6354}
6355
6356 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6357 MachineInstr &MI) const {
6358 unsigned Opc = MI.getOpcode();
6359 const MCInstrDesc &InstrDesc = get(Opc);
6360
6361 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6362 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6363
6364 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6365 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6366
6367 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6368 // we need to only have one constant bus use before GFX10.
6369 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6370 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6371 RI.isSGPRReg(MRI, Src0.getReg()))
6372 legalizeOpWithMove(MI, Src0Idx);
6373
6374 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6375 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6376 // src0/src1 with V_READFIRSTLANE.
6377 if (Opc == AMDGPU::V_WRITELANE_B32) {
6378 const DebugLoc &DL = MI.getDebugLoc();
6379 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6380 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6381 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6382 .add(Src0);
6383 Src0.ChangeToRegister(Reg, false);
6384 }
6385 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6386 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6387 const DebugLoc &DL = MI.getDebugLoc();
6388 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6389 .add(Src1);
6390 Src1.ChangeToRegister(Reg, false);
6391 }
6392 return;
6393 }
6394
6395 // No VOP2 instructions support AGPRs.
6396 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
6397 legalizeOpWithMove(MI, Src0Idx);
6398
6399 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
6400 legalizeOpWithMove(MI, Src1Idx);
6401
6402 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6403 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6404 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6405 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6406 legalizeOpWithMove(MI, Src2Idx);
6407 }
6408
6409 // VOP2 instructions accept any operand type in src0, so we don't need to check
6410 // its legality. If src1 is already legal, we don't need to do anything.
6411 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6412 return;
6413
6414 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6415 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6416 // select is uniform.
6417 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6418 RI.isVGPR(MRI, Src1.getReg())) {
6419 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6420 const DebugLoc &DL = MI.getDebugLoc();
6421 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6422 .add(Src1);
6423 Src1.ChangeToRegister(Reg, false);
6424 return;
6425 }
6426
6427 // We do not use commuteInstruction here because it is too aggressive and will
6428 // commute if it is possible. We only want to commute here if it improves
6429 // legality. This can be called a fairly large number of times so don't waste
6430 // compile time pointlessly swapping and checking legality again.
6431 if (HasImplicitSGPR || !MI.isCommutable()) {
6432 legalizeOpWithMove(MI, Src1Idx);
6433 return;
6434 }
6435
6436 // If src0 can be used as src1, commuting will make the operands legal.
6437 // Otherwise we have to give up and insert a move.
6438 //
6439 // TODO: Other immediate-like operand kinds could be commuted if there was a
6440 // MachineOperand::ChangeTo* for them.
6441 if ((!Src1.isImm() && !Src1.isReg()) ||
6442 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6443 legalizeOpWithMove(MI, Src1Idx);
6444 return;
6445 }
6446
6447 int CommutedOpc = commuteOpcode(MI);
6448 if (CommutedOpc == -1) {
6449 legalizeOpWithMove(MI, Src1Idx);
6450 return;
6451 }
6452
6453 MI.setDesc(get(CommutedOpc));
6454
6455 Register Src0Reg = Src0.getReg();
6456 unsigned Src0SubReg = Src0.getSubReg();
6457 bool Src0Kill = Src0.isKill();
6458
6459 if (Src1.isImm())
6460 Src0.ChangeToImmediate(Src1.getImm());
6461 else if (Src1.isReg()) {
6462 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6463 Src0.setSubReg(Src1.getSubReg());
6464 } else
6465 llvm_unreachable("Should only have register or immediate operands");
6466
6467 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6468 Src1.setSubReg(Src0SubReg);
6469 fixImplicitOperands(MI);
6470 }
6471
6472 // Legalize VOP3 operands. All operand types are supported for any operand,
6473 // but only one literal constant may be used, and only starting from GFX10.
6474 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6475 MachineInstr &MI) const {
6476 unsigned Opc = MI.getOpcode();
6477
6478 int VOP3Idx[3] = {
6479 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6480 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6481 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6482 };
6483
6484 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6485 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6486 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6487 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6488 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6489 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6490 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6491 // src1 and src2 must be scalar
6492 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6493 const DebugLoc &DL = MI.getDebugLoc();
6494 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6495 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6496 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6497 .add(Src1);
6498 Src1.ChangeToRegister(Reg, false);
6499 }
6500 if (VOP3Idx[2] != -1) {
6501 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6502 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6503 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6504 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6505 .add(Src2);
6506 Src2.ChangeToRegister(Reg, false);
6507 }
6508 }
6509 }
6510
6511 // Find the one SGPR operand we are allowed to use.
6512 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6513 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6514 SmallDenseSet<unsigned> SGPRsUsed;
6515 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6516 if (SGPRReg) {
6517 SGPRsUsed.insert(SGPRReg);
6518 --ConstantBusLimit;
6519 }
6520
6521 for (int Idx : VOP3Idx) {
6522 if (Idx == -1)
6523 break;
6524 MachineOperand &MO = MI.getOperand(Idx);
6525
6526 if (!MO.isReg()) {
6527 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6528 continue;
6529
6530 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6531 --LiteralLimit;
6532 --ConstantBusLimit;
6533 continue;
6534 }
6535
6536 --LiteralLimit;
6537 --ConstantBusLimit;
6538 legalizeOpWithMove(MI, Idx);
6539 continue;
6540 }
6541
6542 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
6543 !isOperandLegal(MI, Idx, &MO)) {
6544 legalizeOpWithMove(MI, Idx);
6545 continue;
6546 }
6547
6548 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6549 continue; // VGPRs are legal
6550
6551 // We can use one SGPR in each VOP3 instruction prior to GFX10
6552 // and two starting from GFX10.
6553 if (SGPRsUsed.count(MO.getReg()))
6554 continue;
6555 if (ConstantBusLimit > 0) {
6556 SGPRsUsed.insert(MO.getReg());
6557 --ConstantBusLimit;
6558 continue;
6559 }
6560
6561 // If we make it this far, then the operand is not legal and we must
6562 // legalize it.
6563 legalizeOpWithMove(MI, Idx);
6564 }
6565
6566 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6567 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6568 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6569 legalizeOpWithMove(MI, VOP3Idx[2]);
6570
6571 // Fix the register class of packed FP32 instructions on gfx12+. See
6572 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6573 if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
6574 for (unsigned I = 0; I < 3; ++I) {
6575 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
6576 legalizeOpWithMove(MI, VOP3Idx[I]);
6577 }
6578 }
6579}
6580
6581 Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI,
6582 MachineRegisterInfo &MRI,
6583 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6584 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6585 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6586 if (DstRC)
6587 SRC = RI.getCommonSubClass(SRC, DstRC);
6588
6589 Register DstReg = MRI.createVirtualRegister(SRC);
6590 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6591
6592 if (RI.hasAGPRs(VRC)) {
6593 VRC = RI.getEquivalentVGPRClass(VRC);
6594 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6595 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6596 get(TargetOpcode::COPY), NewSrcReg)
6597 .addReg(SrcReg);
6598 SrcReg = NewSrcReg;
6599 }
6600
6601 if (SubRegs == 1) {
6602 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6603 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6604 .addReg(SrcReg);
6605 return DstReg;
6606 }
6607
6608 SmallVector<Register, 8> SRegs;
6609 for (unsigned i = 0; i < SubRegs; ++i) {
6610 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6611 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6612 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6613 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6614 SRegs.push_back(SGPR);
6615 }
6616
6617 MachineInstrBuilder MIB =
6618 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6619 get(AMDGPU::REG_SEQUENCE), DstReg);
6620 for (unsigned i = 0; i < SubRegs; ++i) {
6621 MIB.addReg(SRegs[i]);
6622 MIB.addImm(RI.getSubRegFromChannel(i));
6623 }
6624 return DstReg;
6625}
6626
6627 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6628 MachineInstr &MI) const {
6629
6630 // If the pointer is stored in VGPRs, then we need to move it to
6631 // SGPRs using v_readfirstlane. This is safe because we only select
6632 // loads with uniform pointers to SMRD instruction so we know the
6633 // pointer value is uniform.
6634 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6635 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6636 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6637 SBase->setReg(SGPR);
6638 }
6639 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6640 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6641 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6642 SOff->setReg(SGPR);
6643 }
6644}
6645
6646 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6647 unsigned Opc = Inst.getOpcode();
6648 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6649 if (OldSAddrIdx < 0)
6650 return false;
6651
6652 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6653
6654 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6655 if (NewOpc < 0)
6656 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6657 if (NewOpc < 0)
6658 return false;
6659
6659
6660 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6661 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6662 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6663 return false;
6664
6665 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6666 if (NewVAddrIdx < 0)
6667 return false;
6668
6669 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6670
6671 // Check vaddr, it shall be zero or absent.
6672 MachineInstr *VAddrDef = nullptr;
6673 if (OldVAddrIdx >= 0) {
6674 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6675 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6676 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6677 !VAddrDef->getOperand(1).isImm() ||
6678 VAddrDef->getOperand(1).getImm() != 0)
6679 return false;
6680 }
6681
6682 const MCInstrDesc &NewDesc = get(NewOpc);
6683 Inst.setDesc(NewDesc);
6684
6685 // Callers expect iterator to be valid after this call, so modify the
6686 // instruction in place.
6687 if (OldVAddrIdx == NewVAddrIdx) {
6688 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6689 // Clear use list from the old vaddr holding a zero register.
6690 MRI.removeRegOperandFromUseList(&NewVAddr);
6691 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6692 Inst.removeOperand(OldSAddrIdx);
6693 // Update the use list with the pointer we have just moved from vaddr to
6694 // saddr position. Otherwise new vaddr will be missing from the use list.
6695 MRI.removeRegOperandFromUseList(&NewVAddr);
6696 MRI.addRegOperandToUseList(&NewVAddr);
6697 } else {
6698 assert(OldSAddrIdx == NewVAddrIdx);
6699
6700 if (OldVAddrIdx >= 0) {
6701 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6702 AMDGPU::OpName::vdst_in);
6703
6704 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6705 // it asserts. Untie the operands for now and retie them afterwards.
6706 if (NewVDstIn != -1) {
6707 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6708 Inst.untieRegOperand(OldVDstIn);
6709 }
6710
6711 Inst.removeOperand(OldVAddrIdx);
6712
6713 if (NewVDstIn != -1) {
6714 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6715 Inst.tieOperands(NewVDst, NewVDstIn);
6716 }
6717 }
6718 }
6719
6720 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6721 VAddrDef->eraseFromParent();
6722
6723 return true;
6724}
6725
6726// FIXME: Remove this when SelectionDAG is obsoleted.
6727 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6728 MachineInstr &MI) const {
6729 if (!isSegmentSpecificFLAT(MI) && !(isFLAT(MI) && ST.hasFlatGVSMode()))
6730 return;
6731
6732 // Fix up SGPR operands in VGPRs. We only select these when the DAG divergence
6733 // analysis thinks they are uniform, so a readfirstlane should be valid.
6734 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6735 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6736 return;
6737
6738 if (moveFlatAddrToVGPR(MI))
6739 return;
6740
6741 const TargetRegisterClass *DeclaredRC = getRegClass(
6742 MI.getDesc(), SAddr->getOperandNo(), &RI, *MI.getParent()->getParent());
6743
6744 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6745 SAddr->setReg(ToSGPR);
6746}
6747
6748 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6749 MachineBasicBlock::iterator I,
6750 const TargetRegisterClass *DstRC,
6751 MachineOperand &Op,
6752 MachineRegisterInfo &MRI,
6753 const DebugLoc &DL) const {
6754 Register OpReg = Op.getReg();
6755 unsigned OpSubReg = Op.getSubReg();
6756
6757 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6758 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6759
6760 // Check if operand is already the correct register class.
6761 if (DstRC == OpRC)
6762 return;
6763
6764 Register DstReg = MRI.createVirtualRegister(DstRC);
6765 auto Copy =
6766 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6767 Op.setReg(DstReg);
6768
6769 MachineInstr *Def = MRI.getVRegDef(OpReg);
6770 if (!Def)
6771 return;
6772
6773 // Try to eliminate the copy if it is copying an immediate value.
6774 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6775 foldImmediate(*Copy, *Def, OpReg, &MRI);
6776
6777 bool ImpDef = Def->isImplicitDef();
6778 while (!ImpDef && Def && Def->isCopy()) {
6779 if (Def->getOperand(1).getReg().isPhysical())
6780 break;
6781 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6782 ImpDef = Def && Def->isImplicitDef();
6783 }
6784 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6785 !ImpDef)
6786 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6787}
6788
6789// Emit the actual waterfall loop, executing the wrapped instruction for each
6790// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6791// iteration, in the worst case we execute 64 (once per lane).
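// Illustrative sketch of what one loop iteration looks like (wave64 flavour);
// comparisons for multiple scalar operands are AND-combined:
//   v_readfirstlane_b32 sX, vY            ; take the value from one active lane
//   v_cmp_eq_*          sCond, sX, vY     ; which lanes hold that same value
//   s_and_saveexec_b64  sSave, sCond      ; restrict exec to those lanes
//   <the wrapped instruction, now with a uniform scalar operand>
//   s_xor_b64           exec, exec, sSave ; retire the lanes just handled
//   SI_WATERFALL_LOOP   LoopBB            ; loop while any lane remains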
6792static void
6795 MachineBasicBlock &LoopBB,
6796 MachineBasicBlock &BodyBB,
6797 const DebugLoc &DL,
6798 ArrayRef<MachineOperand *> ScalarOps) {
6799 MachineFunction &MF = *LoopBB.getParent();
6800 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6801 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6802 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6803 unsigned SaveExecOpc =
6804 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
6805 unsigned XorTermOpc =
6806 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
6807 unsigned AndOpc =
6808 ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6809 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6810
6812 Register CondReg;
6813
6814 for (MachineOperand *ScalarOp : ScalarOps) {
6815 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6816 unsigned NumSubRegs = RegSize / 32;
6817 Register VScalarOp = ScalarOp->getReg();
6818
6819 if (NumSubRegs == 1) {
6820 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6821
6822 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6823 .addReg(VScalarOp);
6824
6825 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6826
6827 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6828 .addReg(CurReg)
6829 .addReg(VScalarOp);
6830
6831 // Combine the comparison results with AND.
6832 if (!CondReg) // First.
6833 CondReg = NewCondReg;
6834 else { // If not the first, we create an AND.
6835 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6836 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6837 .addReg(CondReg)
6838 .addReg(NewCondReg);
6839 CondReg = AndReg;
6840 }
6841
6842 // Update ScalarOp operand to use the SGPR ScalarOp.
6843 ScalarOp->setReg(CurReg);
6844 ScalarOp->setIsKill();
6845 } else {
6846 SmallVector<Register, 8> ReadlanePieces;
6847 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6848 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6849 "Unhandled register size");
6850
6851 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6852 Register CurRegLo =
6853 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6854 Register CurRegHi =
6855 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6856
6857 // Read the next variant <- also loop target.
6858 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6859 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6860
6861 // Read the next variant <- also loop target.
6862 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6863 .addReg(VScalarOp, VScalarOpUndef,
6864 TRI->getSubRegFromChannel(Idx + 1));
6865
6866 ReadlanePieces.push_back(CurRegLo);
6867 ReadlanePieces.push_back(CurRegHi);
6868
6869 // Comparison is to be done as 64-bit.
6870 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6871 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6872 .addReg(CurRegLo)
6873 .addImm(AMDGPU::sub0)
6874 .addReg(CurRegHi)
6875 .addImm(AMDGPU::sub1);
6876
6877 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6878 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6879 NewCondReg)
6880 .addReg(CurReg);
6881 if (NumSubRegs <= 2)
6882 Cmp.addReg(VScalarOp);
6883 else
6884 Cmp.addReg(VScalarOp, VScalarOpUndef,
6885 TRI->getSubRegFromChannel(Idx, 2));
6886
6887 // Combine the comparison results with AND.
6888 if (!CondReg) // First.
6889 CondReg = NewCondReg;
6890 else { // If not the first, we create an AND.
6891 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6892 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6893 .addReg(CondReg)
6894 .addReg(NewCondReg);
6895 CondReg = AndReg;
6896 }
6897 } // End for loop.
6898
6899 const auto *SScalarOpRC =
6900 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6901 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6902
6903 // Build scalar ScalarOp.
6904 auto Merge =
6905 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6906 unsigned Channel = 0;
6907 for (Register Piece : ReadlanePieces) {
6908 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6909 }
6910
6911 // Update ScalarOp operand to use the SGPR ScalarOp.
6912 ScalarOp->setReg(SScalarOp);
6913 ScalarOp->setIsKill();
6914 }
6915 }
6916
6917 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6918 MRI.setSimpleHint(SaveExec, CondReg);
6919
6920 // Update EXEC to matching lanes, saving original to SaveExec.
6921 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
6922 .addReg(CondReg, RegState::Kill);
6923
6924 // The original instruction is here; we insert the terminators after it.
6925 I = BodyBB.end();
6926
6927 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6928 BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
6929 .addReg(Exec)
6930 .addReg(SaveExec);
6931
6932 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6933}
6934
6935// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6936// with SGPRs by iterating over all unique values across all lanes.
6937// Returns the loop basic block that now contains \p MI.
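// In sketch form, the block containing \p MI is split so the CFG becomes
// (names match the local variables created below):
//
//   MBB --> LoopBB <--+
//             |       |
//             v       |
//           BodyBB ---+    (BodyBB contains MI; SI_WATERFALL_LOOP branches back
//             |             to LoopBB until every unique value has been handled)
//             v
//        RemainderBB       (rest of the original block; EXEC and SCC restored here)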
6938static MachineBasicBlock *
6939loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
6940 ArrayRef<MachineOperand *> ScalarOps,
6941 MachineDominatorTree *MDT,
6942 MachineBasicBlock::iterator Begin = nullptr,
6943 MachineBasicBlock::iterator End = nullptr) {
6944 MachineBasicBlock &MBB = *MI.getParent();
6945 MachineFunction &MF = *MBB.getParent();
6946 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6947 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6948 MachineRegisterInfo &MRI = MF.getRegInfo();
6949 if (!Begin.isValid())
6950 Begin = &MI;
6951 if (!End.isValid()) {
6952 End = &MI;
6953 ++End;
6954 }
6955 const DebugLoc &DL = MI.getDebugLoc();
6956 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6957 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6958 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6959
6960 // Save SCC. The waterfall loop may overwrite SCC.
6961 Register SaveSCCReg;
6962
6963 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
6964 // rather than doing an unbounded liveness scan here.
6965 bool SCCNotDead =
6966 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
6967 std::numeric_limits<unsigned>::max()) !=
6968 MachineBasicBlock::LQR_Dead;
6969 if (SCCNotDead) {
6970 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6971 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
6972 .addImm(1)
6973 .addImm(0);
6974 }
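// Note: S_CSELECT_B32 1, 0 materializes the current SCC value into SaveSCCReg;
// the matching S_CMP_LG_U32 against 0 in RemainderBB (below) regenerates SCC
// from that saved value once the loop is done.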
6975
6976 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6977
6978 // Save the EXEC mask
6979 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
6980
6981 // Killed uses in the instruction we are waterfalling around will be
6982 // incorrect due to the added control-flow.
6983 MachineBasicBlock::iterator AfterMI = MI;
6984 ++AfterMI;
6985 for (auto I = Begin; I != AfterMI; I++) {
6986 for (auto &MO : I->all_uses())
6987 MRI.clearKillFlags(MO.getReg());
6988 }
6989
6990 // To insert the loop we need to split the block. Move everything after this
6991 // point to a new block, and insert a new empty block between the two.
6992 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
6993 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
6994 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
6995 MachineFunction::iterator MBBI(MBB);
6996 ++MBBI;
6997
6998 MF.insert(MBBI, LoopBB);
6999 MF.insert(MBBI, BodyBB);
7000 MF.insert(MBBI, RemainderBB);
7001
7002 LoopBB->addSuccessor(BodyBB);
7003 BodyBB->addSuccessor(LoopBB);
7004 BodyBB->addSuccessor(RemainderBB);
7005
7006 // Move the instructions from Begin up to End (by default, just past MI) into
7007 // BodyBB, and the remainder of the block into RemainderBB.
7008 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7009 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7010 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7011
7012 MBB.addSuccessor(LoopBB);
7013
7014 // Update dominators. We know that MBB immediately dominates LoopBB, that
7015 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7016 // RemainderBB. RemainderBB immediately dominates all of the successors
7017 // transferred to it from MBB that MBB used to properly dominate.
7018 if (MDT) {
7019 MDT->addNewBlock(LoopBB, &MBB);
7020 MDT->addNewBlock(BodyBB, LoopBB);
7021 MDT->addNewBlock(RemainderBB, BodyBB);
7022 for (auto &Succ : RemainderBB->successors()) {
7023 if (MDT->properlyDominates(&MBB, Succ)) {
7024 MDT->changeImmediateDominator(Succ, RemainderBB);
7025 }
7026 }
7027 }
7028
7029 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
7030
7031 MachineBasicBlock::iterator First = RemainderBB->begin();
7032 // Restore SCC
7033 if (SCCNotDead) {
7034 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7035 .addReg(SaveSCCReg, RegState::Kill)
7036 .addImm(0);
7037 }
7038
7039 // Restore the EXEC mask
7040 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
7041 return BodyBB;
7042}
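// Callers typically hand in the offending scalar operands (e.g. srsrc and/or
// soffset) and continue working in the returned BodyBB, which now contains MI.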
7043
7044// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
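// The returned replacement descriptor has a zero base address and only the
// default data-format bits set, i.e. roughly:
//   NewSRsrc = { 0, 0, Lo_32(RSRC_DATA_FORMAT), Hi_32(RSRC_DATA_FORMAT) }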
7045static std::tuple<unsigned, unsigned>
7046extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
7047 MachineBasicBlock &MBB = *MI.getParent();
7048 MachineFunction &MF = *MBB.getParent();
7049 MachineRegisterInfo &MRI = MF.getRegInfo();
7050
7051 // Extract the ptr from the resource descriptor.
7052 unsigned RsrcPtr =
7053 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7054 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7055
7056 // Create an empty resource descriptor
7057 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7058 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7059 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7060 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7061 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7062
7063 // Zero64 = 0
7064 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7065 .addImm(0);
7066
7067 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7068 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7069 .addImm(Lo_32(RsrcDataFormat));
7070
7071 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7072 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7073 .addImm(Hi_32(RsrcDataFormat));
7074
7075 // NewSRsrc = {Zero64, SRsrcFormat}
7076 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7077 .addReg(Zero64)
7078 .addImm(AMDGPU::sub0_sub1)
7079 .addReg(SRsrcFormatLo)
7080 .addImm(AMDGPU::sub2)
7081 .addReg(SRsrcFormatHi)
7082 .addImm(AMDGPU::sub3);
7083
7084 return std::tuple(RsrcPtr, NewSRsrc);
7085}
7086
7087MachineBasicBlock *
7088SIInstrInfo::legalizeOperands(MachineInstr &MI,
7089 MachineDominatorTree *MDT) const {
7090 MachineFunction &MF = *MI.getParent()->getParent();
7091 MachineRegisterInfo &MRI = MF.getRegInfo();
7092 MachineBasicBlock *CreatedBB = nullptr;
7093
7094 // Legalize VOP2
7095 if (isVOP2(MI) || isVOPC(MI)) {
7096 legalizeOperandsVOP2(MRI, MI);
7097 return CreatedBB;
7098 }
7099
7100 // Legalize VOP3
7101 if (isVOP3(MI)) {
7102 legalizeOperandsVOP3(MRI, MI);
7103 return CreatedBB;
7104 }
7105
7106 // Legalize SMRD
7107 if (isSMRD(MI)) {
7108 legalizeOperandsSMRD(MRI, MI);
7109 return CreatedBB;
7110 }
7111
7112 // Legalize FLAT
7113 if (isFLAT(MI)) {
7114 legalizeOperandsFLAT(MRI, MI);
7115 return CreatedBB;
7116 }
7117
7118 // Legalize REG_SEQUENCE and PHI
7119 // The register class of the operands must be the same type as the register
7120 // class of the output.
7121 if (MI.getOpcode() == AMDGPU::PHI) {
7122 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
7123 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
7124 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
7125 continue;
7126 const TargetRegisterClass *OpRC =
7127 MRI.getRegClass(MI.getOperand(i).getReg());
7128 if (RI.hasVectorRegisters(OpRC)) {
7129 VRC = OpRC;
7130 } else {
7131 SRC = OpRC;
7132 }
7133 }
7134
7135 // If any of the operands are VGPR registers, then they all must be;
7136 // otherwise we will create illegal VGPR->SGPR copies when legalizing
7137 // them.
7138 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
7139 if (!VRC) {
7140 assert(SRC);
7141 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
7142 VRC = &AMDGPU::VReg_1RegClass;
7143 } else
7144 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7145 ? RI.getEquivalentAGPRClass(SRC)
7146 : RI.getEquivalentVGPRClass(SRC);
7147 } else {
7148 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7149 ? RI.getEquivalentAGPRClass(VRC)
7150 : RI.getEquivalentVGPRClass(VRC);
7151 }
7152 RC = VRC;
7153 } else {
7154 RC = SRC;
7155 }
7156
7157 // Update all the operands so they have the same type.
7158 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7159 MachineOperand &Op = MI.getOperand(I);
7160 if (!Op.isReg() || !Op.getReg().isVirtual())
7161 continue;
7162
7163 // MI is a PHI instruction.
7164 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7165 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
7166
7167 // Avoid creating no-op copies with the same src and dst reg class. These
7168 // confuse some of the machine passes.
7169 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
7170 }
7171 }
7172
7173 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7174 // VGPR dest type and SGPR sources, insert copies so all operands are
7175 // VGPRs. This seems to help operand folding / the register coalescer.
7176 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7177 MachineBasicBlock *MBB = MI.getParent();
7178 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7179 if (RI.hasVGPRs(DstRC)) {
7180 // Update all the operands so they are VGPR register classes. These may
7181 // not be the same register class because REG_SEQUENCE supports mixing
7182 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7183 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7184 MachineOperand &Op = MI.getOperand(I);
7185 if (!Op.isReg() || !Op.getReg().isVirtual())
7186 continue;
7187
7188 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7189 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7190 if (VRC == OpRC)
7191 continue;
7192
7193 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7194 Op.setIsKill();
7195 }
7196 }
7197
7198 return CreatedBB;
7199 }
7200
7201 // Legalize INSERT_SUBREG
7202 // src0 must have the same register class as dst
7203 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7204 Register Dst = MI.getOperand(0).getReg();
7205 Register Src0 = MI.getOperand(1).getReg();
7206 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7207 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7208 if (DstRC != Src0RC) {
7209 MachineBasicBlock *MBB = MI.getParent();
7210 MachineOperand &Op = MI.getOperand(1);
7211 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7212 }
7213 return CreatedBB;
7214 }
7215
7216 // Legalize SI_INIT_M0
7217 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7218 MachineOperand &Src = MI.getOperand(0);
7219 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7220 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7221 return CreatedBB;
7222 }
7223
7224 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7225 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7226 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7227 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7228 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7229 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7230 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7231 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7232 MachineOperand &Src = MI.getOperand(1);
7233 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7234 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7235 return CreatedBB;
7236 }
7237
7238 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7239 //
7240 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7241 // scratch memory access. In both cases, the legalization never involves
7242 // conversion to the addr64 form.
7243 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
7244 (isMUBUF(MI) || isMTBUF(MI)))) {
7245 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7246 ? AMDGPU::OpName::rsrc
7247 : AMDGPU::OpName::srsrc;
7248 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7249 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7250 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7251
7252 AMDGPU::OpName SampOpName =
7253 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7254 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7255 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7256 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7257
7258 return CreatedBB;
7259 }
7260
7261 // Legalize SI_CALL
7262 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7263 MachineOperand *Dest = &MI.getOperand(0);
7264 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7265 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, along with
7266 // the following copies, into the loop block; copies from and to physical
7267 // registers need to be moved as well.
7268 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7269 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7270
7271 // Also move the copies to physical registers into the loop block
7272 MachineBasicBlock &MBB = *MI.getParent();
7273 MachineBasicBlock::iterator Start(&MI);
7274 while (Start->getOpcode() != FrameSetupOpcode)
7275 --Start;
7276 MachineBasicBlock::iterator End(&MI);
7277 while (End->getOpcode() != FrameDestroyOpcode)
7278 ++End;
7279 // Also include following copies of the return value
7280 ++End;
7281 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7282 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7283 ++End;
7284 CreatedBB =
7285 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7286 }
7287 }
7288
7289 // Legalize s_sleep_var.
7290 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7291 const DebugLoc &DL = MI.getDebugLoc();
7292 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7293 int Src0Idx =
7294 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7295 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7296 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7297 .add(Src0);
7298 Src0.ChangeToRegister(Reg, false);
7299 return nullptr;
7300 }
7301
7302 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7303 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7304 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7305 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7306 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7307 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7308 for (MachineOperand &Src : MI.explicit_operands()) {
7309 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7310 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7311 }
7312 return CreatedBB;
7313 }
7314
7315 // Legalize MUBUF instructions.
7316 bool isSoffsetLegal = true;
7317 int SoffsetIdx =
7318 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7319 if (SoffsetIdx != -1) {
7320 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7321 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7322 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7323 isSoffsetLegal = false;
7324 }
7325 }
7326
7327 bool isRsrcLegal = true;
7328 int RsrcIdx =
7329 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7330 if (RsrcIdx != -1) {
7331 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7332 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7333 isRsrcLegal = false;
7334 }
7335
7336 // The operands are legal.
7337 if (isRsrcLegal && isSoffsetLegal)
7338 return CreatedBB;
7339
7340 if (!isRsrcLegal) {
7341 // Legalize a VGPR Rsrc
7342 //
7343 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7344 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7345 // a zero-value SRsrc.
7346 //
7347 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7348 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7349 // above.
7350 //
7351 // Otherwise we are on non-ADDR64 hardware, and/or we have
7352 // idxen/offen/bothen and we fall back to a waterfall loop.
7353
7354 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7355 MachineBasicBlock &MBB = *MI.getParent();
7356
7357 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7358 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7359 // This is already an ADDR64 instruction so we need to add the pointer
7360 // extracted from the resource descriptor to the current value of VAddr.
7361 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7362 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7363 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7364
7365 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7366 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7367 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7368
7369 unsigned RsrcPtr, NewSRsrc;
7370 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7371
7372 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7373 const DebugLoc &DL = MI.getDebugLoc();
7374 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7375 .addDef(CondReg0)
7376 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7377 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
7378 .addImm(0);
7379
7380 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7381 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7382 .addDef(CondReg1, RegState::Dead)
7383 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7384 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
7385 .addReg(CondReg0, RegState::Kill)
7386 .addImm(0);
7387
7388 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7389 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7390 .addReg(NewVAddrLo)
7391 .addImm(AMDGPU::sub0)
7392 .addReg(NewVAddrHi)
7393 .addImm(AMDGPU::sub1);
7394
7395 VAddr->setReg(NewVAddr);
7396 Rsrc->setReg(NewSRsrc);
7397 } else if (!VAddr && ST.hasAddr64()) {
7398 // This instruction is the _OFFSET variant, so we need to convert it to
7399 // ADDR64.
7401 "FIXME: Need to emit flat atomics here");
7402
7403 unsigned RsrcPtr, NewSRsrc;
7404 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7405
7406 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7407 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7408 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7409 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7410 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7411
7412 // Atomics with return have an additional tied operand and are
7413 // missing some of the special bits.
7414 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7415 MachineInstr *Addr64;
7416
7417 if (!VDataIn) {
7418 // Regular buffer load / store.
7419 MachineInstrBuilder MIB =
7420 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7421 .add(*VData)
7422 .addReg(NewVAddr)
7423 .addReg(NewSRsrc)
7424 .add(*SOffset)
7425 .add(*Offset);
7426
7427 if (const MachineOperand *CPol =
7428 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7429 MIB.addImm(CPol->getImm());
7430 }
7431
7432 if (const MachineOperand *TFE =
7433 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7434 MIB.addImm(TFE->getImm());
7435 }
7436
7437 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7438
7439 MIB.cloneMemRefs(MI);
7440 Addr64 = MIB;
7441 } else {
7442 // Atomics with return.
7443 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7444 .add(*VData)
7445 .add(*VDataIn)
7446 .addReg(NewVAddr)
7447 .addReg(NewSRsrc)
7448 .add(*SOffset)
7449 .add(*Offset)
7450 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7451 .cloneMemRefs(MI);
7452 }
7453
7454 MI.removeFromParent();
7455
7456 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7457 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7458 NewVAddr)
7459 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7460 .addImm(AMDGPU::sub0)
7461 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7462 .addImm(AMDGPU::sub1);
7463 } else {
7464 // Legalize a VGPR Rsrc and soffset together.
7465 if (!isSoffsetLegal) {
7466 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7467 CreatedBB =
7468 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7469 return CreatedBB;
7470 }
7471 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7472 return CreatedBB;
7473 }
7474 }
7475
7476 // Legalize a VGPR soffset.
7477 if (!isSoffsetLegal) {
7478 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7479 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7480 return CreatedBB;
7481 }
7482 return CreatedBB;
7483}
7484
7485void SIInstrWorklist::insert(MachineInstr *MI) {
7486 InstrList.insert(MI);
7487 // Add MBUF instructions to the deferred list.
7488 int RsrcIdx =
7489 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7490 if (RsrcIdx != -1) {
7491 DeferredList.insert(MI);
7492 }
7493}
7494
7495bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7496 return DeferredList.contains(MI);
7497}
7498
7499// Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7500// lowering (changing sgpr to vgpr).
7501// This is mainly caused by 16-bit SALU and 16-bit VALU using registers of
7502// different sizes. We need to legalize the size of the operands during the
7503// vgpr lowering chain. This can be removed once sgpr16 is in place.
7504void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
7505 MachineRegisterInfo &MRI) const {
7506 if (!ST.useRealTrue16Insts())
7507 return;
7508
7509 unsigned Opcode = MI.getOpcode();
7510 MachineBasicBlock *MBB = MI.getParent();
7511 // Legalize operands and check for size mismatch
7512 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7513 OpIdx >= get(Opcode).getNumOperands() ||
7514 get(Opcode).operands()[OpIdx].RegClass == -1)
7515 return;
7516
7517 MachineOperand &Op = MI.getOperand(OpIdx);
7518 if (!Op.isReg() || !Op.getReg().isVirtual())
7519 return;
7520
7521 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7522 if (!RI.isVGPRClass(CurrRC))
7523 return;
7524
7525 unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
7526 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7527 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7528 Op.setSubReg(AMDGPU::lo16);
7529 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7530 const DebugLoc &DL = MI.getDebugLoc();
7531 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7532 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7533 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7534 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7535 .addReg(Op.getReg())
7536 .addImm(AMDGPU::lo16)
7537 .addReg(Undef)
7538 .addImm(AMDGPU::hi16);
7539 Op.setReg(NewDstReg);
7540 }
7541}
7542void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7543 MachineRegisterInfo &MRI) const {
7544 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7545 legalizeOperandsVALUt16(MI, OpIdx, MRI);
7546}
7547
7548void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7549 MachineDominatorTree *MDT) const {
7550
7551 while (!Worklist.empty()) {
7552 MachineInstr &Inst = *Worklist.top();
7553 Worklist.erase_top();
7554 // Skip MachineInstrs that are in the deferred list.
7555 if (Worklist.isDeferred(&Inst))
7556 continue;
7557 moveToVALUImpl(Worklist, MDT, Inst);
7558 }
7559
7560 // The deferred list of instructions will be processed once
7561 // all the MachineInstrs in the worklist have been handled.
7562 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7563 moveToVALUImpl(Worklist, MDT, *Inst);
7564 assert(Worklist.empty() &&
7565 "Deferred MachineInstr are not supposed to re-populate worklist");
7566 }
7567}
7568
7569void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7570 MachineDominatorTree *MDT,
7571 MachineInstr &Inst) const {
7572
7573 MachineBasicBlock *MBB = Inst.getParent();
7574 if (!MBB)
7575 return;
7576 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7577 unsigned Opcode = Inst.getOpcode();
7578 unsigned NewOpcode = getVALUOp(Inst);
7579 // Handle some special cases
7580 switch (Opcode) {
7581 default:
7582 break;
7583 case AMDGPU::S_ADD_I32:
7584 case AMDGPU::S_SUB_I32: {
7585 // FIXME: The u32 versions currently selected use the carry.
7586 bool Changed;
7587 MachineBasicBlock *CreatedBBTmp = nullptr;
7588 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7589 if (Changed)
7590 return;
7591
7592 // Default handling
7593 break;
7594 }
7595
7596 case AMDGPU::S_MUL_U64:
7597 if (ST.hasVectorMulU64()) {
7598 NewOpcode = AMDGPU::V_MUL_U64_e64;
7599 break;
7600 }
7601 // Split s_mul_u64 into 32-bit vector multiplications.
7602 splitScalarSMulU64(Worklist, Inst, MDT);
7603 Inst.eraseFromParent();
7604 return;
7605
7606 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7607 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7608 // This is a special case of s_mul_u64 where all the operands are either
7609 // zero extended or sign extended.
7610 splitScalarSMulPseudo(Worklist, Inst, MDT);
7611 Inst.eraseFromParent();
7612 return;
7613
7614 case AMDGPU::S_AND_B64:
7615 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7616 Inst.eraseFromParent();
7617 return;
7618
7619 case AMDGPU::S_OR_B64:
7620 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7621 Inst.eraseFromParent();
7622 return;
7623
7624 case AMDGPU::S_XOR_B64:
7625 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7626 Inst.eraseFromParent();
7627 return;
7628
7629 case AMDGPU::S_NAND_B64:
7630 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7631 Inst.eraseFromParent();
7632 return;
7633
7634 case AMDGPU::S_NOR_B64:
7635 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7636 Inst.eraseFromParent();
7637 return;
7638
7639 case AMDGPU::S_XNOR_B64:
7640 if (ST.hasDLInsts())
7641 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7642 else
7643 splitScalar64BitXnor(Worklist, Inst, MDT);
7644 Inst.eraseFromParent();
7645 return;
7646
7647 case AMDGPU::S_ANDN2_B64:
7648 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7649 Inst.eraseFromParent();
7650 return;
7651
7652 case AMDGPU::S_ORN2_B64:
7653 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7654 Inst.eraseFromParent();
7655 return;
7656
7657 case AMDGPU::S_BREV_B64:
7658 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7659 Inst.eraseFromParent();
7660 return;
7661
7662 case AMDGPU::S_NOT_B64:
7663 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7664 Inst.eraseFromParent();
7665 return;
7666
7667 case AMDGPU::S_BCNT1_I32_B64:
7668 splitScalar64BitBCNT(Worklist, Inst);
7669 Inst.eraseFromParent();
7670 return;
7671
7672 case AMDGPU::S_BFE_I64:
7673 splitScalar64BitBFE(Worklist, Inst);
7674 Inst.eraseFromParent();
7675 return;
7676
7677 case AMDGPU::S_FLBIT_I32_B64:
7678 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7679 Inst.eraseFromParent();
7680 return;
7681 case AMDGPU::S_FF1_I32_B64:
7682 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7683 Inst.eraseFromParent();
7684 return;
7685
7686 case AMDGPU::S_LSHL_B32:
7687 if (ST.hasOnlyRevVALUShifts()) {
7688 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7689 swapOperands(Inst);
7690 }
7691 break;
7692 case AMDGPU::S_ASHR_I32:
7693 if (ST.hasOnlyRevVALUShifts()) {
7694 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7695 swapOperands(Inst);
7696 }
7697 break;
7698 case AMDGPU::S_LSHR_B32:
7699 if (ST.hasOnlyRevVALUShifts()) {
7700 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7701 swapOperands(Inst);
7702 }
7703 break;
7704 case AMDGPU::S_LSHL_B64:
7705 if (ST.hasOnlyRevVALUShifts()) {
7706 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7707 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7708 : AMDGPU::V_LSHLREV_B64_e64;
7709 swapOperands(Inst);
7710 }
7711 break;
7712 case AMDGPU::S_ASHR_I64:
7713 if (ST.hasOnlyRevVALUShifts()) {
7714 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7715 swapOperands(Inst);
7716 }
7717 break;
7718 case AMDGPU::S_LSHR_B64:
7719 if (ST.hasOnlyRevVALUShifts()) {
7720 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7721 swapOperands(Inst);
7722 }
7723 break;
7724
7725 case AMDGPU::S_ABS_I32:
7726 lowerScalarAbs(Worklist, Inst);
7727 Inst.eraseFromParent();
7728 return;
7729
7730 case AMDGPU::S_CBRANCH_SCC0:
7731 case AMDGPU::S_CBRANCH_SCC1: {
7732 // Clear unused bits of vcc
7733 Register CondReg = Inst.getOperand(1).getReg();
7734 bool IsSCC = CondReg == AMDGPU::SCC;
7735 Register VCC = RI.getVCC();
7736 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
7737 unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
7738 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
7739 .addReg(EXEC)
7740 .addReg(IsSCC ? VCC : CondReg);
7741 Inst.removeOperand(1);
7742 } break;
7743
7744 case AMDGPU::S_BFE_U64:
7745 case AMDGPU::S_BFM_B64:
7746 llvm_unreachable("Moving this op to VALU not implemented");
7747
7748 case AMDGPU::S_PACK_LL_B32_B16:
7749 case AMDGPU::S_PACK_LH_B32_B16:
7750 case AMDGPU::S_PACK_HL_B32_B16:
7751 case AMDGPU::S_PACK_HH_B32_B16:
7752 movePackToVALU(Worklist, MRI, Inst);
7753 Inst.eraseFromParent();
7754 return;
7755
7756 case AMDGPU::S_XNOR_B32:
7757 lowerScalarXnor(Worklist, Inst);
7758 Inst.eraseFromParent();
7759 return;
7760
7761 case AMDGPU::S_NAND_B32:
7762 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7763 Inst.eraseFromParent();
7764 return;
7765
7766 case AMDGPU::S_NOR_B32:
7767 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7768 Inst.eraseFromParent();
7769 return;
7770
7771 case AMDGPU::S_ANDN2_B32:
7772 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7773 Inst.eraseFromParent();
7774 return;
7775
7776 case AMDGPU::S_ORN2_B32:
7777 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7778 Inst.eraseFromParent();
7779 return;
7780
7781 // TODO: remove as soon as everything is ready
7782 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7783 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7784 // can only be selected from the uniform SDNode.
7785 case AMDGPU::S_ADD_CO_PSEUDO:
7786 case AMDGPU::S_SUB_CO_PSEUDO: {
7787 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7788 ? AMDGPU::V_ADDC_U32_e64
7789 : AMDGPU::V_SUBB_U32_e64;
7790 const auto *CarryRC = RI.getWaveMaskRegClass();
7791
7792 Register CarryInReg = Inst.getOperand(4).getReg();
7793 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7794 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7795 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7796 .addReg(CarryInReg);
7797 }
7798
7799 Register CarryOutReg = Inst.getOperand(1).getReg();
7800
7801 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7802 MRI.getRegClass(Inst.getOperand(0).getReg())));
7803 MachineInstr *CarryOp =
7804 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7805 .addReg(CarryOutReg, RegState::Define)
7806 .add(Inst.getOperand(2))
7807 .add(Inst.getOperand(3))
7808 .addReg(CarryInReg)
7809 .addImm(0);
7810 legalizeOperands(*CarryOp);
7811 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7812 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7813 Inst.eraseFromParent();
7814 }
7815 return;
7816 case AMDGPU::S_UADDO_PSEUDO:
7817 case AMDGPU::S_USUBO_PSEUDO: {
7818 const DebugLoc &DL = Inst.getDebugLoc();
7819 MachineOperand &Dest0 = Inst.getOperand(0);
7820 MachineOperand &Dest1 = Inst.getOperand(1);
7821 MachineOperand &Src0 = Inst.getOperand(2);
7822 MachineOperand &Src1 = Inst.getOperand(3);
7823
7824 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7825 ? AMDGPU::V_ADD_CO_U32_e64
7826 : AMDGPU::V_SUB_CO_U32_e64;
7827 const TargetRegisterClass *NewRC =
7828 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7829 Register DestReg = MRI.createVirtualRegister(NewRC);
7830 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7831 .addReg(Dest1.getReg(), RegState::Define)
7832 .add(Src0)
7833 .add(Src1)
7834 .addImm(0); // clamp bit
7835
7836 legalizeOperands(*NewInstr, MDT);
7837 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7838 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7839 Worklist);
7840 Inst.eraseFromParent();
7841 }
7842 return;
7843
7844 case AMDGPU::S_CSELECT_B32:
7845 case AMDGPU::S_CSELECT_B64:
7846 lowerSelect(Worklist, Inst, MDT);
7847 Inst.eraseFromParent();
7848 return;
7849 case AMDGPU::S_CMP_EQ_I32:
7850 case AMDGPU::S_CMP_LG_I32:
7851 case AMDGPU::S_CMP_GT_I32:
7852 case AMDGPU::S_CMP_GE_I32:
7853 case AMDGPU::S_CMP_LT_I32:
7854 case AMDGPU::S_CMP_LE_I32:
7855 case AMDGPU::S_CMP_EQ_U32:
7856 case AMDGPU::S_CMP_LG_U32:
7857 case AMDGPU::S_CMP_GT_U32:
7858 case AMDGPU::S_CMP_GE_U32:
7859 case AMDGPU::S_CMP_LT_U32:
7860 case AMDGPU::S_CMP_LE_U32:
7861 case AMDGPU::S_CMP_EQ_U64:
7862 case AMDGPU::S_CMP_LG_U64:
7863 case AMDGPU::S_CMP_LT_F32:
7864 case AMDGPU::S_CMP_EQ_F32:
7865 case AMDGPU::S_CMP_LE_F32:
7866 case AMDGPU::S_CMP_GT_F32:
7867 case AMDGPU::S_CMP_LG_F32:
7868 case AMDGPU::S_CMP_GE_F32:
7869 case AMDGPU::S_CMP_O_F32:
7870 case AMDGPU::S_CMP_U_F32:
7871 case AMDGPU::S_CMP_NGE_F32:
7872 case AMDGPU::S_CMP_NLG_F32:
7873 case AMDGPU::S_CMP_NGT_F32:
7874 case AMDGPU::S_CMP_NLE_F32:
7875 case AMDGPU::S_CMP_NEQ_F32:
7876 case AMDGPU::S_CMP_NLT_F32: {
7877 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7878 auto NewInstr =
7879 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7880 .setMIFlags(Inst.getFlags());
7881 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
7882 0) {
7883 NewInstr
7884 .addImm(0) // src0_modifiers
7885 .add(Inst.getOperand(0)) // src0
7886 .addImm(0) // src1_modifiers
7887 .add(Inst.getOperand(1)) // src1
7888 .addImm(0); // clamp
7889 } else {
7890 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
7891 }
7892 legalizeOperands(*NewInstr, MDT);
7893 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7894 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7895 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7896 Inst.eraseFromParent();
7897 return;
7898 }
7899 case AMDGPU::S_CMP_LT_F16:
7900 case AMDGPU::S_CMP_EQ_F16:
7901 case AMDGPU::S_CMP_LE_F16:
7902 case AMDGPU::S_CMP_GT_F16:
7903 case AMDGPU::S_CMP_LG_F16:
7904 case AMDGPU::S_CMP_GE_F16:
7905 case AMDGPU::S_CMP_O_F16:
7906 case AMDGPU::S_CMP_U_F16:
7907 case AMDGPU::S_CMP_NGE_F16:
7908 case AMDGPU::S_CMP_NLG_F16:
7909 case AMDGPU::S_CMP_NGT_F16:
7910 case AMDGPU::S_CMP_NLE_F16:
7911 case AMDGPU::S_CMP_NEQ_F16:
7912 case AMDGPU::S_CMP_NLT_F16: {
7913 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7914 auto NewInstr =
7915 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7916 .setMIFlags(Inst.getFlags());
7917 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
7918 NewInstr
7919 .addImm(0) // src0_modifiers
7920 .add(Inst.getOperand(0)) // src0
7921 .addImm(0) // src1_modifiers
7922 .add(Inst.getOperand(1)) // src1
7923 .addImm(0); // clamp
7924 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
7925 NewInstr.addImm(0); // op_sel0
7926 } else {
7927 NewInstr
7928 .add(Inst.getOperand(0))
7929 .add(Inst.getOperand(1));
7930 }
7931 legalizeOperandsVALUt16(*NewInstr, MRI);
7932 legalizeOperands(*NewInstr, MDT);
7933 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7934 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7935 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7936 Inst.eraseFromParent();
7937 return;
7938 }
7939 case AMDGPU::S_CVT_HI_F32_F16: {
7940 const DebugLoc &DL = Inst.getDebugLoc();
7941 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7942 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7943 if (ST.useRealTrue16Insts()) {
7944 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
7945 .add(Inst.getOperand(1));
7946 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7947 .addImm(0) // src0_modifiers
7948 .addReg(TmpReg, 0, AMDGPU::hi16)
7949 .addImm(0) // clamp
7950 .addImm(0) // omod
7951 .addImm(0); // op_sel0
7952 } else {
7953 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7954 .addImm(16)
7955 .add(Inst.getOperand(1));
7956 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7957 .addImm(0) // src0_modifiers
7958 .addReg(TmpReg)
7959 .addImm(0) // clamp
7960 .addImm(0); // omod
7961 }
7962
7963 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7964 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7965 Inst.eraseFromParent();
7966 return;
7967 }
7968 case AMDGPU::S_MINIMUM_F32:
7969 case AMDGPU::S_MAXIMUM_F32: {
7970 const DebugLoc &DL = Inst.getDebugLoc();
7971 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7972 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7973 .addImm(0) // src0_modifiers
7974 .add(Inst.getOperand(1))
7975 .addImm(0) // src1_modifiers
7976 .add(Inst.getOperand(2))
7977 .addImm(0) // clamp
7978 .addImm(0); // omod
7979 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7980
7981 legalizeOperands(*NewInstr, MDT);
7982 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7983 Inst.eraseFromParent();
7984 return;
7985 }
7986 case AMDGPU::S_MINIMUM_F16:
7987 case AMDGPU::S_MAXIMUM_F16: {
7988 const DebugLoc &DL = Inst.getDebugLoc();
7989 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
7990 ? &AMDGPU::VGPR_16RegClass
7991 : &AMDGPU::VGPR_32RegClass);
7992 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7993 .addImm(0) // src0_modifiers
7994 .add(Inst.getOperand(1))
7995 .addImm(0) // src1_modifiers
7996 .add(Inst.getOperand(2))
7997 .addImm(0) // clamp
7998 .addImm(0) // omod
7999 .addImm(0); // opsel0
8000 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8001 legalizeOperandsVALUt16(*NewInstr, MRI);
8002 legalizeOperands(*NewInstr, MDT);
8003 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8004 Inst.eraseFromParent();
8005 return;
8006 }
8007 case AMDGPU::V_S_EXP_F16_e64:
8008 case AMDGPU::V_S_LOG_F16_e64:
8009 case AMDGPU::V_S_RCP_F16_e64:
8010 case AMDGPU::V_S_RSQ_F16_e64:
8011 case AMDGPU::V_S_SQRT_F16_e64: {
8012 const DebugLoc &DL = Inst.getDebugLoc();
8013 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8014 ? &AMDGPU::VGPR_16RegClass
8015 : &AMDGPU::VGPR_32RegClass);
8016 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8017 .add(Inst.getOperand(1)) // src0_modifiers
8018 .add(Inst.getOperand(2))
8019 .add(Inst.getOperand(3)) // clamp
8020 .add(Inst.getOperand(4)) // omod
8021 .setMIFlags(Inst.getFlags());
8022 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8023 NewInstr.addImm(0); // opsel0
8024 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8025 legalizeOperandsVALUt16(*NewInstr, MRI);
8026 legalizeOperands(*NewInstr, MDT);
8027 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8028 Inst.eraseFromParent();
8029 return;
8030 }
8031 }
8032
8033 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8034 // We cannot move this instruction to the VALU, so we should try to
8035 // legalize its operands instead.
8036 legalizeOperands(Inst, MDT);
8037 return;
8038 }
8039 // Handle converting generic instructions like COPY-to-SGPR into
8040 // COPY-to-VGPR.
8041 if (NewOpcode == Opcode) {
8042 Register DstReg = Inst.getOperand(0).getReg();
8043 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8044
8045 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8046 // hope for the best.
8047 if (Inst.isCopy() && DstReg.isPhysical() &&
8048 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8049 // TODO: Only works for 32 bit registers.
8050 if (MRI.constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass)) {
8051 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8052 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
8053 .add(Inst.getOperand(1));
8054 } else {
8055 Register NewDst =
8056 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8057 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8058 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
8059 .add(Inst.getOperand(1));
8060 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
8061 DstReg)
8062 .addReg(NewDst);
8063 }
8064 Inst.eraseFromParent();
8065 return;
8066 }
8067
8068 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
8069 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
8070 // Instead of creating a copy where src and dst are the same register
8071 // class, we just replace all uses of dst with src. These kinds of
8072 // copies interfere with the heuristics MachineSink uses to decide
8073 // whether or not to split a critical edge, since the pass assumes
8074 // that copies will end up as machine instructions and not be
8075 // eliminated.
8076 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8077 Register NewDstReg = Inst.getOperand(1).getReg();
8078 MRI.replaceRegWith(DstReg, NewDstReg);
8079 MRI.clearKillFlags(NewDstReg);
8080 Inst.getOperand(0).setReg(DstReg);
8081 Inst.eraseFromParent();
8082 // Legalize t16 operands, since replaceRegWith runs after addUsersToMoveToVALUWorklist.
8083 for (MachineOperand &MO :
8084 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8085 legalizeOperandsVALUt16(*MO.getParent(), MRI);
8086 }
8087 return;
8088 }
8089
8090 // If this is a v2s copy between a 16-bit and a 32-bit register,
8091 // replace the vgpr copy with a reg_sequence/extract_subreg.
8092 // This can be removed once sgpr16 is in place.
8093 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8094 Inst.getOperand(1).getReg().isVirtual() &&
8095 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8096 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8097 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8098 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8099 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8100 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8101 get(AMDGPU::IMPLICIT_DEF), Undef);
8102 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8103 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8104 .addReg(Inst.getOperand(1).getReg())
8105 .addImm(AMDGPU::lo16)
8106 .addReg(Undef)
8107 .addImm(AMDGPU::hi16);
8108 Inst.eraseFromParent();
8109 MRI.replaceRegWith(DstReg, NewDstReg);
8110 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8111 return;
8112 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8113 AMDGPU::lo16)) {
8114 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8115 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8116 MRI.replaceRegWith(DstReg, NewDstReg);
8117 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8118 return;
8119 }
8120 }
8121
8122 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8123 MRI.replaceRegWith(DstReg, NewDstReg);
8124 legalizeOperands(Inst, MDT);
8125 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8126 return;
8127 }
8128
8129 // Use the new VALU Opcode.
8130 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8131 .setMIFlags(Inst.getFlags());
8132 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8133 // Intersperse VOP3 modifiers among the SALU operands.
8134 NewInstr->addOperand(Inst.getOperand(0));
8135 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8136 AMDGPU::OpName::src0_modifiers) >= 0)
8137 NewInstr.addImm(0);
8138 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8139 MachineOperand Src = Inst.getOperand(1);
8140 NewInstr->addOperand(Src);
8141 }
8142
8143 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8144 // We are converting these to a BFE, so we need to add the missing
8145 // operands for the size and offset.
8146 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8147 NewInstr.addImm(0);
8148 NewInstr.addImm(Size);
8149 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8150 // The VALU version adds the second operand to the result, so insert an
8151 // extra 0 operand.
8152 NewInstr.addImm(0);
8153 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8154 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8155 // If we need to move this to VGPRs, we need to unpack the second
8156 // operand back into the 2 separate ones for bit offset and width.
8157 assert(OffsetWidthOp.isImm() &&
8158 "Scalar BFE is only implemented for constant width and offset");
8159 uint32_t Imm = OffsetWidthOp.getImm();
8160
8161 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8162 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8163 NewInstr.addImm(Offset);
8164 NewInstr.addImm(BitWidth);
8165 } else {
8166 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8167 AMDGPU::OpName::src1_modifiers) >= 0)
8168 NewInstr.addImm(0);
8169 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8170 NewInstr->addOperand(Inst.getOperand(2));
8171 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8172 AMDGPU::OpName::src2_modifiers) >= 0)
8173 NewInstr.addImm(0);
8174 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8175 NewInstr->addOperand(Inst.getOperand(3));
8176 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8177 NewInstr.addImm(0);
8178 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8179 NewInstr.addImm(0);
8180 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8181 NewInstr.addImm(0);
8182 }
8183 } else {
8184 // Just copy the SALU operands.
8185 for (const MachineOperand &Op : Inst.explicit_operands())
8186 NewInstr->addOperand(Op);
8187 }
8188
8189 // Remove any references to SCC. Vector instructions can't read from it, and
8190 // we're just about to add the implicit uses / defs of VCC, and we don't want
8191 // both.
8192 for (MachineOperand &Op : Inst.implicit_operands()) {
8193 if (Op.getReg() == AMDGPU::SCC) {
8194 // Only propagate through live-def of SCC.
8195 if (Op.isDef() && !Op.isDead())
8196 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8197 if (Op.isUse())
8198 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8199 }
8200 }
8201 Inst.eraseFromParent();
8202 Register NewDstReg;
8203 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8204 Register DstReg = NewInstr->getOperand(0).getReg();
8205 assert(DstReg.isVirtual());
8206 // Update the destination register class.
8207 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8208 assert(NewDstRC);
8209 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8210 MRI.replaceRegWith(DstReg, NewDstReg);
8211 }
8212 fixImplicitOperands(*NewInstr);
8213
8214 legalizeOperandsVALUt16(*NewInstr, MRI);
8215
8216 // Legalize the operands
8217 legalizeOperands(*NewInstr, MDT);
8218 if (NewDstReg)
8219 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8220}
8221
8222// Add/sub require special handling to deal with carry outs.
8223std::pair<bool, MachineBasicBlock *>
8224SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8225 MachineDominatorTree *MDT) const {
8226 if (ST.hasAddNoCarry()) {
8227 // Assume there is no user of scc since we don't select this in that case.
8228 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8229 // is used.
8230
8231 MachineBasicBlock &MBB = *Inst.getParent();
8232 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8233
8234 Register OldDstReg = Inst.getOperand(0).getReg();
8235 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8236
8237 unsigned Opc = Inst.getOpcode();
8238 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8239
8240 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8241 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8242
8243 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8244 Inst.removeOperand(3);
8245
8246 Inst.setDesc(get(NewOpc));
8247 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8248 fixImplicitOperands(Inst);
8249 MRI.replaceRegWith(OldDstReg, ResultReg);
8250 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8251
8252 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8253 return std::pair(true, NewBB);
8254 }
8255
8256 return std::pair(false, nullptr);
8257}
8258
8259void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8260 MachineDominatorTree *MDT) const {
8261
8262 MachineBasicBlock &MBB = *Inst.getParent();
8263 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8264 MachineBasicBlock::iterator MII = Inst;
8265 DebugLoc DL = Inst.getDebugLoc();
8266
8267 MachineOperand &Dest = Inst.getOperand(0);
8268 MachineOperand &Src0 = Inst.getOperand(1);
8269 MachineOperand &Src1 = Inst.getOperand(2);
8270 MachineOperand &Cond = Inst.getOperand(3);
8271
8272 Register CondReg = Cond.getReg();
8273 bool IsSCC = (CondReg == AMDGPU::SCC);
8274
8275 // If this is a trivial select where the condition is effectively not SCC
8276 // (CondReg is a source of copy to SCC), then the select is semantically
8277 // equivalent to copying CondReg. Hence, there is no need to create
8278 // V_CNDMASK, we can just use that and bail out.
8279 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8280 (Src1.getImm() == 0)) {
8281 MRI.replaceRegWith(Dest.getReg(), CondReg);
8282 return;
8283 }
8284
8285 Register NewCondReg = CondReg;
8286 if (IsSCC) {
8287 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8288 NewCondReg = MRI.createVirtualRegister(TC);
8289
8290 // Now look for the closest SCC def; if it is a copy, replace
8291 // CondReg with the COPY's source register.
8292 bool CopyFound = false;
8293 for (MachineInstr &CandI :
8294 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
8295 Inst.getParent()->rend())) {
8296 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8297 -1) {
8298 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8299 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8300 .addReg(CandI.getOperand(1).getReg());
8301 CopyFound = true;
8302 }
8303 break;
8304 }
8305 }
8306 if (!CopyFound) {
8307 // SCC def is not a copy
8308 // Insert a trivial select instead of creating a copy, because a copy from
8309 // SCC would semantically mean just copying a single bit, but we may need
8310 // the result to be a vector condition mask that needs preserving.
8311 unsigned Opcode =
8312 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8313 auto NewSelect =
8314 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8315 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8316 }
8317 }
8318
8319 Register NewDestReg = MRI.createVirtualRegister(
8320 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8321 MachineInstr *NewInst;
8322 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8323 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8324 .addImm(0)
8325 .add(Src1) // False
8326 .addImm(0)
8327 .add(Src0) // True
8328 .addReg(NewCondReg);
8329 } else {
8330 NewInst =
8331 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8332 .add(Src1) // False
8333 .add(Src0) // True
8334 .addReg(NewCondReg);
8335 }
8336 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8337 legalizeOperands(*NewInst, MDT);
8338 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8339}
8340
8341void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8342 MachineInstr &Inst) const {
8343 MachineBasicBlock &MBB = *Inst.getParent();
8344 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8345 MachineBasicBlock::iterator MII = Inst;
8346 DebugLoc DL = Inst.getDebugLoc();
8347
8348 MachineOperand &Dest = Inst.getOperand(0);
8349 MachineOperand &Src = Inst.getOperand(1);
8350 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8351 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8352
8353 unsigned SubOp = ST.hasAddNoCarry() ?
8354 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8355
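  // abs(x) is expanded as max(x, 0 - x): negate with a subtract from zero,
  // then take the signed maximum of the original and negated values.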
8356 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8357 .addImm(0)
8358 .addReg(Src.getReg());
8359
8360 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8361 .addReg(Src.getReg())
8362 .addReg(TmpReg);
8363
8364 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8365 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8366}
8367
8368void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8369 MachineInstr &Inst) const {
8370 MachineBasicBlock &MBB = *Inst.getParent();
8371 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8372 MachineBasicBlock::iterator MII = Inst;
8373 const DebugLoc &DL = Inst.getDebugLoc();
8374
8375 MachineOperand &Dest = Inst.getOperand(0);
8376 MachineOperand &Src0 = Inst.getOperand(1);
8377 MachineOperand &Src1 = Inst.getOperand(2);
8378
8379 if (ST.hasDLInsts()) {
8380 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8381 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8382 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8383
8384 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8385 .add(Src0)
8386 .add(Src1);
8387
8388 MRI.replaceRegWith(Dest.getReg(), NewDest);
8389 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8390 } else {
8391 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8392 // invert either source and then perform the XOR. If either source is a
8393 // scalar register, then we can leave the inversion on the scalar unit to
8394 // achieve a better distribution of scalar and vector instructions.
8395 bool Src0IsSGPR = Src0.isReg() &&
8396 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8397 bool Src1IsSGPR = Src1.isReg() &&
8398 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8399 MachineInstr *Xor;
8400 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8401 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8402
8403 // Build a pair of scalar instructions and add them to the work list.
8404 // The next iteration over the work list will lower these to the vector
8405 // unit as necessary.
8406 if (Src0IsSGPR) {
8407 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8408 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8409 .addReg(Temp)
8410 .add(Src1);
8411 } else if (Src1IsSGPR) {
8412 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8413 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8414 .add(Src0)
8415 .addReg(Temp);
8416 } else {
8417 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8418 .add(Src0)
8419 .add(Src1);
8420 MachineInstr *Not =
8421 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8422 Worklist.insert(Not);
8423 }
8424
8425 MRI.replaceRegWith(Dest.getReg(), NewDest);
8426
8427 Worklist.insert(Xor);
8428
8429 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8430 }
8431}
8432
8433void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8434 MachineInstr &Inst,
8435 unsigned Opcode) const {
8436 MachineBasicBlock &MBB = *Inst.getParent();
8437 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8438 MachineBasicBlock::iterator MII = Inst;
8439 const DebugLoc &DL = Inst.getDebugLoc();
8440
8441 MachineOperand &Dest = Inst.getOperand(0);
8442 MachineOperand &Src0 = Inst.getOperand(1);
8443 MachineOperand &Src1 = Inst.getOperand(2);
8444
8445 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8446 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8447
8448 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8449 .add(Src0)
8450 .add(Src1);
8451
8452 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8453 .addReg(Interm);
8454
8455 Worklist.insert(&Op);
8456 Worklist.insert(&Not);
8457
8458 MRI.replaceRegWith(Dest.getReg(), NewDest);
8459 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8460}
8461
8462void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8463 MachineInstr &Inst,
8464 unsigned Opcode) const {
8465 MachineBasicBlock &MBB = *Inst.getParent();
8466 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8467 MachineBasicBlock::iterator MII = Inst;
8468 const DebugLoc &DL = Inst.getDebugLoc();
8469
8470 MachineOperand &Dest = Inst.getOperand(0);
8471 MachineOperand &Src0 = Inst.getOperand(1);
8472 MachineOperand &Src1 = Inst.getOperand(2);
8473
8474 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8475 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8476
8477 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8478 .add(Src1);
8479
8480 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8481 .add(Src0)
8482 .addReg(Interm);
8483
8484 Worklist.insert(&Not);
8485 Worklist.insert(&Op);
8486
8487 MRI.replaceRegWith(Dest.getReg(), NewDest);
8488 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8489}
8490
8491void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8492 MachineInstr &Inst, unsigned Opcode,
8493 bool Swap) const {
8494 MachineBasicBlock &MBB = *Inst.getParent();
8495  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8496
8497 MachineOperand &Dest = Inst.getOperand(0);
8498 MachineOperand &Src0 = Inst.getOperand(1);
8499 DebugLoc DL = Inst.getDebugLoc();
8500
8501 MachineBasicBlock::iterator MII = Inst;
8502
8503 const MCInstrDesc &InstDesc = get(Opcode);
8504 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8505 MRI.getRegClass(Src0.getReg()) :
8506 &AMDGPU::SGPR_32RegClass;
8507
8508 const TargetRegisterClass *Src0SubRC =
8509 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8510
8511 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8512 AMDGPU::sub0, Src0SubRC);
8513
8514 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8515 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8516 const TargetRegisterClass *NewDestSubRC =
8517 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8518
8519 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8520 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8521
8522 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8523 AMDGPU::sub1, Src0SubRC);
8524
8525 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8526 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8527
8528 if (Swap)
8529 std::swap(DestSub0, DestSub1);
8530
8531 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8532 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8533 .addReg(DestSub0)
8534 .addImm(AMDGPU::sub0)
8535 .addReg(DestSub1)
8536 .addImm(AMDGPU::sub1);
8537
8538 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8539
8540 Worklist.insert(&LoHalf);
8541 Worklist.insert(&HiHalf);
8542
8543 // We don't need to legalizeOperands here because for a single operand, src0
8544 // will support any kind of input.
8545
8546 // Move all users of this moved value.
8547 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8548}
8549
8550 // There is no vector equivalent of s_mul_u64, so we need to split the
8551 // s_mul_u64 into 32-bit vector multiplications.
8552void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8553 MachineInstr &Inst,
8554 MachineDominatorTree *MDT) const {
8555 MachineBasicBlock &MBB = *Inst.getParent();
8556  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8557
8558 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8559 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8560 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8561
8562 MachineOperand &Dest = Inst.getOperand(0);
8563 MachineOperand &Src0 = Inst.getOperand(1);
8564 MachineOperand &Src1 = Inst.getOperand(2);
8565 const DebugLoc &DL = Inst.getDebugLoc();
8566 MachineBasicBlock::iterator MII = Inst;
8567
8568 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8569 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8570 const TargetRegisterClass *Src0SubRC =
8571 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8572 if (RI.isSGPRClass(Src0SubRC))
8573 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8574 const TargetRegisterClass *Src1SubRC =
8575 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8576 if (RI.isSGPRClass(Src1SubRC))
8577 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8578
8579 // First, we extract the low 32-bit and high 32-bit values from each of the
8580 // operands.
8581 MachineOperand Op0L =
8582 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8583 MachineOperand Op1L =
8584 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8585 MachineOperand Op0H =
8586 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8587 MachineOperand Op1H =
8588 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8589
8590 // The multiplication is done as follows:
8591 //
8592 // Op1H Op1L
8593 // * Op0H Op0L
8594 // --------------------
8595 // Op1H*Op0L Op1L*Op0L
8596 // + Op1H*Op0H Op1L*Op0H
8597 // -----------------------------------------
8598 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8599 //
8600 // We drop Op1H*Op0H because it only contributes to bits 64 and above, which
8601 // are discarded from the 64-bit result.
8602 // The low 32-bit value is Op1L*Op0L.
8603 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
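// Illustrative check (not part of the original source): for
// 0x0000000200000003 * 0x0000000400000005, Op0L = 3, Op0H = 2, Op1L = 5 and
// Op1H = 4. Lo = Op1L*Op0L = 15 (carry 0) and Hi = Op1H*Op0L + Op1L*Op0H + 0
// = 12 + 10 = 22, so the result is (22 << 32) | 15, which equals the full
// product reduced modulo 2^64.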
8604
8605 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8606 MachineInstr *Op1L_Op0H =
8607 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8608 .add(Op1L)
8609 .add(Op0H);
8610
8611 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8612 MachineInstr *Op1H_Op0L =
8613 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8614 .add(Op1H)
8615 .add(Op0L);
8616
8617 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8618 MachineInstr *Carry =
8619 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8620 .add(Op1L)
8621 .add(Op0L);
8622
8623 MachineInstr *LoHalf =
8624 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8625 .add(Op1L)
8626 .add(Op0L);
8627
8628 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8629 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8630 .addReg(Op1L_Op0H_Reg)
8631 .addReg(Op1H_Op0L_Reg);
8632
8633 MachineInstr *HiHalf =
8634 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8635 .addReg(AddReg)
8636 .addReg(CarryReg);
8637
8638 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8639 .addReg(DestSub0)
8640 .addImm(AMDGPU::sub0)
8641 .addReg(DestSub1)
8642 .addImm(AMDGPU::sub1);
8643
8644 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8645
8646 // Try to legalize the operands in case we need to swap the order to keep it
8647 // valid.
8648 legalizeOperands(*Op1L_Op0H, MDT);
8649 legalizeOperands(*Op1H_Op0L, MDT);
8650 legalizeOperands(*Carry, MDT);
8651 legalizeOperands(*LoHalf, MDT);
8652 legalizeOperands(*Add, MDT);
8653 legalizeOperands(*HiHalf, MDT);
8654
8655 // Move all users of this moved value.
8656 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8657}
8658
8659 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8660 // multiplications.
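// These pseudos are only formed when the 64-bit sources are known to be zero-
// or sign-extended 32-bit values, so only the low halves need to be
// multiplied: the low result is mul_lo(Op1L, Op0L) and the high result is the
// corresponding mul_hi. As an illustrative example (not from the source),
// 0xffffffff * 0xffffffff taken as u64 yields lo = 1 and hi = 0xfffffffe.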
8661void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8662 MachineInstr &Inst,
8663 MachineDominatorTree *MDT) const {
8664 MachineBasicBlock &MBB = *Inst.getParent();
8665  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8666
8667 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8668 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8669 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8670
8671 MachineOperand &Dest = Inst.getOperand(0);
8672 MachineOperand &Src0 = Inst.getOperand(1);
8673 MachineOperand &Src1 = Inst.getOperand(2);
8674 const DebugLoc &DL = Inst.getDebugLoc();
8675 MachineBasicBlock::iterator MII = Inst;
8676
8677 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8678 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8679 const TargetRegisterClass *Src0SubRC =
8680 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8681 if (RI.isSGPRClass(Src0SubRC))
8682 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8683 const TargetRegisterClass *Src1SubRC =
8684 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8685 if (RI.isSGPRClass(Src1SubRC))
8686 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8687
8688 // First, we extract the low 32-bit and high 32-bit values from each of the
8689 // operands.
8690 MachineOperand Op0L =
8691 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8692 MachineOperand Op1L =
8693 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8694
8695 unsigned Opc = Inst.getOpcode();
8696 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8697 ? AMDGPU::V_MUL_HI_U32_e64
8698 : AMDGPU::V_MUL_HI_I32_e64;
8699 MachineInstr *HiHalf =
8700 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8701
8702 MachineInstr *LoHalf =
8703 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8704 .add(Op1L)
8705 .add(Op0L);
8706
8707 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8708 .addReg(DestSub0)
8709 .addImm(AMDGPU::sub0)
8710 .addReg(DestSub1)
8711 .addImm(AMDGPU::sub1);
8712
8713 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8714
8715 // Try to legalize the operands in case we need to swap the order to keep it
8716 // valid.
8717 legalizeOperands(*HiHalf, MDT);
8718 legalizeOperands(*LoHalf, MDT);
8719
8720 // Move all users of this moved value.
8721 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8722}
8723
8724void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8725 MachineInstr &Inst, unsigned Opcode,
8726 MachineDominatorTree *MDT) const {
8727 MachineBasicBlock &MBB = *Inst.getParent();
8728  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8729
8730 MachineOperand &Dest = Inst.getOperand(0);
8731 MachineOperand &Src0 = Inst.getOperand(1);
8732 MachineOperand &Src1 = Inst.getOperand(2);
8733 DebugLoc DL = Inst.getDebugLoc();
8734
8735 MachineBasicBlock::iterator MII = Inst;
8736
8737 const MCInstrDesc &InstDesc = get(Opcode);
8738 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8739 MRI.getRegClass(Src0.getReg()) :
8740 &AMDGPU::SGPR_32RegClass;
8741
8742 const TargetRegisterClass *Src0SubRC =
8743 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8744 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8745 MRI.getRegClass(Src1.getReg()) :
8746 &AMDGPU::SGPR_32RegClass;
8747
8748 const TargetRegisterClass *Src1SubRC =
8749 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8750
8751 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8752 AMDGPU::sub0, Src0SubRC);
8753 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8754 AMDGPU::sub0, Src1SubRC);
8755 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8756 AMDGPU::sub1, Src0SubRC);
8757 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8758 AMDGPU::sub1, Src1SubRC);
8759
8760 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8761 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8762 const TargetRegisterClass *NewDestSubRC =
8763 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8764
8765 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8766 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8767 .add(SrcReg0Sub0)
8768 .add(SrcReg1Sub0);
8769
8770 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8771 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8772 .add(SrcReg0Sub1)
8773 .add(SrcReg1Sub1);
8774
8775 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8776 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8777 .addReg(DestSub0)
8778 .addImm(AMDGPU::sub0)
8779 .addReg(DestSub1)
8780 .addImm(AMDGPU::sub1);
8781
8782 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8783
8784 Worklist.insert(&LoHalf);
8785 Worklist.insert(&HiHalf);
8786
8787 // Move all users of this moved value.
8788 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8789}
8790
8791void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8792 MachineInstr &Inst,
8793 MachineDominatorTree *MDT) const {
8794 MachineBasicBlock &MBB = *Inst.getParent();
8795  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8796
8797 MachineOperand &Dest = Inst.getOperand(0);
8798 MachineOperand &Src0 = Inst.getOperand(1);
8799 MachineOperand &Src1 = Inst.getOperand(2);
8800 const DebugLoc &DL = Inst.getDebugLoc();
8801
8802 MachineBasicBlock::iterator MII = Inst;
8803
8804 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8805
8806 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8807
8808 MachineOperand* Op0;
8809 MachineOperand* Op1;
8810
8811 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8812 Op0 = &Src0;
8813 Op1 = &Src1;
8814 } else {
8815 Op0 = &Src1;
8816 Op1 = &Src0;
8817 }
8818
8819 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8820 .add(*Op0);
8821
8822 Register NewDest = MRI.createVirtualRegister(DestRC);
8823
8824 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8825 .addReg(Interm)
8826 .add(*Op1);
8827
8828 MRI.replaceRegWith(Dest.getReg(), NewDest);
8829
8830 Worklist.insert(&Xor);
8831}
8832
8833void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8834 MachineInstr &Inst) const {
8835 MachineBasicBlock &MBB = *Inst.getParent();
8836  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8837
8838 MachineBasicBlock::iterator MII = Inst;
8839 const DebugLoc &DL = Inst.getDebugLoc();
8840
8841 MachineOperand &Dest = Inst.getOperand(0);
8842 MachineOperand &Src = Inst.getOperand(1);
8843
8844 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8845 const TargetRegisterClass *SrcRC = Src.isReg() ?
8846 MRI.getRegClass(Src.getReg()) :
8847 &AMDGPU::SGPR_32RegClass;
8848
8849 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8850 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8851
8852 const TargetRegisterClass *SrcSubRC =
8853 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8854
8855 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8856 AMDGPU::sub0, SrcSubRC);
8857 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8858 AMDGPU::sub1, SrcSubRC);
8859
8860 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8861
8862 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8863
8864 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8865
8866 // We don't need to legalize operands here. src0 for either instruction can be
8867 // an SGPR, and the second input is unused or determined here.
8868 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8869}
8870
8871void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8872 MachineInstr &Inst) const {
8873 MachineBasicBlock &MBB = *Inst.getParent();
8874  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8875  MachineBasicBlock::iterator MII = Inst;
8876 const DebugLoc &DL = Inst.getDebugLoc();
8877
8878 MachineOperand &Dest = Inst.getOperand(0);
8879 uint32_t Imm = Inst.getOperand(2).getImm();
8880 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8881 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
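// For instance (an illustrative value, not from the source), Imm = 0x100000
// decodes to Offset = 0 and BitWidth = 16, i.e. a sign extension of the low
// 16 bits.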
8882
8883 (void) Offset;
8884
8885 // Only sext_inreg cases handled.
8886 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8887 Offset == 0 && "Not implemented");
8888
8889 if (BitWidth < 32) {
8890 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8891 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8892 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8893
8894 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8895 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8896 .addImm(0)
8897 .addImm(BitWidth);
8898
8899 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8900 .addImm(31)
8901 .addReg(MidRegLo);
8902
8903 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8904 .addReg(MidRegLo)
8905 .addImm(AMDGPU::sub0)
8906 .addReg(MidRegHi)
8907 .addImm(AMDGPU::sub1);
8908
8909 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8910 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8911 return;
8912 }
8913
8914 MachineOperand &Src = Inst.getOperand(1);
8915 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8916 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8917
8918 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8919 .addImm(31)
8920 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8921
8922 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8923 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8924 .addImm(AMDGPU::sub0)
8925 .addReg(TmpReg)
8926 .addImm(AMDGPU::sub1);
8927
8928 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8929 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8930}
8931
8932void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8933 MachineInstr &Inst, unsigned Opcode,
8934 MachineDominatorTree *MDT) const {
8935 // (S_FLBIT_I32_B64 hi:lo) ->
8936 //   (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8937 // (S_FF1_I32_B64 hi:lo) ->
8938 //   (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
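// Illustrative check for the ctlz case (not part of the original source): for
// the 64-bit value 1, hi = 0 so V_FFBH_U32(hi) = 0xffffffff, while
// uaddsat(V_FFBH_U32(lo), 32) = uaddsat(31, 32) = 63; the umin yields 63, the
// expected number of leading zeros.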
8939
8940 MachineBasicBlock &MBB = *Inst.getParent();
8941  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8942  MachineBasicBlock::iterator MII = Inst;
8943 const DebugLoc &DL = Inst.getDebugLoc();
8944
8945 MachineOperand &Dest = Inst.getOperand(0);
8946 MachineOperand &Src = Inst.getOperand(1);
8947
8948 const MCInstrDesc &InstDesc = get(Opcode);
8949
8950 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8951 unsigned OpcodeAdd =
8952 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8953
8954 const TargetRegisterClass *SrcRC =
8955 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
8956 const TargetRegisterClass *SrcSubRC =
8957 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8958
8959 MachineOperand SrcRegSub0 =
8960 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
8961 MachineOperand SrcRegSub1 =
8962 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
8963
8964 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8965 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8966 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8967 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8968
8969 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
8970
8971 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
8972
8973 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
8974 .addReg(IsCtlz ? MidReg1 : MidReg2)
8975 .addImm(32)
8976 .addImm(1); // enable clamp
8977
8978 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
8979 .addReg(MidReg3)
8980 .addReg(IsCtlz ? MidReg2 : MidReg1);
8981
8982 MRI.replaceRegWith(Dest.getReg(), MidReg4);
8983
8984 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
8985}
8986
8987void SIInstrInfo::addUsersToMoveToVALUWorklist(
8988    Register DstReg, MachineRegisterInfo &MRI,
8989    SIInstrWorklist &Worklist) const {
8990 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
8991 MachineInstr &UseMI = *MO.getParent();
8992
8993 unsigned OpNo = 0;
8994
8995 switch (UseMI.getOpcode()) {
8996 case AMDGPU::COPY:
8997 case AMDGPU::WQM:
8998 case AMDGPU::SOFT_WQM:
8999 case AMDGPU::STRICT_WWM:
9000 case AMDGPU::STRICT_WQM:
9001 case AMDGPU::REG_SEQUENCE:
9002 case AMDGPU::PHI:
9003 case AMDGPU::INSERT_SUBREG:
9004 break;
9005 default:
9006 OpNo = MO.getOperandNo();
9007 break;
9008 }
9009
9010 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo)))
9011 Worklist.insert(&UseMI);
9012 else
9013 // Legalization could change user list.
9014      legalizeOperandsVALUt16(UseMI, MRI);
9015  }
9016}
9017
9018void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9019                                 MachineRegisterInfo &MRI,
9020                                 MachineInstr &Inst) const {
9021 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9022  MachineBasicBlock *MBB = Inst.getParent();
9023  MachineOperand &Src0 = Inst.getOperand(1);
9024 MachineOperand &Src1 = Inst.getOperand(2);
9025 const DebugLoc &DL = Inst.getDebugLoc();
9026
9027 switch (Inst.getOpcode()) {
9028 case AMDGPU::S_PACK_LL_B32_B16: {
9029 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9030 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9031
9032 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9033 // 0.
9034 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9035 .addImm(0xffff);
9036
9037 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9038 .addReg(ImmReg, RegState::Kill)
9039 .add(Src0);
9040
9041 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9042 .add(Src1)
9043 .addImm(16)
9044 .addReg(TmpReg, RegState::Kill);
9045 break;
9046 }
9047 case AMDGPU::S_PACK_LH_B32_B16: {
9048 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9049 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9050 .addImm(0xffff);
9051 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9052 .addReg(ImmReg, RegState::Kill)
9053 .add(Src0)
9054 .add(Src1);
9055 break;
9056 }
9057 case AMDGPU::S_PACK_HL_B32_B16: {
9058 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9059 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9060 .addImm(16)
9061 .add(Src0);
9062 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9063 .add(Src1)
9064 .addImm(16)
9065 .addReg(TmpReg, RegState::Kill);
9066 break;
9067 }
9068 case AMDGPU::S_PACK_HH_B32_B16: {
9069 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9070 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9071 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9072 .addImm(16)
9073 .add(Src0);
9074 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9075 .addImm(0xffff0000);
9076 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9077 .add(Src1)
9078 .addReg(ImmReg, RegState::Kill)
9079 .addReg(TmpReg, RegState::Kill);
9080 break;
9081 }
9082 default:
9083 llvm_unreachable("unhandled s_pack_* instruction");
9084 }
9085
9086 MachineOperand &Dest = Inst.getOperand(0);
9087 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9088 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9089}
9090
9091void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
9092 MachineInstr &SCCDefInst,
9093 SIInstrWorklist &Worklist,
9094 Register NewCond) const {
9095
9096 // Ensure that def inst defines SCC, which is still live.
9097 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9098 !Op.isDead() && Op.getParent() == &SCCDefInst);
9099 SmallVector<MachineInstr *, 4> CopyToDelete;
9100 // This assumes that all the users of SCC are in the same block
9101 // as the SCC def.
9102 for (MachineInstr &MI : // Skip the def inst itself.
9103 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9104 SCCDefInst.getParent()->end())) {
9105 // Check if SCC is used first.
9106 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9107 if (SCCIdx != -1) {
9108 if (MI.isCopy()) {
9109 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9110 Register DestReg = MI.getOperand(0).getReg();
9111
9112 MRI.replaceRegWith(DestReg, NewCond);
9113 CopyToDelete.push_back(&MI);
9114 } else {
9115
9116 if (NewCond.isValid())
9117 MI.getOperand(SCCIdx).setReg(NewCond);
9118
9119 Worklist.insert(&MI);
9120 }
9121 }
9122 // Exit if we find another SCC def.
9123 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9124 break;
9125 }
9126 for (auto &Copy : CopyToDelete)
9127 Copy->eraseFromParent();
9128}
9129
9130// Instructions that use SCC may be converted to VALU instructions. When that
9131// happens, the SCC register is changed to VCC_LO. The instruction that defines
9132// SCC must be changed to an instruction that defines VCC. This function makes
9133// sure that the instruction that defines SCC is added to the moveToVALU
9134// worklist.
9135void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9136 SIInstrWorklist &Worklist) const {
9137 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9138 // then there is nothing to do because the defining instruction has been
9139 // converted to a VALU already. If SCC then that instruction needs to be
9140 // converted to a VALU.
9141 for (MachineInstr &MI :
9142 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9143 SCCUseInst->getParent()->rend())) {
9144 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9145 break;
9146 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9147 Worklist.insert(&MI);
9148 break;
9149 }
9150 }
9151}
9152
9153const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9154 const MachineInstr &Inst) const {
9155 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9156
9157 switch (Inst.getOpcode()) {
9158 // For target instructions, getOpRegClass just returns the virtual register
9159 // class associated with the operand, so we need to find an equivalent VGPR
9160 // register class in order to move the instruction to the VALU.
9161 case AMDGPU::COPY:
9162 case AMDGPU::PHI:
9163 case AMDGPU::REG_SEQUENCE:
9164 case AMDGPU::INSERT_SUBREG:
9165 case AMDGPU::WQM:
9166 case AMDGPU::SOFT_WQM:
9167 case AMDGPU::STRICT_WWM:
9168 case AMDGPU::STRICT_WQM: {
9169 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9170 if (RI.isAGPRClass(SrcRC)) {
9171 if (RI.isAGPRClass(NewDstRC))
9172 return nullptr;
9173
9174 switch (Inst.getOpcode()) {
9175 case AMDGPU::PHI:
9176 case AMDGPU::REG_SEQUENCE:
9177 case AMDGPU::INSERT_SUBREG:
9178 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9179 break;
9180 default:
9181 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9182 }
9183
9184 if (!NewDstRC)
9185 return nullptr;
9186 } else {
9187 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9188 return nullptr;
9189
9190 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9191 if (!NewDstRC)
9192 return nullptr;
9193 }
9194
9195 return NewDstRC;
9196 }
9197 default:
9198 return NewDstRC;
9199 }
9200}
9201
9202// Find the one SGPR operand we are allowed to use.
9203Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9204 int OpIndices[3]) const {
9205 const MCInstrDesc &Desc = MI.getDesc();
9206
9207 // Find the one SGPR operand we are allowed to use.
9208 //
9209 // First we need to consider the instruction's operand requirements before
9210 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9211 // of VCC, but we are still bound by the constant bus requirement to only use
9212 // one.
9213 //
9214 // If the operand's class is an SGPR, we can never move it.
9215
9216 Register SGPRReg = findImplicitSGPRRead(MI);
9217 if (SGPRReg)
9218 return SGPRReg;
9219
9220 Register UsedSGPRs[3] = {Register()};
9221 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9222
9223 for (unsigned i = 0; i < 3; ++i) {
9224 int Idx = OpIndices[i];
9225 if (Idx == -1)
9226 break;
9227
9228 const MachineOperand &MO = MI.getOperand(Idx);
9229 if (!MO.isReg())
9230 continue;
9231
9232 // Is this operand statically required to be an SGPR based on the operand
9233 // constraints?
9234 const TargetRegisterClass *OpRC =
9235 RI.getRegClass(Desc.operands()[Idx].RegClass);
9236 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9237 if (IsRequiredSGPR)
9238 return MO.getReg();
9239
9240 // If this could be a VGPR or an SGPR, Check the dynamic register class.
9241 Register Reg = MO.getReg();
9242 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9243 if (RI.isSGPRClass(RegRC))
9244 UsedSGPRs[i] = Reg;
9245 }
9246
9247 // We don't have a required SGPR operand, so we have a bit more freedom in
9248 // selecting operands to move.
9249
9250 // Try to select the most used SGPR. If an SGPR is equal to one of the
9251 // others, we choose that.
9252 //
9253 // e.g.
9254 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9255 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9256
9257 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9258 // prefer those.
9259
9260 if (UsedSGPRs[0]) {
9261 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9262 SGPRReg = UsedSGPRs[0];
9263 }
9264
9265 if (!SGPRReg && UsedSGPRs[1]) {
9266 if (UsedSGPRs[1] == UsedSGPRs[2])
9267 SGPRReg = UsedSGPRs[1];
9268 }
9269
9270 return SGPRReg;
9271}
9272
9273 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9274                                              AMDGPU::OpName OperandName) const {
9275 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9276 if (Idx == -1)
9277 return nullptr;
9278
9279 return &MI.getOperand(Idx);
9280}
9281
9287 return (Format << 44) |
9288 (1ULL << 56) | // RESOURCE_LEVEL = 1
9289 (3ULL << 60); // OOB_SELECT = 3
9290 }
9291
9292 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9293 if (ST.isAmdHsaOS()) {
9294 // Set ATC = 1. GFX9 doesn't have this bit.
9296 RsrcDataFormat |= (1ULL << 56);
9297
9298 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9299 // BTW, it disables TC L2 and therefore decreases performance.
9301 RsrcDataFormat |= (2ULL << 59);
9302 }
9303
9304 return RsrcDataFormat;
9305}
9306
9310 0xffffffff; // Size;
9311
9312 // GFX9 doesn't have ELEMENT_SIZE.
9314 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9315 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9316 }
9317
9318 // IndexStride = 64 / 32.
9319 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9320 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9321
9322 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9323 // Clear them unless we want a huge stride.
9326 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9327
9328 return Rsrc23;
9329}
9330
9332 unsigned Opc = MI.getOpcode();
9333
9334 return isSMRD(Opc);
9335}
9336
9338 return get(Opc).mayLoad() &&
9339 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9340}
9341
9342 Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
9343                                     int &FrameIndex) const {
9344 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9345 if (!Addr || !Addr->isFI())
9346 return Register();
9347
9348 assert(!MI.memoperands_empty() &&
9349 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9350
9351 FrameIndex = Addr->getIndex();
9352 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
9353}
9354
9355 Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
9356                                         int &FrameIndex) const {
9357 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9358 assert(Addr && Addr->isFI());
9359 FrameIndex = Addr->getIndex();
9360 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
9361}
9362
9363 Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9364                                           int &FrameIndex) const {
9365 if (!MI.mayLoad())
9366 return Register();
9367
9368 if (isMUBUF(MI) || isVGPRSpill(MI))
9369 return isStackAccess(MI, FrameIndex);
9370
9371 if (isSGPRSpill(MI))
9372 return isSGPRStackAccess(MI, FrameIndex);
9373
9374 return Register();
9375}
9376
9377 Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9378                                          int &FrameIndex) const {
9379 if (!MI.mayStore())
9380 return Register();
9381
9382 if (isMUBUF(MI) || isVGPRSpill(MI))
9383 return isStackAccess(MI, FrameIndex);
9384
9385 if (isSGPRSpill(MI))
9386 return isSGPRStackAccess(MI, FrameIndex);
9387
9388 return Register();
9389}
9390
9391 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9392  unsigned Size = 0;
9393  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9394  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9395 while (++I != E && I->isInsideBundle()) {
9396 assert(!I->isBundle() && "No nested bundle!");
9397    Size += getInstSizeInBytes(*I);
9398  }
9399
9400 return Size;
9401}
9402
9403 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9404  unsigned Opc = MI.getOpcode();
9405  const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
9406  unsigned DescSize = Desc.getSize();
9407
9408 // If we have a definitive size, we can use it. Otherwise we need to inspect
9409 // the operands to know the size.
9410 if (isFixedSize(MI)) {
9411 unsigned Size = DescSize;
9412
9413 // If we hit the buggy offset, an extra nop will be inserted in MC so
9414 // estimate the worst case.
9415 if (MI.isBranch() && ST.hasOffset3fBug())
9416 Size += 4;
9417
9418 return Size;
9419 }
9420
9421 // Instructions may have a 32-bit literal encoded after them. Check
9422 // operands that could ever be literals.
9423 if (isVALU(MI) || isSALU(MI)) {
9424 if (isDPP(MI))
9425 return DescSize;
9426 bool HasLiteral = false;
9427 unsigned LiteralSize = 4;
9428 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9429 const MachineOperand &Op = MI.getOperand(I);
9430 const MCOperandInfo &OpInfo = Desc.operands()[I];
9431 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9432 HasLiteral = true;
9433 if (ST.has64BitLiterals()) {
9434 switch (OpInfo.OperandType) {
9435 default:
9436 break;
9438 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9439 LiteralSize = 8;
9440 break;
9442 if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
9443 LiteralSize = 8;
9444 break;
9445 }
9446 }
9447 break;
9448 }
9449 }
9450 return HasLiteral ? DescSize + LiteralSize : DescSize;
9451 }
9452
9453 // Check whether we have extra NSA words.
9454 if (isMIMG(MI)) {
9455 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9456 if (VAddr0Idx < 0)
9457 return 8;
9458
9459 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
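    // Illustrative example (not from the source): with five address operands,
    // RSrcIdx - VAddr0Idx == 5, so the size is 8 + 4 * ((5 + 2) / 4) = 12
    // bytes, i.e. one extra NSA dword holding the four addresses beyond
    // vaddr0.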
9460 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9461 }
9462
9463 switch (Opc) {
9464 case TargetOpcode::BUNDLE:
9465 return getInstBundleSize(MI);
9466 case TargetOpcode::INLINEASM:
9467 case TargetOpcode::INLINEASM_BR: {
9468 const MachineFunction *MF = MI.getParent()->getParent();
9469 const char *AsmStr = MI.getOperand(0).getSymbolName();
9470 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9471 }
9472 default:
9473 if (MI.isMetaInstruction())
9474 return 0;
9475
9476 // If D16 Pseudo inst, get correct MC code size
9477 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9478 if (D16Info) {
9479 // Assume d16_lo/hi inst are always in same size
9480 unsigned LoInstOpcode = D16Info->LoOp;
9481 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9482 DescSize = Desc.getSize();
9483 }
9484
9485 return DescSize;
9486 }
9487}
9488
9490 if (!isFLAT(MI))
9491 return false;
9492
9493 if (MI.memoperands_empty())
9494 return true;
9495
9496 for (const MachineMemOperand *MMO : MI.memoperands()) {
9497 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9498 return true;
9499 }
9500 return false;
9501}
9502
9505 static const std::pair<int, const char *> TargetIndices[] = {
9506 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9507 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9508 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9509 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9510 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9511 return ArrayRef(TargetIndices);
9512}
9513
9514/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
9515/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9518 const ScheduleDAG *DAG) const {
9519 return new GCNHazardRecognizer(DAG->MF);
9520}
9521
9522/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9523/// pass.
9526 return new GCNHazardRecognizer(MF);
9527}
9528
9529// Called during:
9530// - pre-RA scheduling and post-RA scheduling
9533 const ScheduleDAGMI *DAG) const {
9534 // Borrowed from Arm Target
9535 // We would like to restrict this hazard recognizer to only
9536 // post-RA scheduling; we can tell that we're post-RA because we don't
9537 // track VRegLiveness.
9538 if (!DAG->hasVRegLiveness())
9539 return new GCNHazardRecognizer(DAG->MF);
9541}
9542
9543std::pair<unsigned, unsigned>
9544 SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9545  return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9546}
9547
9550 static const std::pair<unsigned, const char *> TargetFlags[] = {
9551 {MO_GOTPCREL, "amdgpu-gotprel"},
9552 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9553 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9554 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9555 {MO_REL32_LO, "amdgpu-rel32-lo"},
9556 {MO_REL32_HI, "amdgpu-rel32-hi"},
9557 {MO_REL64, "amdgpu-rel64"},
9558 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9559 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9560 {MO_ABS64, "amdgpu-abs64"},
9561 };
9562
9563 return ArrayRef(TargetFlags);
9564}
9565
9568 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9569 {
9570 {MONoClobber, "amdgpu-noclobber"},
9571 {MOLastUse, "amdgpu-last-use"},
9572 };
9573
9574 return ArrayRef(TargetFlags);
9575}
9576
9578 const MachineFunction &MF) const {
9580 assert(SrcReg.isVirtual());
9581 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9582 return AMDGPU::WWM_COPY;
9583
9584 return AMDGPU::COPY;
9585}
9586
9588 Register Reg) const {
9589 // We need to handle instructions which may be inserted during register
9590 // allocation to handle the prolog. The initial prolog instruction may have
9591 // been separated from the start of the block by spills and copies inserted
9592 // for the prolog. However, the insertions for scalar registers can
9593 // always be placed at the BB top as they are independent of the exec mask
9594 // value.
9595 const MachineFunction *MF = MI.getParent()->getParent();
9596 bool IsNullOrVectorRegister = true;
9597 if (Reg) {
9598 const MachineRegisterInfo &MRI = MF->getRegInfo();
9599 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9600 }
9601
9602 uint16_t Opcode = MI.getOpcode();
9604 return IsNullOrVectorRegister &&
9605 (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
9606 (Opcode == AMDGPU::IMPLICIT_DEF &&
9607 MFI->isWWMReg(MI.getOperand(0).getReg())) ||
9608 (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
9609 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9610}
9611
9615 const DebugLoc &DL,
9616 Register DestReg) const {
9617 if (ST.hasAddNoCarry())
9618 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9619
9621 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9622 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9623
9624 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9625 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9626}
9627
9630 const DebugLoc &DL,
9631 Register DestReg,
9632 RegScavenger &RS) const {
9633 if (ST.hasAddNoCarry())
9634 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9635
9636 // If available, prefer to use vcc.
9637 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9638 ? Register(RI.getVCC())
9640 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9641 0, /* AllowSpill */ false);
9642
9643 // TODO: Users need to deal with this.
9644 if (!UnusedCarry.isValid())
9645 return MachineInstrBuilder();
9646
9647 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9648 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9649}
9650
9651bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9652 switch (Opcode) {
9653 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9654 case AMDGPU::SI_KILL_I1_TERMINATOR:
9655 return true;
9656 default:
9657 return false;
9658 }
9659}
9660
9662 switch (Opcode) {
9663 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9664 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9665 case AMDGPU::SI_KILL_I1_PSEUDO:
9666 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9667 default:
9668 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9669 }
9670}
9671
9672bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9673 return Imm <= getMaxMUBUFImmOffset(ST);
9674}
9675
9676 unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
9677  // The GFX12 field is a 24-bit signed byte offset; only its non-negative range is used here.
9678 const unsigned OffsetBits =
9679 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9680 return (1 << OffsetBits) - 1;
9681}
9682
9683 void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
9684  if (!ST.isWave32())
9685 return;
9686
9687 if (MI.isInlineAsm())
9688 return;
9689
9690 for (auto &Op : MI.implicit_operands()) {
9691 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9692 Op.setReg(AMDGPU::VCC_LO);
9693 }
9694}
9695
9696 bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
9697  if (!isSMRD(MI))
9698 return false;
9699
9700 // Check that it is using a buffer resource.
9701 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9702 if (Idx == -1) // e.g. s_memtime
9703 return false;
9704
9705 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
9706 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9707}
9708
9709// Given Imm, split it into the values to put into the SOffset and ImmOffset
9710// fields in an MUBUF instruction. Return false if it is not possible (due to a
9711// hardware bug needing a workaround).
9712//
9713// The required alignment ensures that individual address components remain
9714// aligned if they are aligned to begin with. It also ensures that additional
9715// offsets within the given alignment can be added to the resulting ImmOffset.
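// As an illustrative example (assuming a pre-GFX12 maximum immediate offset of
// 4095 and Align(4), not taken from the source): Imm = 4100 gives
// MaxImm = alignDown(4095, 4) = 4092, so ImmOffset becomes 4092 and SOffset
// becomes the inline constant 8.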
9716 bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
9717                                    uint32_t &ImmOffset, Align Alignment) const {
9718 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9719 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9720 uint32_t Overflow = 0;
9721
9722 if (Imm > MaxImm) {
9723 if (Imm <= MaxImm + 64) {
9724 // Use an SOffset inline constant for 4..64
9725 Overflow = Imm - MaxImm;
9726 Imm = MaxImm;
9727 } else {
9728 // Try to keep the same value in SOffset for adjacent loads, so that
9729 // the corresponding register contents can be re-used.
9730 //
9731 // Load values with all low-bits (except for alignment bits) set into
9732 // SOffset, so that a larger range of values can be covered using
9733 // s_movk_i32.
9734 //
9735 // Atomic operations fail to work correctly when individual address
9736 // components are unaligned, even if their sum is aligned.
9737 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9738 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9739 Imm = Low;
9740 Overflow = High - Alignment.value();
9741 }
9742 }
9743
9744 if (Overflow > 0) {
9745 // There is a hardware bug in SI and CI which prevents address clamping in
9746 // MUBUF instructions from working correctly with SOffsets. The immediate
9747 // offset is unaffected.
9749 return false;
9750
9751 // It is not possible to set immediate in SOffset field on some targets.
9752 if (ST.hasRestrictedSOffset())
9753 return false;
9754 }
9755
9756 ImmOffset = Imm;
9757 SOffset = Overflow;
9758 return true;
9759}
9760
9761// Depending on the used address space and instructions, some immediate offsets
9762// are allowed and some are not.
9763 // Pre-GFX12, flat instruction offsets can only be non-negative; global and
9764// scratch instruction offsets can also be negative. On GFX12, offsets can be
9765// negative for all variants.
9766//
9767// There are several bugs related to these offsets:
9768// On gfx10.1, flat instructions that go into the global address space cannot
9769// use an offset.
9770//
9771// For scratch instructions, the address can be either an SGPR or a VGPR.
9772// The following offsets can be used, depending on the architecture (x means
9773// cannot be used):
9774// +----------------------------+------+------+
9775// | Address-Mode | SGPR | VGPR |
9776// +----------------------------+------+------+
9777// | gfx9 | | |
9778// | negative, 4-aligned offset | x | ok |
9779// | negative, unaligned offset | x | ok |
9780// +----------------------------+------+------+
9781// | gfx10 | | |
9782// | negative, 4-aligned offset | ok | ok |
9783// | negative, unaligned offset | ok | x |
9784// +----------------------------+------+------+
9785// | gfx10.3 | | |
9786// | negative, 4-aligned offset | ok | ok |
9787// | negative, unaligned offset | ok | ok |
9788// +----------------------------+------+------+
9789//
9790// This function ignores the addressing mode, so if an offset cannot be used in
9791// one addressing mode, it is considered illegal.
9792bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9793 uint64_t FlatVariant) const {
9794 // TODO: Should 0 be special cased?
9795 if (!ST.hasFlatInstOffsets())
9796 return false;
9797
9798 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9799 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9800 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9801 return false;
9802
9804 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9805 (Offset % 4) != 0) {
9806 return false;
9807 }
9808
9809 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9810 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9811 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9812}
9813
9814// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
9815std::pair<int64_t, int64_t>
9816SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9817 uint64_t FlatVariant) const {
9818 int64_t RemainderOffset = COffsetVal;
9819 int64_t ImmField = 0;
9820
9821 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9822 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9823
9824 if (AllowNegative) {
9825 // Use signed division by a power of two to truncate towards 0.
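    // Illustrative example (not from the source): with NumBits = 12, D = 4096
    // and COffsetVal = -5000, RemainderOffset = (-5000 / 4096) * 4096 = -4096
    // and ImmField = -5000 - (-4096) = -904.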
9826 int64_t D = 1LL << NumBits;
9827 RemainderOffset = (COffsetVal / D) * D;
9828 ImmField = COffsetVal - RemainderOffset;
9829
9831 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9832 (ImmField % 4) != 0) {
9833 // Make ImmField a multiple of 4
9834 RemainderOffset += ImmField % 4;
9835 ImmField -= ImmField % 4;
9836 }
9837 } else if (COffsetVal >= 0) {
9838 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9839 RemainderOffset = COffsetVal - ImmField;
9840 }
9841
9842 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9843 assert(RemainderOffset + ImmField == COffsetVal);
9844 return {ImmField, RemainderOffset};
9845}
9846
9848 if (ST.hasNegativeScratchOffsetBug() &&
9849 FlatVariant == SIInstrFlags::FlatScratch)
9850 return false;
9851
9852 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9853}
9854
9855static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9856 switch (ST.getGeneration()) {
9857 default:
9858 break;
9861 return SIEncodingFamily::SI;
9864 return SIEncodingFamily::VI;
9870 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
9872 }
9873 llvm_unreachable("Unknown subtarget generation!");
9874}
9875
9876bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9877 switch(MCOp) {
9878 // These opcodes use indirect register addressing so
9879 // they need special handling by codegen (currently missing).
9880 // Therefore it is too risky to allow these opcodes
9881 // to be selected by dpp combiner or sdwa peepholer.
9882 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9883 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9884 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9885 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9886 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9887 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9888 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9889 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9890 return true;
9891 default:
9892 return false;
9893 }
9894}
9895
9896#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
9897 case OPCODE##_dpp: \
9898 case OPCODE##_e32: \
9899 case OPCODE##_e64: \
9900 case OPCODE##_e64_dpp: \
9901 case OPCODE##_sdwa:
9902
9903static bool isRenamedInGFX9(int Opcode) {
9904 switch (Opcode) {
9905 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
9906 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
9907 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
9908 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
9909 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
9910 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
9911 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
9912 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
9913 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
9914 //
9915 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
9916 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
9917 case AMDGPU::V_FMA_F16_gfx9_e64:
9918 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
9919 case AMDGPU::V_INTERP_P2_F16:
9920 case AMDGPU::V_MAD_F16_e64:
9921 case AMDGPU::V_MAD_U16_e64:
9922 case AMDGPU::V_MAD_I16_e64:
9923 return true;
9924 default:
9925 return false;
9926 }
9927}
9928
9929int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9930 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9931
9932 unsigned Gen = subtargetEncodingFamily(ST);
9933
9936
9937 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9938 // subtarget has UnpackedD16VMem feature.
9939 // TODO: remove this when we discard GFX80 encoding.
9940 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
9941    Gen = SIEncodingFamily::GFX80;
9942
9943 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
9944 switch (ST.getGeneration()) {
9945 default:
9947 break;
9950 break;
9953 break;
9954 }
9955 }
9956
9957 if (isMAI(Opcode)) {
9958 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
9959 if (MFMAOp != -1)
9960 Opcode = MFMAOp;
9961 }
9962
9963 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
9964
9965 if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
9967
9968 // -1 means that Opcode is already a native instruction.
9969 if (MCOp == -1)
9970 return Opcode;
9971
9972 if (ST.hasGFX90AInsts()) {
9973 uint16_t NMCOp = (uint16_t)-1;
9974 if (ST.hasGFX940Insts())
9976 if (NMCOp == (uint16_t)-1)
9978 if (NMCOp == (uint16_t)-1)
9980 if (NMCOp != (uint16_t)-1)
9981 MCOp = NMCOp;
9982 }
9983
9984 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
9985 // no encoding in the given subtarget generation.
9986 if (MCOp == (uint16_t)-1)
9987 return -1;
9988
9989 if (isAsmOnlyOpcode(MCOp))
9990 return -1;
9991
9992 return MCOp;
9993}
9994
9995static
9996 TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
9997  assert(RegOpnd.isReg());
9998 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
9999 getRegSubRegPair(RegOpnd);
10000}
10001
10004 assert(MI.isRegSequence());
10005 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10006 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10007 auto &RegOp = MI.getOperand(1 + 2 * I);
10008 return getRegOrUndef(RegOp);
10009 }
10011}
10012
10013// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10014// Following a subreg of reg:subreg isn't supported
10015 static bool followSubRegDef(MachineInstr &MI,
10016                             TargetInstrInfo::RegSubRegPair &RSR) {
10017  if (!RSR.SubReg)
10018 return false;
10019 switch (MI.getOpcode()) {
10020 default: break;
10021 case AMDGPU::REG_SEQUENCE:
10022 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10023 return true;
10024 // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
10025 case AMDGPU::INSERT_SUBREG:
10026 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10027 // inserted the subreg we're looking for
10028 RSR = getRegOrUndef(MI.getOperand(2));
10029 else { // the subreg in the rest of the reg
10030 auto R1 = getRegOrUndef(MI.getOperand(1));
10031 if (R1.SubReg) // subreg of subreg isn't supported
10032 return false;
10033 RSR.Reg = R1.Reg;
10034 }
10035 return true;
10036 }
10037 return false;
10038}
10039
10042 assert(MRI.isSSA());
10043 if (!P.Reg.isVirtual())
10044 return nullptr;
10045
10046 auto RSR = P;
10047 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10048 while (auto *MI = DefInst) {
10049 DefInst = nullptr;
10050 switch (MI->getOpcode()) {
10051 case AMDGPU::COPY:
10052 case AMDGPU::V_MOV_B32_e32: {
10053 auto &Op1 = MI->getOperand(1);
10054 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10055 if (Op1.isUndef())
10056 return nullptr;
10057 RSR = getRegSubRegPair(Op1);
10058 DefInst = MRI.getVRegDef(RSR.Reg);
10059 }
10060 break;
10061 }
10062 default:
10063 if (followSubRegDef(*MI, RSR)) {
10064 if (!RSR.Reg)
10065 return nullptr;
10066 DefInst = MRI.getVRegDef(RSR.Reg);
10067 }
10068 }
10069 if (!DefInst)
10070 return MI;
10071 }
10072 return nullptr;
10073}
10074
10075 bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
10076                                       Register VReg,
10077 const MachineInstr &DefMI,
10078 const MachineInstr &UseMI) {
10079 assert(MRI.isSSA() && "Must be run on SSA");
10080
10081 auto *TRI = MRI.getTargetRegisterInfo();
10082 auto *DefBB = DefMI.getParent();
10083
10084 // Don't bother searching between blocks, although it is possible this block
10085 // doesn't modify exec.
10086 if (UseMI.getParent() != DefBB)
10087 return true;
10088
10089 const int MaxInstScan = 20;
10090 int NumInst = 0;
10091
10092 // Stop scan at the use.
10093 auto E = UseMI.getIterator();
10094 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10095 if (I->isDebugInstr())
10096 continue;
10097
10098 if (++NumInst > MaxInstScan)
10099 return true;
10100
10101 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10102 return true;
10103 }
10104
10105 return false;
10106}
10107
10108 bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
10109                                          Register VReg,
10110 const MachineInstr &DefMI) {
10111 assert(MRI.isSSA() && "Must be run on SSA");
10112
10113 auto *TRI = MRI.getTargetRegisterInfo();
10114 auto *DefBB = DefMI.getParent();
10115
10116 const int MaxUseScan = 10;
10117 int NumUse = 0;
10118
10119 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10120 auto &UseInst = *Use.getParent();
10121 // Don't bother searching between blocks, although it is possible this block
10122 // doesn't modify exec.
10123 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10124 return true;
10125
10126 if (++NumUse > MaxUseScan)
10127 return true;
10128 }
10129
10130 if (NumUse == 0)
10131 return false;
10132
10133 const int MaxInstScan = 20;
10134 int NumInst = 0;
10135
10136 // Stop scan when we have seen all the uses.
10137 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10138 assert(I != DefBB->end());
10139
10140 if (I->isDebugInstr())
10141 continue;
10142
10143 if (++NumInst > MaxInstScan)
10144 return true;
10145
10146 for (const MachineOperand &Op : I->operands()) {
10147 // We don't check reg masks here as they're used only on calls:
10148 // 1. EXEC is only considered const within one BB
10149 // 2. Call should be a terminator instruction if present in a BB
10150
10151 if (!Op.isReg())
10152 continue;
10153
10154 Register Reg = Op.getReg();
10155 if (Op.isUse()) {
10156 if (Reg == VReg && --NumUse == 0)
10157 return false;
10158 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10159 return true;
10160 }
10161 }
10162}
10163
10166 const DebugLoc &DL, Register Src, Register Dst) const {
10167 auto Cur = MBB.begin();
10168 if (Cur != MBB.end())
10169 do {
10170 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10171 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10172 ++Cur;
10173 } while (Cur != MBB.end() && Cur != LastPHIIt);
10174
10175 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10176 Dst);
10177}
10178
10181 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10182 if (InsPt != MBB.end() &&
10183 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10184 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10185 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10186 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10187 InsPt++;
10188 return BuildMI(MBB, InsPt, DL,
10189 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
10190 : AMDGPU::S_MOV_B64_term),
10191 Dst)
10192 .addReg(Src, 0, SrcSubReg)
10193 .addReg(AMDGPU::EXEC, RegState::Implicit);
10194 }
10195 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10196 Dst);
10197}
10198
10199bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10200
10203 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10204 VirtRegMap *VRM) const {
10205 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10206 //
10207 // %0:sreg_32 = COPY $m0
10208 //
10209 // We explicitly chose SReg_32 for the virtual register so such a copy might
10210 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10211 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10212 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10213 // TargetInstrInfo::foldMemoryOperand() is going to try.
10214 // A similar issue also exists with spilling and reloading $exec registers.
10215 //
10216 // To prevent that, constrain the %0 register class here.
10217 if (isFullCopyInstr(MI)) {
10218 Register DstReg = MI.getOperand(0).getReg();
10219 Register SrcReg = MI.getOperand(1).getReg();
10220 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10221 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10223 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10224 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10225 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10226 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10227 return nullptr;
10228 }
10229 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10230 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10231 return nullptr;
10232 }
10233 }
10234 }
10235
10236 return nullptr;
10237}
10238
10240 const MachineInstr &MI,
10241 unsigned *PredCost) const {
10242 if (MI.isBundle()) {
10244 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10245 unsigned Lat = 0, Count = 0;
10246 for (++I; I != E && I->isBundledWithPred(); ++I) {
10247 ++Count;
10248 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10249 }
10250 return Lat + Count - 1;
10251 }
10252
10253 return SchedModel.computeInstrLatency(&MI);
10254}
10255
10256InstructionUniformity
10257SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
10258  const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10259 unsigned opcode = MI.getOpcode();
10260
10261 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10262 Register Dst = MI.getOperand(0).getReg();
10263 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10264 : MI.getOperand(1).getReg();
10265 LLT DstTy = MRI.getType(Dst);
10266 LLT SrcTy = MRI.getType(Src);
10267 unsigned DstAS = DstTy.getAddressSpace();
10268 unsigned SrcAS = SrcTy.getAddressSpace();
10269 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10270 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10271               ST.hasGloballyAddressableScratch()
10272               ? InstructionUniformity::NeverUniform
10273               : InstructionUniformity::Default;
10274  };
10275
10276  // If the target supports globally addressable scratch, the mapping from
10277  // scratch memory to the flat aperture changes, and therefore an address
10278  // space cast from private to flat is no longer uniform.
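  // Hedged illustration (an editorial assumption, not upstream text): with
  // globally addressable scratch, the flat address computed for a private
  // pointer depends on which lane performs the cast, so two lanes casting the
  // same private offset may see different flat addresses; hence the cast is
  // treated as NeverUniform below rather than Default.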
10279 if (opcode == TargetOpcode::G_ADDRSPACE_CAST)
10280 return HandleAddrSpaceCast(MI);
10281
10282 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10283 auto IID = GI->getIntrinsicID();
10284    if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
10285      return InstructionUniformity::NeverUniform;
10286    if (AMDGPU::isIntrinsicAlwaysUniform(IID))
10287      return InstructionUniformity::AlwaysUniform;
10288
10289 switch (IID) {
10290 case Intrinsic::amdgcn_addrspacecast_nonnull:
10291 return HandleAddrSpaceCast(MI);
10292 case Intrinsic::amdgcn_if:
10293 case Intrinsic::amdgcn_else:
10294 // FIXME: Uniform if second result
10295 break;
10296 }
10297
10298    return InstructionUniformity::Default;
10299  }
10300
10301 // Loads from the private and flat address spaces are divergent, because
10302 // threads can execute the load instruction with the same inputs and get
10303 // different results.
10304 //
10305 // All other loads are not divergent, because if threads issue loads with the
10306 // same arguments, they will always get the same result.
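  // Concrete reading of the check below (illustrative, not upstream text): a
  // G_LOAD whose only memory operand is in the global address space keeps the
  // Default uniformity, while a G_LOAD with no memory operands, or with a
  // private or flat memory operand, is conservatively reported NeverUniform.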
10307 if (opcode == AMDGPU::G_LOAD) {
10308 if (MI.memoperands_empty())
10309 return InstructionUniformity::NeverUniform; // conservative assumption
10310
10311 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10312 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10313 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10314 })) {
10315 // At least one MMO in a non-global address space.
10316      return InstructionUniformity::NeverUniform;
10317    }
10318    return InstructionUniformity::Default;
10319  }
10320
10321 if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
10322 opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10323 opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10324 AMDGPU::isGenericAtomic(opcode)) {
10325    return InstructionUniformity::NeverUniform;
10326  }
10327  return InstructionUniformity::Default;
10328}
10329
10330InstructionUniformity
10331SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
10332
10333 if (isNeverUniform(MI))
10334    return InstructionUniformity::NeverUniform;
10335
10336 unsigned opcode = MI.getOpcode();
10337 if (opcode == AMDGPU::V_READLANE_B32 ||
10338 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10339 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10340    return InstructionUniformity::AlwaysUniform;
10341
10342 if (isCopyInstr(MI)) {
10343 const MachineOperand &srcOp = MI.getOperand(1);
10344 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10345 const TargetRegisterClass *regClass =
10346 RI.getPhysRegBaseClass(srcOp.getReg());
10347      return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10348                                      : InstructionUniformity::NeverUniform;
10349    }
10350    return InstructionUniformity::Default;
10351  }
10352
10353 // GMIR handling
10354 if (MI.isPreISelOpcode())
10355    return SIInstrInfo::getGenericInstructionUniformity(MI);
10356
10357  // Atomics are divergent because they are executed sequentially: when an
10358  // atomic operation refers to the same address in each thread, each thread
10359  // after the first sees the value written by the previous thread as the
10360  // original value.
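  // For example (illustrative, not upstream text): if every active lane
  // performs an atomic add of 1 to the same address that starts at 0, the
  // returned "original" values are 0, 1, 2, ... distributed across the lanes,
  // so the result is divergent even though all inputs are uniform.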
10361
10362 if (isAtomic(MI))
10363    return InstructionUniformity::NeverUniform;
10364
10365 // Loads from the private and flat address spaces are divergent, because
10366 // threads can execute the load instruction with the same inputs and get
10367 // different results.
10368 if (isFLAT(MI) && MI.mayLoad()) {
10369 if (MI.memoperands_empty())
10370 return InstructionUniformity::NeverUniform; // conservative assumption
10371
10372 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10373 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10374 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10375 })) {
10376 // At least one MMO in a non-global address space.
10377      return InstructionUniformity::NeverUniform;
10378    }
10379
10380    return InstructionUniformity::Default;
10381  }
10382
10383 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
10384 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10385
10386 // FIXME: It's conceptually broken to report this for an instruction, and not
10387 // a specific def operand. For inline asm in particular, there could be mixed
10388 // uniform and divergent results.
10389 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10390 const MachineOperand &SrcOp = MI.getOperand(I);
10391 if (!SrcOp.isReg())
10392 continue;
10393
10394 Register Reg = SrcOp.getReg();
10395 if (!Reg || !SrcOp.readsReg())
10396 continue;
10397
10398 // If RegBank is null, this is unassigned or an unallocatable special
10399 // register, which are all scalars.
10400 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10401 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10402      return InstructionUniformity::NeverUniform;
10403  }
10404
10405  // TODO: Uniformity check conditions above can be rearranged for more
10406  // readability.
10407
10408 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10409 // currently turned into no-op COPYs by SelectionDAG ISel and are
10410 // therefore no longer recognizable.
10411
10411
10412  return InstructionUniformity::Default;
10413}
10414
10415unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
10416  switch (MF.getFunction().getCallingConv()) {
10417  case CallingConv::AMDGPU_PS:
10418    return 1;
10419  case CallingConv::AMDGPU_VS:
10420    return 2;
10421  case CallingConv::AMDGPU_GS:
10422    return 3;
10423  case CallingConv::AMDGPU_HS:
10424  case CallingConv::AMDGPU_LS:
10425  case CallingConv::AMDGPU_ES: {
10426    const Function &F = MF.getFunction();
10427    F.getContext().diagnose(DiagnosticInfoUnsupported(
10428        F, "ds_ordered_count unsupported for this calling conv"));
10429    [[fallthrough]];
10430  }
10431  case CallingConv::AMDGPU_CS:
10432  case CallingConv::AMDGPU_KERNEL:
10433  case CallingConv::C:
10434 case CallingConv::Fast:
10435 default:
10436 // Assume other calling conventions are various compute callable functions
10437 return 0;
10438 }
10439}
10440
10441bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
10442                                 Register &SrcReg2, int64_t &CmpMask,
10443 int64_t &CmpValue) const {
10444 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10445 return false;
10446
10447 switch (MI.getOpcode()) {
10448 default:
10449 break;
10450 case AMDGPU::S_CMP_EQ_U32:
10451 case AMDGPU::S_CMP_EQ_I32:
10452 case AMDGPU::S_CMP_LG_U32:
10453 case AMDGPU::S_CMP_LG_I32:
10454 case AMDGPU::S_CMP_LT_U32:
10455 case AMDGPU::S_CMP_LT_I32:
10456 case AMDGPU::S_CMP_GT_U32:
10457 case AMDGPU::S_CMP_GT_I32:
10458 case AMDGPU::S_CMP_LE_U32:
10459 case AMDGPU::S_CMP_LE_I32:
10460 case AMDGPU::S_CMP_GE_U32:
10461 case AMDGPU::S_CMP_GE_I32:
10462 case AMDGPU::S_CMP_EQ_U64:
10463 case AMDGPU::S_CMP_LG_U64:
10464 SrcReg = MI.getOperand(0).getReg();
10465 if (MI.getOperand(1).isReg()) {
10466 if (MI.getOperand(1).getSubReg())
10467 return false;
10468 SrcReg2 = MI.getOperand(1).getReg();
10469 CmpValue = 0;
10470 } else if (MI.getOperand(1).isImm()) {
10471 SrcReg2 = Register();
10472 CmpValue = MI.getOperand(1).getImm();
10473 } else {
10474 return false;
10475 }
10476 CmpMask = ~0;
10477 return true;
10478 case AMDGPU::S_CMPK_EQ_U32:
10479 case AMDGPU::S_CMPK_EQ_I32:
10480 case AMDGPU::S_CMPK_LG_U32:
10481 case AMDGPU::S_CMPK_LG_I32:
10482 case AMDGPU::S_CMPK_LT_U32:
10483 case AMDGPU::S_CMPK_LT_I32:
10484 case AMDGPU::S_CMPK_GT_U32:
10485 case AMDGPU::S_CMPK_GT_I32:
10486 case AMDGPU::S_CMPK_LE_U32:
10487 case AMDGPU::S_CMPK_LE_I32:
10488 case AMDGPU::S_CMPK_GE_U32:
10489 case AMDGPU::S_CMPK_GE_I32:
10490 SrcReg = MI.getOperand(0).getReg();
10491 SrcReg2 = Register();
10492 CmpValue = MI.getOperand(1).getImm();
10493 CmpMask = ~0;
10494 return true;
10495 }
10496
10497 return false;
10498}
10499
10500bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
10501                                       Register SrcReg2, int64_t CmpMask,
10502 int64_t CmpValue,
10503 const MachineRegisterInfo *MRI) const {
10504 if (!SrcReg || SrcReg.isPhysical())
10505 return false;
10506
10507 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
10508 return false;
10509
10510 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
10511 this](int64_t ExpectedValue, unsigned SrcSize,
10512 bool IsReversible, bool IsSigned) -> bool {
10513 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10514 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10515 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10516 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10517 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
10518 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10519 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10520 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10521 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10522 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
10523 //
10524 // Signed ge/gt are not used for the sign bit.
10525 //
10526 // If result of the AND is unused except in the compare:
10527 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
10528 //
10529 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10530 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10531 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
10532 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10533 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10534 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
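    //
    // Concrete instance (an illustrative sketch, not from the original
    // comment): with n = 4,
    //
    //   %1:sreg_32 = S_AND_B32 %0, 16, implicit-def $scc
    //   S_CMP_LG_U32 %1, 0, implicit-def $scc
    //
    // keeps only the S_AND_B32 with its SCC def marked live, and if %1 has no
    // other users the AND is further rewritten to S_BITCMP1_B32 %0, 4.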
10535
10536 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
10537 if (!Def || Def->getParent() != CmpInstr.getParent())
10538 return false;
10539
10540 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
10541 Def->getOpcode() != AMDGPU::S_AND_B64)
10542 return false;
10543
10544 int64_t Mask;
10545 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
10546 if (MO->isImm())
10547 Mask = MO->getImm();
10548 else if (!getFoldableImm(MO, Mask))
10549 return false;
10550 Mask &= maxUIntN(SrcSize);
10551 return isPowerOf2_64(Mask);
10552 };
10553
10554 MachineOperand *SrcOp = &Def->getOperand(1);
10555 if (isMask(SrcOp))
10556 SrcOp = &Def->getOperand(2);
10557 else if (isMask(&Def->getOperand(2)))
10558 SrcOp = &Def->getOperand(1);
10559 else
10560 return false;
10561
10562    // A valid Mask is required to have a single bit set, hence a non-zero and
10563    // power-of-two value. This verifies that we will not do a 64-bit shift below.
10564 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
10565 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
10566 if (IsSigned && BitNo == SrcSize - 1)
10567 return false;
10568
10569 ExpectedValue <<= BitNo;
10570
10571 bool IsReversedCC = false;
10572 if (CmpValue != ExpectedValue) {
10573 if (!IsReversible)
10574 return false;
10575 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
10576 if (!IsReversedCC)
10577 return false;
10578 }
10579
10580 Register DefReg = Def->getOperand(0).getReg();
10581 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
10582 return false;
10583
10584 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
10585 I != E; ++I) {
10586 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
10587 I->killsRegister(AMDGPU::SCC, &RI))
10588 return false;
10589 }
10590
10591 MachineOperand *SccDef =
10592 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
10593 SccDef->setIsDead(false);
10594 CmpInstr.eraseFromParent();
10595
10596 if (!MRI->use_nodbg_empty(DefReg)) {
10597 assert(!IsReversedCC);
10598 return true;
10599 }
10600
10601  // Replace an AND whose result is unused with an S_BITCMP.
10602 MachineBasicBlock *MBB = Def->getParent();
10603
10604 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
10605 : AMDGPU::S_BITCMP1_B32
10606 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
10607 : AMDGPU::S_BITCMP1_B64;
10608
10609 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
10610 .add(*SrcOp)
10611 .addImm(BitNo);
10612 Def->eraseFromParent();
10613
10614 return true;
10615 };
10616
10617 switch (CmpInstr.getOpcode()) {
10618 default:
10619 break;
10620 case AMDGPU::S_CMP_EQ_U32:
10621 case AMDGPU::S_CMP_EQ_I32:
10622 case AMDGPU::S_CMPK_EQ_U32:
10623 case AMDGPU::S_CMPK_EQ_I32:
10624 return optimizeCmpAnd(1, 32, true, false);
10625 case AMDGPU::S_CMP_GE_U32:
10626 case AMDGPU::S_CMPK_GE_U32:
10627 return optimizeCmpAnd(1, 32, false, false);
10628 case AMDGPU::S_CMP_GE_I32:
10629 case AMDGPU::S_CMPK_GE_I32:
10630 return optimizeCmpAnd(1, 32, false, true);
10631 case AMDGPU::S_CMP_EQ_U64:
10632 return optimizeCmpAnd(1, 64, true, false);
10633 case AMDGPU::S_CMP_LG_U32:
10634 case AMDGPU::S_CMP_LG_I32:
10635 case AMDGPU::S_CMPK_LG_U32:
10636 case AMDGPU::S_CMPK_LG_I32:
10637 return optimizeCmpAnd(0, 32, true, false);
10638 case AMDGPU::S_CMP_GT_U32:
10639 case AMDGPU::S_CMPK_GT_U32:
10640 return optimizeCmpAnd(0, 32, false, false);
10641 case AMDGPU::S_CMP_GT_I32:
10642 case AMDGPU::S_CMPK_GT_I32:
10643 return optimizeCmpAnd(0, 32, false, true);
10644 case AMDGPU::S_CMP_LG_U64:
10645 return optimizeCmpAnd(0, 64, true, false);
10646 }
10647
10648 return false;
10649}
10650
10651void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
10652                                            AMDGPU::OpName OpName) const {
10653 if (!ST.needsAlignedVGPRs())
10654 return;
10655
10656 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
10657 if (OpNo < 0)
10658 return;
10659 MachineOperand &Op = MI.getOperand(OpNo);
10660 if (getOpSize(MI, OpNo) > 4)
10661 return;
10662
10663 // Add implicit aligned super-reg to force alignment on the data operand.
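  // Rough shape of the rewrite (an illustrative sketch, not upstream text):
  // a 32-bit VGPR data operand %data becomes
  //
  //   %undef:vgpr_32 = IMPLICIT_DEF
  //   %vr:vreg_64_align2 = REG_SEQUENCE %data, %subreg.sub0, %undef, %subreg.sub1
  //
  // and the instruction then reads %vr.sub0 with an extra implicit use of %vr,
  // forcing the allocator to place the data at an even-aligned VGPR pair.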
10664 const DebugLoc &DL = MI.getDebugLoc();
10665 MachineBasicBlock *BB = MI.getParent();
10666  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
10667  Register DataReg = Op.getReg();
10668 bool IsAGPR = RI.isAGPR(MRI, DataReg);
10669 Register Undef = MRI.createVirtualRegister(
10670 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
10671 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
10672 Register NewVR =
10673 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
10674 : &AMDGPU::VReg_64_Align2RegClass);
10675 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
10676 .addReg(DataReg, 0, Op.getSubReg())
10677 .addImm(AMDGPU::sub0)
10678 .addReg(Undef)
10679 .addImm(AMDGPU::sub1);
10680 Op.setReg(NewVR);
10681 Op.setSubReg(AMDGPU::sub0);
10682 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
10683}
10684
10685bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
10686  if (isIGLP(*MI))
10687    return false;
10688
10689  return TargetInstrInfo::isGlobalMemoryObject(MI);
10690}
10691
10692bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
10693  if (!isWMMA(MI) && !isSWMMAC(MI))
10694 return false;
10695
10696 if (AMDGPU::isGFX1250(ST))
10697 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
10698
10699 return true;
10700}
10701
10702bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
10703  unsigned Opcode = MI.getOpcode();
10704
10705 if (AMDGPU::isGFX12Plus(ST))
10706 return isDOT(MI) || isXDLWMMA(MI);
10707
10708 if (!isMAI(MI) || isDGEMM(Opcode) ||
10709 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
10710 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
10711 return false;
10712
10713 if (!ST.hasGFX940Insts())
10714 return true;
10715
10716 return AMDGPU::getMAIIsGFX940XDL(Opcode);
10717}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register const TargetRegisterInfo * TRI
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MCInstrDesc &TID, unsigned RCID, bool IsAllocatable)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
Definition: SIInstrInfo.cpp:83
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
Definition: SIInstrInfo.cpp:74
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:480
#define LLVM_DEBUG(...)
Definition: Debug.h:119
bool hasBF16PackedInsts() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool has16BitInsts() const
bool hasInv2PiInlineImm() const
Class for arbitrary precision integers.
Definition: APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1562
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:150
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:142
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:359
bool useVGPRIndexMode() const
bool hasSDWAOmod() const
Definition: GCNSubtarget.h:801
bool hasFlatGVSMode() const
bool hasA16() const
bool hasSDWAScalar() const
Definition: GCNSubtarget.h:805
bool hasScalarCompareEq64() const
bool hasOnlyRevVALUShifts() const
Definition: GCNSubtarget.h:435
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:679
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:821
bool hasMAIInsts() const
Definition: GCNSubtarget.h:879
bool hasFmaakFmamkF64Insts() const
bool hasScaleOffset() const
bool hasMFMAInlineLiteralBug() const
bool hasNegativeScratchOffsetBug() const
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasVALUMaskWriteHazard() const
bool hasGFX1250Insts() const
bool hasPkMovB32() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasR128A16() const
bool hasOffset3fBug() const
bool hasGetPCZeroExtension() const
bool hasAddPC64Inst() const
bool hasGloballyAddressableScratch() const
const AMDGPURegisterBankInfo * getRegBankInfo() const override
Definition: GCNSubtarget.h:343
bool has64BitLiterals() const
bool hasSDWAOutModsVOPC() const
Definition: GCNSubtarget.h:817
bool hasRestrictedSOffset() const
bool hasFlatSegmentOffsetBug() const
Definition: GCNSubtarget.h:736
bool hasGFX940Insts() const
bool hasSDWASdst() const
Definition: GCNSubtarget.h:809
bool hasVALUReadSGPRHazard() const
bool hasMovB64() const
bool isWave32() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:384
bool hasNegativeUnalignedScratchOffsetBug() const
bool hasG16() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasNoF16PseudoScalarTransInlineConstants() const
bool hasVectorMulU64() const
Generation getGeneration() const
Definition: GCNSubtarget.h:357
bool hasVOP3Literal() const
Definition: GCNSubtarget.h:996
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:788
bool hasAddr64() const
Definition: GCNSubtarget.h:425
bool isWave64() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:780
bool hasGDS() const
bool hasPartialNSAEncoding() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
A possibly irreducible generalization of a Loop.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
Definition: LowLevelType.h:271
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
Definition: LiveInterval.h:690
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
Definition: LiveInterval.h:158
LLVM_ABI iterator find(SlotIndex Pos)
find - Return an iterator pointing to the first segment that ends after Pos, or end().
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:199
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition: MCInstrDesc.h:238
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:240
bool mayStore() const
Return true if this instruction could possibly modify memory.
Definition: MCInstrDesc.h:446
bool mayLoad() const
Return true if this instruction could possibly read memory.
Definition: MCInstrDesc.h:440
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
Definition: MCInstrDesc.h:249
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
Definition: MCInstrDesc.h:607
unsigned short Opcode
Definition: MCInstrDesc.h:206
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
Definition: MCInstrDesc.h:567
unsigned getOpcode() const
Return the opcode number for this descriptor.
Definition: MCInstrDesc.h:231
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition: MCInstrDesc.h:98
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition: MCInstrDesc.h:92
bool isGenericType() const
Definition: MCInstrDesc.h:119
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition: MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition: MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
unsigned pred_size() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
reverse_iterator rend()
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< iterator > terminators()
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
Definition: MachineInstr.h:72
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:587
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:359
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
unsigned getNumOperands() const
Retuns the total number of operands.
Definition: MachineInstr.h:590
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
Definition: MachineInstr.h:702
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
Definition: MachineInstr.h:813
mop_range explicit_operands()
Definition: MachineInstr.h:696
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:798
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:780
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:511
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:595
uint32_t getFlags() const
Return the MI flags bitvector.
Definition: MachineInstr.h:404
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isImplicit() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
void enterBasicBlockEnd(MachineBasicBlock &MBB)
Start tracking liveness from the end of basic block MBB.
bool isRegUsed(Register Reg, bool includeReserved=true) const
Return if a specific register is currently used.
void setRegUsed(Register Reg, LaneBitmask LaneMask=LaneBitmask::getAll())
Tell the scavenger a register is used.
void backward()
Update internal register state and move MBB iterator backwards.
void enterBasicBlock(MachineBasicBlock &MBB)
Start tracking liveness from the begin of basic block MBB.
Register scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill=true)
Make a register of the specific register class available from the current position backwards to the p...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
Definition: RegisterBank.h:29
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:46
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition: Register.h:102
constexpr bool isValid() const
Definition: Register.h:107
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:74
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:78
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:586
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
Definition: SIInstrInfo.h:982
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given.
Definition: SIInstrInfo.h:1258
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:504
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
Definition: SIInstrInfo.h:1023
static bool isFLATScratch(const MachineInstr &MI)
Definition: SIInstrInfo.h:668
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instructions opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:576
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool mayAccessScratchThroughFlat(const MachineInstr &MI) const
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:568
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
Definition: SIInstrInfo.h:889
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:701
static bool isSALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:440
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:528
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:544
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
Definition: SIInstrInfo.h:636
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
Definition: SIInstrInfo.h:857
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
Definition: SIInstrInfo.h:873
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:552
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
static bool isVIMAGE(const MachineInstr &MI)
Definition: SIInstrInfo.h:620
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:480
static bool isGWS(const MachineInstr &MI)
Definition: SIInstrInfo.h:602
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override
static bool isFLATGlobal(const MachineInstr &MI)
Definition: SIInstrInfo.h:660
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
Definition: SIInstrInfo.h:628
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isImage(const MachineInstr &MI)
Definition: SIInstrInfo.h:456
static bool isSOPK(const MachineInstr &MI)
Definition: SIInstrInfo.h:496
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of a s_trap 2 instructions for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
Definition: SIInstrInfo.h:1036
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:560
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:650
static bool isVOP3(const MCInstrDesc &Desc)
Definition: SIInstrInfo.h:536
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
Definition: SIInstrInfo.h:1007
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool isDPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:812
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is a instruction that moves/copies value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
Definition: SIInstrInfo.h:844
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst to fix 16bit SALU to VALU lowering.
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
static bool isVGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:768
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
static bool isWWMRegSpillOpcode(uint16_t Opcode)
Definition: SIInstrInfo.h:800
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
Definition: SIInstrInfo.h:1410
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:612
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) const override
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
Definition: SIInstrInfo.h:733
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool sopkIsZext(unsigned Opcode)
Definition: SIInstrInfo.h:921
static bool isSGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:780
static bool isWMMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:861
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:488
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:644
static bool isVALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:448
bool isBarrier(unsigned Opcode) const
Definition: SIInstrInfo.h:997
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
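A hedged sketch of how a pseudo is resolved before encoding (the wrapper is hypothetical; a return value of -1 indicates the pseudo has no encoding on the current subtarget):

#include "SIInstrInfo.h"
using namespace llvm;
// Sketch: map a pseudo opcode to the subtarget-specific MC opcode.
// Callers must be prepared for a result of -1.
static int resolvePseudo(const SIInstrInfo &TII, const MachineInstr &MI) {
  return TII.pseudoToMCOpcode(MI.getOpcode());
}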
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
Definition: SIInstrInfo.h:1424
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
Definition: SIInstrInfo.h:938
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named OperandName, or nullptr if the instruction has no such operand.
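For illustration, the usual query pattern looks roughly like this (the wrapper and the choice of operand name are hypothetical; getNamedOperand itself is the accessor documented above):

#include "SIInstrInfo.h"
using namespace llvm;
// Sketch: fetch the soffset operand of an instruction, if the opcode has one.
// A nullptr result means the operand is simply absent for this opcode.
static const MachineOperand *getSOffsetIfPresent(const SIInstrInfo &TII,
                                                 const MachineInstr &MI) {
  return TII.getNamedOperand(MI, AMDGPU::OpName::soffset);
}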
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were the OpIdx operand of MI.
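A hedged sketch of how such a legality check typically guards an operand rewrite (the folding helper is hypothetical; getNamedOperandIdx and ChangeToImmediate are the standard AMDGPU/MachineOperand utilities):

#include "SIInstrInfo.h"
using namespace llvm;
// Sketch: fold an immediate into src0 only if the encoding accepts it there.
// ImmOp is assumed to already be an immediate machine operand.
static bool tryFoldIntoSrc0(const SIInstrInfo &TII, MachineInstr &MI,
                            const MachineOperand &ImmOp) {
  int Src0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
  if (Src0Idx == -1 || !TII.isOperandLegal(MI, Src0Idx, &ImmOp))
    return false;
  MI.getOperand(Src0Idx).ChangeToImmediate(ImmOp.getImm());
  return true;
}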
static bool isLDSDMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:594
static bool isVOP1(const MachineInstr &MI)
Definition: SIInstrInfo.h:520
SIInstrInfo(const GCNSubtarget &ST)
Definition: SIInstrInfo.cpp:64
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
const TargetRegisterClass * getRegClass(unsigned RCID) const
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, unsigned SubIdx) const
Returns a register class which is compatible with SuperRC, such that a subregister exists with class ...
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCPhysReg get32BitRegister(MCPhysReg Reg) const
const TargetRegisterClass * getProperlyAlignedRC(const TargetRegisterClass *RC) const
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const
static bool hasVectorRegisters(const TargetRegisterClass *RC)
const TargetRegisterClass * getEquivalentVGPRClass(const TargetRegisterClass *SRC) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool opCanUseInlineConstant(unsigned OpType) const
bool isVectorRegister(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const
bool opCanUseLiteralConstant(unsigned OpType) const
static bool hasVGPRs(const TargetRegisterClass *RC)
static bool isVGPRClass(const TargetRegisterClass *RC)
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
const TargetRegisterClass * getBoolRC() const
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getChannelFromSubReg(unsigned SubReg) const
MCRegister getVCC() const
bool isVectorSuperClass(const TargetRegisterClass *RC) const
static bool hasAGPRs(const TargetRegisterClass *RC)
const TargetRegisterClass * getWaveMaskRegClass() const
bool spillSGPRToVGPR() const
const TargetRegisterClass * getVGPR64Class() const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:586
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
Definition: SlotIndexes.h:238
SlotIndexes pass.
Definition: SlotIndexes.h:298
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:532
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:283
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
const TargetRegisterClass * getAllocatableClass(const TargetRegisterClass *RC) const
Return the maximal subclass of the given register class that is allocatable, or NULL if there is none.
unsigned getSubRegIdxSize(unsigned Idx) const
Get the size of the bit range covered by a sub-register index.
unsigned getSubRegIdxOffset(unsigned Idx) const
Get the offset of the bit range covered by a sub-register index.
LLVM_ABI void init(const TargetSubtargetInfo *TSInfo, bool EnableSModel=true, bool EnableSItins=true)
Initialize the machine model for instruction scheduling.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:346
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
LLVM Value Representation.
Definition: Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:194
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:174
self_iterator getIterator()
Definition: ilist_node.h:134
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
Definition: SIInstrInfo.h:1698
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
Definition: SIInstrInfo.h:1699
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
Definition: SIInstrInfo.h:1701
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand? These include registers, inline constants,...
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating-point use.
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST)
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition: SIDefines.h:231
@ OPERAND_REG_IMM_INT64
Definition: SIDefines.h:202
@ OPERAND_REG_IMM_V2FP16
Definition: SIDefines.h:209
@ OPERAND_REG_INLINE_C_FP64
Definition: SIDefines.h:222
@ OPERAND_REG_INLINE_C_BF16
Definition: SIDefines.h:219
@ OPERAND_REG_INLINE_C_V2BF16
Definition: SIDefines.h:224
@ OPERAND_REG_IMM_V2INT16
Definition: SIDefines.h:210
@ OPERAND_REG_IMM_BF16
Definition: SIDefines.h:206
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition: SIDefines.h:201
@ OPERAND_REG_IMM_V2BF16
Definition: SIDefines.h:208
@ OPERAND_REG_IMM_FP16
Definition: SIDefines.h:207
@ OPERAND_REG_INLINE_C_INT64
Definition: SIDefines.h:218
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition: SIDefines.h:216
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition: SIDefines.h:211
@ OPERAND_REG_IMM_FP64
Definition: SIDefines.h:205
@ OPERAND_REG_INLINE_C_V2FP16
Definition: SIDefines.h:225
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition: SIDefines.h:236
@ OPERAND_REG_INLINE_AC_FP32
Definition: SIDefines.h:237
@ OPERAND_REG_IMM_V2INT32
Definition: SIDefines.h:212
@ OPERAND_SDWA_VOPC_DST
Definition: SIDefines.h:248
@ OPERAND_REG_IMM_FP32
Definition: SIDefines.h:204
@ OPERAND_INPUT_MODS
Definition: SIDefines.h:245
@ OPERAND_REG_INLINE_C_FP32
Definition: SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT32
Definition: SIDefines.h:217
@ OPERAND_REG_INLINE_C_V2INT16
Definition: SIDefines.h:223
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition: SIDefines.h:242
@ OPERAND_REG_IMM_V2FP32
Definition: SIDefines.h:213
@ OPERAND_REG_INLINE_AC_FP64
Definition: SIDefines.h:238
@ OPERAND_REG_INLINE_C_FP16
Definition: SIDefines.h:220
@ OPERAND_REG_IMM_INT16
Definition: SIDefines.h:203
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition: SIDefines.h:228
@ TI_SCRATCH_RSRC_DWORD1
Definition: AMDGPU.h:566
@ TI_SCRATCH_RSRC_DWORD3
Definition: AMDGPU.h:568
@ TI_SCRATCH_RSRC_DWORD0
Definition: AMDGPU.h:565
@ TI_SCRATCH_RSRC_DWORD2
Definition: AMDGPU.h:567
@ TI_CONSTDATA_START
Definition: AMDGPU.h:564
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool isGFX1250(const MCSubtargetInfo &STI)
int getMCOpcode(uint16_t Opcode, unsigned Gen)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
Definition: SIInstrInfo.h:1700
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
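As a brief usage sketch (the helper is hypothetical; hasInv2PiInlineImm is the GCNSubtarget query that supplies the HasInv2Pi argument):

#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
using namespace llvm;
// Sketch: decide whether a 64-bit immediate can be encoded inline instead of
// taking a literal slot, honoring the subtarget's 1/(2*pi) inline support.
static bool isInlineImm64(const GCNSubtarget &ST, int64_t Imm) {
  return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
}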
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ OPERAND_GENERIC_4
Definition: MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition: MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition: MCInstrDesc.h:68
@ OPERAND_REGISTER
Definition: MCInstrDesc.h:62
@ OPERAND_GENERIC_3
Definition: MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition: MCInstrDesc.h:61
@ OPERAND_MEMORY
Definition: MCInstrDesc.h:63
@ OPERAND_UNKNOWN
Definition: MCInstrDesc.h:60
@ OPERAND_GENERIC_0
Definition: MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition: MCInstrDesc.h:72
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:477
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
Definition: SIInstrInfo.h:1584
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:307
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition: MathExtras.h:216
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
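A minimal sketch of this builder interface as it is used throughout the AMDGPU backend (the opcode and wrapper are illustrative only):

#include "SIInstrInfo.h"
using namespace llvm;
// Sketch: materialize a 32-bit constant into an SGPR at the insertion point.
static void emitSMovImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const SIInstrInfo &TII,
                        Register DstReg, int64_t Imm) {
  BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), DstReg).addImm(Imm);
}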
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2491
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:663
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:551
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:293
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition: bit.h:157
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:336
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
Definition: SPIRVUtils.cpp:976
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:164
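Together with Hi_32 above, this is the usual way a 64-bit value is split into halves, for example when a 64-bit immediate move is expanded into two 32-bit moves; a small sketch (the helper is illustrative):

#include "llvm/Support/MathExtras.h"
#include <cstdint>
// Sketch: split a 64-bit immediate into its 32-bit halves.
static void splitImm64(uint64_t Imm, uint32_t &LoHalf, uint32_t &HiHalf) {
  LoHalf = llvm::Lo_32(Imm); // low 32 bits
  HiHalf = llvm::Hi_32(Imm); // high 32 bits
}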
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:399
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
Definition: TargetOpcodes.h:36
DWARFExpression::Operation Op
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition: SIInstrInfo.h:40
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:223
@ DS_Error
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:257
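A small hedged sketch of the kind of range test this enables, e.g. when deciding whether a branch displacement fits its immediate field (the helper and the NumBits parameter are hypothetical):

#include "llvm/Support/MathExtras.h"
#include <cstdint>
// Sketch: check whether a signed branch offset fits in a NumBits-wide field.
static bool branchOffsetFits(unsigned NumBits, int64_t BrOffset) {
  return llvm::isIntN(NumBits, BrOffset);
}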
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1916
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition: Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
@ NeverUniform
The result values can never be assumed to be uniform.
@ Default
The result values are uniform if and only if all operands are uniform.
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
#define N
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition: APFloat.cpp:219
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Description of the encoding of one expression Op.
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
Definition: LiveVariables.h:84
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition: SIInstrInfo.h:52
MachineInstr * top() const
Definition: SIInstrInfo.h:57
bool empty() const
Definition: SIInstrInfo.h:67
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition: SIInstrInfo.h:76
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.