SIInstrInfo.cpp
1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "GCNHazardRecognizer.h"
18#include "GCNSubtarget.h"
21#include "llvm/ADT/STLExtras.h"
32#include "llvm/IR/IntrinsicsAMDGPU.h"
33#include "llvm/MC/MCContext.h"
36
37using namespace llvm;
38
39#define DEBUG_TYPE "si-instr-info"
40
41#define GET_INSTRINFO_CTOR_DTOR
42#include "AMDGPUGenInstrInfo.inc"
43
44namespace llvm::AMDGPU {
45#define GET_D16ImageDimIntrinsics_IMPL
46#define GET_ImageDimIntrinsicTable_IMPL
47#define GET_RsrcIntrinsics_IMPL
48#include "AMDGPUGenSearchableTables.inc"
49} // namespace llvm::AMDGPU
50
51// Must be at least 4 to be able to branch over minimum unconditional branch
52// code. This is only for making it possible to write reasonably small tests for
53// long branches.
54static cl::opt<unsigned>
55BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
56 cl::desc("Restrict range of branch instructions (DEBUG)"));
57
59 "amdgpu-fix-16-bit-physreg-copies",
60 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
61 cl::init(true),
63 cl::ReallyHidden);
64SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
65 : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
66 RI(ST), ST(ST) {
67 SchedModel.init(&ST);
68}
69
70//===----------------------------------------------------------------------===//
71// TargetInstrInfo callbacks
72//===----------------------------------------------------------------------===//
73
74static unsigned getNumOperandsNoGlue(SDNode *Node) {
75 unsigned N = Node->getNumOperands();
76 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
77 --N;
78 return N;
79}
80
81/// Returns true if both nodes have the same value for the given
82/// operand \p OpName, or if both nodes do not have this operand.
83static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
84 AMDGPU::OpName OpName) {
85 unsigned Opc0 = N0->getMachineOpcode();
86 unsigned Opc1 = N1->getMachineOpcode();
87
88 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
89 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
90
91 if (Op0Idx == -1 && Op1Idx == -1)
92 return true;
93
94
95 if ((Op0Idx == -1 && Op1Idx != -1) ||
96 (Op1Idx == -1 && Op0Idx != -1))
97 return false;
98
99 // getNamedOperandIdx returns the index for the MachineInstr's operands,
100 // which includes the result as the first operand. We are indexing into the
101 // MachineSDNode's operands, so we need to skip the result operand to get
102 // the real index.
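// For example (hypothetical indices): if getNamedOperandIdx returns 3 for an
// instruction with one result, the matching MachineSDNode operand is at
// index 2 after the adjustment below.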
103 --Op0Idx;
104 --Op1Idx;
105
106 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
107}
108
109static bool canRemat(const MachineInstr &MI) {
110
111 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
112 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
113 SIInstrInfo::isSALU(MI))
114 return true;
115
116 if (SIInstrInfo::isSMRD(MI)) {
117 return !MI.memoperands_empty() &&
118 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
119 return MMO->isLoad() && MMO->isInvariant();
120 });
121 }
122
123 return false;
124}
125
126bool SIInstrInfo::isReallyTriviallyReMaterializable(
127 const MachineInstr &MI) const {
128
129 if (canRemat(MI)) {
130 // Normally a VALU use of exec would block rematerialization, but an
131 // implicit exec read is OK here since all VALU instructions have one.
132 // We really want all of the generic logic for this, except for this check.
133
134 // Another potential implicit use is mode register. The core logic of
135 // the RA will not attempt rematerialization if mode is set anywhere
136 // in the function, otherwise it is safe since mode is not changed.
137
138 // This differs from the generic method, which does not allow
139 // rematerialization if there are virtual register uses. We allow this,
140 // and therefore this method covers SOP instructions as well.
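// For example (illustrative): a V_MOV_B32_e32 of an immediate whose only
// implicit operand is the exec use from its descriptor passes the checks
// below and can be rematerialized at its use point.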
141 if (!MI.hasImplicitDef() &&
142 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
143 !MI.mayRaiseFPException())
144 return true;
145 }
146
147 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
148}
149
150// Returns true if the scalar result of a VALU instruction depends on exec.
151bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
152 // Ignore comparisons which are only used masked with exec.
153 // This allows some hoisting/sinking of VALU comparisons.
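// For example (a sketch with made-up virtual registers):
//   %cmp:sreg_64 = V_CMP_GT_I32_e64 %a, %b
//   %mask:sreg_64 = S_AND_B64 $exec, %cmp, implicit-def $scc
// The compare result is only consumed under an exec mask, so it is treated
// as not depending on exec and may be hoisted or sunk.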
154 if (MI.isCompare()) {
155 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
156 if (!Dst)
157 return true;
158
159 Register DstReg = Dst->getReg();
160 if (!DstReg.isVirtual())
161 return true;
162
163 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
164 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
165 switch (Use.getOpcode()) {
166 case AMDGPU::S_AND_SAVEEXEC_B32:
167 case AMDGPU::S_AND_SAVEEXEC_B64:
168 break;
169 case AMDGPU::S_AND_B32:
170 case AMDGPU::S_AND_B64:
171 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
172 return true;
173 break;
174 default:
175 return true;
176 }
177 }
178 return false;
179 }
180
181 switch (MI.getOpcode()) {
182 default:
183 break;
184 case AMDGPU::V_READFIRSTLANE_B32:
185 return true;
186 }
187
188 return false;
189}
190
191bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
192 // Any implicit use of exec by VALU is not a real register read.
193 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
194 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
195}
196
197bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
198 MachineBasicBlock *SuccToSinkTo,
199 MachineCycleInfo *CI) const {
200 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
201 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
202 return true;
203
204 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
205 // Check if sinking of MI would create temporal divergent use.
206 for (auto Op : MI.uses()) {
207 if (Op.isReg() && Op.getReg().isVirtual() &&
208 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
209 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
210
211 // SgprDef defined inside cycle
212 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
213 if (FromCycle == nullptr)
214 continue;
215
216 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
217 // Check if there is a FromCycle that contains SgprDef's basic block but
218 // does not contain SuccToSinkTo and also has divergent exit condition.
219 while (FromCycle && !FromCycle->contains(ToCycle)) {
220 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
221 FromCycle->getExitingBlocks(ExitingBlocks);
222
223 // FromCycle has divergent exit condition.
224 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
225 if (hasDivergentBranch(ExitingBlock))
226 return false;
227 }
228
229 FromCycle = FromCycle->getParentCycle();
230 }
231 }
232 }
233
234 return true;
235}
236
238 int64_t &Offset0,
239 int64_t &Offset1) const {
240 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
241 return false;
242
243 unsigned Opc0 = Load0->getMachineOpcode();
244 unsigned Opc1 = Load1->getMachineOpcode();
245
246 // Make sure both are actually loads.
247 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
248 return false;
249
250 // A mayLoad instruction without a def is not a load. Likely a prefetch.
251 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
252 return false;
253
254 if (isDS(Opc0) && isDS(Opc1)) {
255
256 // FIXME: Handle this case:
257 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
258 return false;
259
260 // Check base reg.
261 if (Load0->getOperand(0) != Load1->getOperand(0))
262 return false;
263
264 // Skip read2 / write2 variants for simplicity.
265 // TODO: We should report true if the used offsets are adjacent (excluding
266 // the st64 versions).
267 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
268 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
269 if (Offset0Idx == -1 || Offset1Idx == -1)
270 return false;
271
272 // XXX - be careful of dataless loads
273 // getNamedOperandIdx returns the index for MachineInstrs. Since they
274 // include the output in the operand list, but SDNodes don't, we need to
275 // subtract the index by one.
276 Offset0Idx -= get(Opc0).NumDefs;
277 Offset1Idx -= get(Opc1).NumDefs;
278 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
279 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
280 return true;
281 }
282
283 if (isSMRD(Opc0) && isSMRD(Opc1)) {
284 // Skip time and cache invalidation instructions.
285 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
286 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
287 return false;
288
289 unsigned NumOps = getNumOperandsNoGlue(Load0);
290 if (NumOps != getNumOperandsNoGlue(Load1))
291 return false;
292
293 // Check base reg.
294 if (Load0->getOperand(0) != Load1->getOperand(0))
295 return false;
296
297 // Match register offsets, if both register and immediate offsets present.
298 assert(NumOps == 4 || NumOps == 5);
299 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
300 return false;
301
302 const ConstantSDNode *Load0Offset =
303 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
304 const ConstantSDNode *Load1Offset =
305 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
306
307 if (!Load0Offset || !Load1Offset)
308 return false;
309
310 Offset0 = Load0Offset->getZExtValue();
311 Offset1 = Load1Offset->getZExtValue();
312 return true;
313 }
314
315 // MUBUF and MTBUF can access the same addresses.
316 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
317
318 // MUBUF and MTBUF have vaddr at different indices.
319 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
320 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
321 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
322 return false;
323
324 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
325 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
326
327 if (OffIdx0 == -1 || OffIdx1 == -1)
328 return false;
329
330 // getNamedOperandIdx returns the index for MachineInstrs. Since they
331 // include the output in the operand list, but SDNodes don't, we need to
332 // subtract the index by one.
333 OffIdx0 -= get(Opc0).NumDefs;
334 OffIdx1 -= get(Opc1).NumDefs;
335
336 SDValue Off0 = Load0->getOperand(OffIdx0);
337 SDValue Off1 = Load1->getOperand(OffIdx1);
338
339 // The offset might be a FrameIndexSDNode.
340 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
341 return false;
342
343 Offset0 = Off0->getAsZExtVal();
344 Offset1 = Off1->getAsZExtVal();
345 return true;
346 }
347
348 return false;
349}
350
351static bool isStride64(unsigned Opc) {
352 switch (Opc) {
353 case AMDGPU::DS_READ2ST64_B32:
354 case AMDGPU::DS_READ2ST64_B64:
355 case AMDGPU::DS_WRITE2ST64_B32:
356 case AMDGPU::DS_WRITE2ST64_B64:
357 return true;
358 default:
359 return false;
360 }
361}
362
365 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
366 const TargetRegisterInfo *TRI) const {
367 if (!LdSt.mayLoadOrStore())
368 return false;
369
370 unsigned Opc = LdSt.getOpcode();
371 OffsetIsScalable = false;
372 const MachineOperand *BaseOp, *OffsetOp;
373 int DataOpIdx;
374
375 if (isDS(LdSt)) {
376 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
377 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
378 if (OffsetOp) {
379 // Normal, single offset LDS instruction.
380 if (!BaseOp) {
381 // DS_CONSUME/DS_APPEND use M0 for the base address.
382 // TODO: find the implicit use operand for M0 and use that as BaseOp?
383 return false;
384 }
385 BaseOps.push_back(BaseOp);
386 Offset = OffsetOp->getImm();
387 // Get appropriate operand, and compute width accordingly.
388 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
389 if (DataOpIdx == -1)
390 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
391 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
392 Width = LocationSize::precise(64);
393 else
394 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
395 } else {
396 // The 2 offset instructions use offset0 and offset1 instead. We can treat
397 // these as a load with a single offset if the 2 offsets are consecutive.
398 // We will use this for some partially aligned loads.
399 const MachineOperand *Offset0Op =
400 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
401 const MachineOperand *Offset1Op =
402 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
403
404 unsigned Offset0 = Offset0Op->getImm() & 0xff;
405 unsigned Offset1 = Offset1Op->getImm() & 0xff;
406 if (Offset0 + 1 != Offset1)
407 return false;
408
409 // Each of these offsets is in element sized units, so we need to convert
410 // to bytes of the individual reads.
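// For example (hypothetical operands): a ds_read2_b32 with offset0=4 and
// offset1=5 has EltSize=4, so the pair is reported as one access at byte
// offset 16 with an 8-byte width.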
411
412 unsigned EltSize;
413 if (LdSt.mayLoad())
414 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
415 else {
416 assert(LdSt.mayStore());
417 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
418 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
419 }
420
421 if (isStride64(Opc))
422 EltSize *= 64;
423
424 BaseOps.push_back(BaseOp);
425 Offset = EltSize * Offset0;
426 // Get appropriate operand(s), and compute width accordingly.
427 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
428 if (DataOpIdx == -1) {
429 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
430 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
431 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
432 Width = LocationSize::precise(
433 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
434 } else {
435 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
436 }
437 }
438 return true;
439 }
440
441 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
442 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
443 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
444 return false;
445 BaseOps.push_back(RSrc);
446 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
447 if (BaseOp && !BaseOp->isFI())
448 BaseOps.push_back(BaseOp);
449 const MachineOperand *OffsetImm =
450 getNamedOperand(LdSt, AMDGPU::OpName::offset);
451 Offset = OffsetImm->getImm();
452 const MachineOperand *SOffset =
453 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
454 if (SOffset) {
455 if (SOffset->isReg())
456 BaseOps.push_back(SOffset);
457 else
458 Offset += SOffset->getImm();
459 }
460 // Get appropriate operand, and compute width accordingly.
461 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
462 if (DataOpIdx == -1)
463 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
464 if (DataOpIdx == -1) // LDS DMA
465 return false;
466 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
467 return true;
468 }
469
470 if (isImage(LdSt)) {
471 auto RsrcOpName =
472 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
473 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
474 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
475 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
476 if (VAddr0Idx >= 0) {
477 // GFX10 possible NSA encoding.
478 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
479 BaseOps.push_back(&LdSt.getOperand(I));
480 } else {
481 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
482 }
483 Offset = 0;
484 // Get appropriate operand, and compute width accordingly.
485 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
486 if (DataOpIdx == -1)
487 return false; // no return sampler
488 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
489 return true;
490 }
491
492 if (isSMRD(LdSt)) {
493 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
494 if (!BaseOp) // e.g. S_MEMTIME
495 return false;
496 BaseOps.push_back(BaseOp);
497 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
498 Offset = OffsetOp ? OffsetOp->getImm() : 0;
499 // Get appropriate operand, and compute width accordingly.
500 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
501 if (DataOpIdx == -1)
502 return false;
503 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
504 return true;
505 }
506
507 if (isFLAT(LdSt)) {
508 // Instructions have either vaddr or saddr or both or none.
509 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
510 if (BaseOp)
511 BaseOps.push_back(BaseOp);
512 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
513 if (BaseOp)
514 BaseOps.push_back(BaseOp);
515 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
516 // Get appropriate operand, and compute width accordingly.
517 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
518 if (DataOpIdx == -1)
519 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
520 if (DataOpIdx == -1) // LDS DMA
521 return false;
522 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
523 return true;
524 }
525
526 return false;
527}
528
529static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
531 const MachineInstr &MI2,
533 // Only examine the first "base" operand of each instruction, on the
534 // assumption that it represents the real base address of the memory access.
535 // Other operands are typically offsets or indices from this base address.
536 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
537 return true;
538
539 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
540 return false;
541
542 auto *MO1 = *MI1.memoperands_begin();
543 auto *MO2 = *MI2.memoperands_begin();
544 if (MO1->getAddrSpace() != MO2->getAddrSpace())
545 return false;
546
547 const auto *Base1 = MO1->getValue();
548 const auto *Base2 = MO2->getValue();
549 if (!Base1 || !Base2)
550 return false;
551 Base1 = getUnderlyingObject(Base1);
552 Base2 = getUnderlyingObject(Base2);
553
554 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
555 return false;
556
557 return Base1 == Base2;
558}
559
560bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
561 int64_t Offset1, bool OffsetIsScalable1,
562 ArrayRef<const MachineOperand *> BaseOps2,
563 int64_t Offset2, bool OffsetIsScalable2,
564 unsigned ClusterSize,
565 unsigned NumBytes) const {
566 // If the mem ops (to be clustered) do not have the same base ptr, then they
567 // should not be clustered
568 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
569 if (!BaseOps1.empty() && !BaseOps2.empty()) {
570 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
571 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
572 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
573 return false;
574
575 const SIMachineFunctionInfo *MFI =
576 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
577 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
578 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
579 // If only one base op is empty, they do not have the same base ptr
580 return false;
581 }
582
583 // To avoid register pressure, on average the number of DWORDs loaded
584 // together by all clustered mem ops should not exceed
585 // MaxMemoryClusterDWords. This is an empirical value based on certain
586 // observations and performance-related experiments.
587 // The good thing about this heuristic is that it avoids clustering too many
588 // sub-word loads and also avoids clustering wide loads. Below is a brief
589 // summary of how the heuristic behaves for various `LoadSize` when
590 // MaxMemoryClusterDWords is 8.
591 //
592 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
593 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
594 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
595 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
596 // (5) LoadSize >= 17: do not cluster
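// For example (hypothetical numbers): clustering four 12-byte loads gives
// LoadSize = 48 / 4 = 12 and NumDWords = 3 * 4 = 12, which exceeds a limit
// of 8, so the cluster is rejected.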
597 const unsigned LoadSize = NumBytes / ClusterSize;
598 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
599 return NumDWords <= MaxMemoryClusterDWords;
600}
601
602// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
603// the first 16 loads will be interleaved with the stores, and the next 16 will
604 // be clustered as expected. It should really be split into two 16-store batches.
605//
606// Loads are clustered until this returns false, rather than trying to schedule
607// groups of stores. This also means we have to deal with saying different
608// address space loads should be clustered, and ones which might cause bank
609// conflicts.
610//
611// This might be deprecated so it might not be worth that much effort to fix.
612bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
613 int64_t Offset0, int64_t Offset1,
614 unsigned NumLoads) const {
615 assert(Offset1 > Offset0 &&
616 "Second offset should be larger than first offset!");
617 // If we have less than 16 loads in a row, and the offsets are within 64
618 // bytes, then schedule together.
619
620 // A cacheline is 64 bytes (for global memory).
621 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
622}
623
624static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
625 MachineBasicBlock::iterator MI,
626 const DebugLoc &DL, MCRegister DestReg,
627 MCRegister SrcReg, bool KillSrc,
628 const char *Msg = "illegal VGPR to SGPR copy") {
629 MachineFunction *MF = MBB.getParent();
630
631 LLVMContext &C = MF->getFunction().getContext();
632 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
633
634 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
635 .addReg(SrcReg, getKillRegState(KillSrc));
636}
637
638/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
639/// possible to have a direct copy in these cases on GFX908, so an intermediate
640/// VGPR copy is required.
644 const DebugLoc &DL, MCRegister DestReg,
645 MCRegister SrcReg, bool KillSrc,
646 RegScavenger &RS, bool RegsOverlap,
647 Register ImpDefSuperReg = Register(),
648 Register ImpUseSuperReg = Register()) {
649 assert((TII.getSubtarget().hasMAIInsts() &&
650 !TII.getSubtarget().hasGFX90AInsts()) &&
651 "Expected GFX908 subtarget.");
652
653 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
654 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
655 "Source register of the copy should be either an SGPR or an AGPR.");
656
657 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
658 "Destination register of the copy should be an AGPR.");
659
660 const SIRegisterInfo &RI = TII.getRegisterInfo();
661
662 // First try to find defining accvgpr_write to avoid temporary registers.
663 // In the case of copies of overlapping AGPRs, we conservatively do not
664 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
665 // an accvgpr_write used for this same copy due to implicit-defs
666 if (!RegsOverlap) {
667 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
668 --Def;
669
670 if (!Def->modifiesRegister(SrcReg, &RI))
671 continue;
672
673 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
674 Def->getOperand(0).getReg() != SrcReg)
675 break;
676
677 MachineOperand &DefOp = Def->getOperand(1);
678 assert(DefOp.isReg() || DefOp.isImm());
679
680 if (DefOp.isReg()) {
681 bool SafeToPropagate = true;
682 // Check that register source operand is not clobbered before MI.
683 // Immediate operands are always safe to propagate.
684 for (auto I = Def; I != MI && SafeToPropagate; ++I)
685 if (I->modifiesRegister(DefOp.getReg(), &RI))
686 SafeToPropagate = false;
687
688 if (!SafeToPropagate)
689 break;
690
691 for (auto I = Def; I != MI; ++I)
692 I->clearRegisterKills(DefOp.getReg(), &RI);
693 }
694
695 MachineInstrBuilder Builder =
696 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
697 .add(DefOp);
698 if (ImpDefSuperReg)
699 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
700
701 if (ImpUseSuperReg) {
702 Builder.addReg(ImpUseSuperReg,
703 RegState::Implicit | getKillRegState(KillSrc));
704 }
705
706 return;
707 }
708 }
709
710 RS.enterBasicBlockEnd(MBB);
711 RS.backward(std::next(MI));
712
713 // Ideally we want to have three registers for a long reg_sequence copy
714 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
715 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
716 *MBB.getParent());
717
718 // Registers in the sequence are allocated contiguously so we can just
719 // use register number to pick one of three round-robin temps.
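// For example (illustrative): copies targeting a0, a1, a2, a3 cycle through
// temp indices 0, 1, 2, 0 via (DestReg - AGPR0) % 3.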
720 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
721 Register Tmp =
722 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
724 "VGPR used for an intermediate copy should have been reserved.");
725
726 // Only loop through if there are any free registers left. We don't want to
727 // spill.
728 while (RegNo--) {
729 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
730 /* RestoreAfter */ false, 0,
731 /* AllowSpill */ false);
732 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
733 break;
734 Tmp = Tmp2;
735 RS.setRegUsed(Tmp);
736 }
737
738 // Insert copy to temporary VGPR.
739 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
740 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
741 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
742 } else {
743 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
744 }
745
746 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
747 .addReg(SrcReg, getKillRegState(KillSrc));
748 if (ImpUseSuperReg) {
749 UseBuilder.addReg(ImpUseSuperReg,
750 getKillRegState(KillSrc) | RegState::Implicit);
751 }
752
753 MachineInstrBuilder DefBuilder
754 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
755 .addReg(Tmp, RegState::Kill);
756
757 if (ImpDefSuperReg)
758 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
759}
760
761static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
762 MachineBasicBlock::iterator MI, const DebugLoc &DL,
763 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
764 const TargetRegisterClass *RC, bool Forward) {
765 const SIRegisterInfo &RI = TII.getRegisterInfo();
766 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
767 MachineBasicBlock::iterator I = MI;
768 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
769
770 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
771 int16_t SubIdx = BaseIndices[Idx];
772 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
773 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
774 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
775 unsigned Opcode = AMDGPU::S_MOV_B32;
776
777 // Is SGPR aligned? If so try to combine with next.
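// For example (illustrative): a copy of s[4:7] is emitted as two S_MOV_B64
// copies of s[4:5] and s[6:7] rather than four S_MOV_B32 copies.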
778 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
779 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
780 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
781 // Can use SGPR64 copy
782 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
783 SubIdx = RI.getSubRegFromChannel(Channel, 2);
784 DestSubReg = RI.getSubReg(DestReg, SubIdx);
785 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
786 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
787 Opcode = AMDGPU::S_MOV_B64;
788 Idx++;
789 }
790
791 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
792 .addReg(SrcSubReg)
793 .addReg(SrcReg, RegState::Implicit);
794
795 if (!FirstMI)
796 FirstMI = LastMI;
797
798 if (!Forward)
799 I--;
800 }
801
802 assert(FirstMI && LastMI);
803 if (!Forward)
804 std::swap(FirstMI, LastMI);
805
806 FirstMI->addOperand(
807 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
808
809 if (KillSrc)
810 LastMI->addRegisterKilled(SrcReg, &RI);
811}
812
813void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
814 MachineBasicBlock::iterator MI,
815 const DebugLoc &DL, Register DestReg,
816 Register SrcReg, bool KillSrc, bool RenamableDest,
817 bool RenamableSrc) const {
818 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
819 unsigned Size = RI.getRegSizeInBits(*RC);
820 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
821 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
822
823 // The rest of copyPhysReg assumes Src and Dst size are the same size.
824 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
825 // we remove Fix16BitCopies and this code block?
826 if (Fix16BitCopies) {
827 if (((Size == 16) != (SrcSize == 16))) {
828 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
830 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
831 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
832 RegToFix = SubReg;
833
834 if (DestReg == SrcReg) {
835 // Identity copy. Insert empty bundle since ExpandPostRA expects an
836 // instruction here.
837 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
838 return;
839 }
840 RC = RI.getPhysRegBaseClass(DestReg);
841 Size = RI.getRegSizeInBits(*RC);
842 SrcRC = RI.getPhysRegBaseClass(SrcReg);
843 SrcSize = RI.getRegSizeInBits(*SrcRC);
844 }
845 }
846
847 if (RC == &AMDGPU::VGPR_32RegClass) {
848 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
849 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
850 AMDGPU::AGPR_32RegClass.contains(SrcReg));
851 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
852 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
853 BuildMI(MBB, MI, DL, get(Opc), DestReg)
854 .addReg(SrcReg, getKillRegState(KillSrc));
855 return;
856 }
857
858 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
859 RC == &AMDGPU::SReg_32RegClass) {
860 if (SrcReg == AMDGPU::SCC) {
861 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
862 .addImm(1)
863 .addImm(0);
864 return;
865 }
866
867 if (DestReg == AMDGPU::VCC_LO) {
868 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
869 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
870 .addReg(SrcReg, getKillRegState(KillSrc));
871 } else {
872 // FIXME: Hack until VReg_1 removed.
873 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
874 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
875 .addImm(0)
876 .addReg(SrcReg, getKillRegState(KillSrc));
877 }
878
879 return;
880 }
881
882 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
883 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
884 return;
885 }
886
887 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
888 .addReg(SrcReg, getKillRegState(KillSrc));
889 return;
890 }
891
892 if (RC == &AMDGPU::SReg_64RegClass) {
893 if (SrcReg == AMDGPU::SCC) {
894 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
895 .addImm(1)
896 .addImm(0);
897 return;
898 }
899
900 if (DestReg == AMDGPU::VCC) {
901 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
902 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
903 .addReg(SrcReg, getKillRegState(KillSrc));
904 } else {
905 // FIXME: Hack until VReg_1 removed.
906 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
907 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
908 .addImm(0)
909 .addReg(SrcReg, getKillRegState(KillSrc));
910 }
911
912 return;
913 }
914
915 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
916 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
917 return;
918 }
919
920 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
921 .addReg(SrcReg, getKillRegState(KillSrc));
922 return;
923 }
924
925 if (DestReg == AMDGPU::SCC) {
926 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
927 // but SelectionDAG emits such copies for i1 sources.
928 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
929 // This copy can only be produced by patterns
930 // with explicit SCC, which are known to be enabled
931 // only for subtargets with S_CMP_LG_U64 present.
932 assert(ST.hasScalarCompareEq64());
933 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
934 .addReg(SrcReg, getKillRegState(KillSrc))
935 .addImm(0);
936 } else {
937 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
938 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
939 .addReg(SrcReg, getKillRegState(KillSrc))
940 .addImm(0);
941 }
942
943 return;
944 }
945
946 if (RC == &AMDGPU::AGPR_32RegClass) {
947 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
948 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
949 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
950 .addReg(SrcReg, getKillRegState(KillSrc));
951 return;
952 }
953
954 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
955 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
956 .addReg(SrcReg, getKillRegState(KillSrc));
957 return;
958 }
959
960 // FIXME: Pass should maintain scavenger to avoid scan through the block on
961 // every AGPR spill.
962 RegScavenger RS;
963 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
964 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
965 return;
966 }
967
968 if (Size == 16) {
969 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
970 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
971 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
972
973 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
974 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
975 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
976 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
977 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
978 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
979 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
980 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
981
982 if (IsSGPRDst) {
983 if (!IsSGPRSrc) {
984 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
985 return;
986 }
987
988 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
989 .addReg(NewSrcReg, getKillRegState(KillSrc));
990 return;
991 }
992
993 if (IsAGPRDst || IsAGPRSrc) {
994 if (!DstLow || !SrcLow) {
995 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
996 "Cannot use hi16 subreg with an AGPR!");
997 }
998
999 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
1000 return;
1001 }
1002
1003 if (ST.useRealTrue16Insts()) {
1004 if (IsSGPRSrc) {
1005 assert(SrcLow);
1006 SrcReg = NewSrcReg;
1007 }
1008 // Use the smaller instruction encoding if possible.
1009 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
1010 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1011 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1012 .addReg(SrcReg);
1013 } else {
1014 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1015 .addImm(0) // src0_modifiers
1016 .addReg(SrcReg)
1017 .addImm(0); // op_sel
1018 }
1019 return;
1020 }
1021
1022 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1023 if (!DstLow || !SrcLow) {
1024 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1025 "Cannot use hi16 subreg on VI!");
1026 }
1027
1028 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1029 .addReg(NewSrcReg, getKillRegState(KillSrc));
1030 return;
1031 }
1032
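// Otherwise fall back to an SDWA mov: select the relevant 16-bit half of the
// source and write only the matching half of the destination, preserving the
// other half (a summary of the builder calls below).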
1033 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1034 .addImm(0) // src0_modifiers
1035 .addReg(NewSrcReg)
1036 .addImm(0) // clamp
1037 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1038 : AMDGPU::SDWA::SdwaSel::WORD_1)
1039 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1040 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1041 : AMDGPU::SDWA::SdwaSel::WORD_1)
1042 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1043 // First implicit operand is $exec.
1044 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1045 return;
1046 }
1047
1048 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1049 if (ST.hasMovB64()) {
1050 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1051 .addReg(SrcReg, getKillRegState(KillSrc));
1052 return;
1053 }
1054 if (ST.hasPkMovB32()) {
1055 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1056 .addImm(SISrcMods::OP_SEL_1)
1057 .addReg(SrcReg)
1058 .addImm(SISrcMods::OP_SEL_1)
1059 .addReg(SrcReg)
1060 .addImm(0) // op_sel_lo
1061 .addImm(0) // op_sel_hi
1062 .addImm(0) // neg_lo
1063 .addImm(0) // neg_hi
1064 .addImm(0) // clamp
1065 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1066 return;
1067 }
1068 }
1069
1070 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1071 if (RI.isSGPRClass(RC)) {
1072 if (!RI.isSGPRClass(SrcRC)) {
1073 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1074 return;
1075 }
1076 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1077 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1078 Forward);
1079 return;
1080 }
1081
1082 unsigned EltSize = 4;
1083 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1084 if (RI.isAGPRClass(RC)) {
1085 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1086 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1087 else if (RI.hasVGPRs(SrcRC) ||
1088 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1089 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1090 else
1091 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1092 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1093 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1094 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1095 (RI.isProperlyAlignedRC(*RC) &&
1096 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1097 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1098 if (ST.hasMovB64()) {
1099 Opcode = AMDGPU::V_MOV_B64_e32;
1100 EltSize = 8;
1101 } else if (ST.hasPkMovB32()) {
1102 Opcode = AMDGPU::V_PK_MOV_B32;
1103 EltSize = 8;
1104 }
1105 }
1106
1107 // For the cases where we need an intermediate instruction/temporary register
1108 // (destination is an AGPR), we need a scavenger.
1109 //
1110 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1111 // whole block for every handled copy.
1112 std::unique_ptr<RegScavenger> RS;
1113 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1114 RS = std::make_unique<RegScavenger>();
1115
1116 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1117
1118 // If there is an overlap, we can't kill the super-register on the last
1119 // instruction, since it will also kill the components made live by this def.
1120 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1121 const bool CanKillSuperReg = KillSrc && !Overlap;
1122
1123 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1124 unsigned SubIdx;
1125 if (Forward)
1126 SubIdx = SubIndices[Idx];
1127 else
1128 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1129 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1130 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1131 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1132
1133 bool IsFirstSubreg = Idx == 0;
1134 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1135
1136 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1137 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1138 Register ImpUseSuper = SrcReg;
1139 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1140 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1141 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1142 MachineInstrBuilder MIB =
1143 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1144 .addImm(SISrcMods::OP_SEL_1)
1145 .addReg(SrcSubReg)
1146 .addImm(SISrcMods::OP_SEL_1)
1147 .addReg(SrcSubReg)
1148 .addImm(0) // op_sel_lo
1149 .addImm(0) // op_sel_hi
1150 .addImm(0) // neg_lo
1151 .addImm(0) // neg_hi
1152 .addImm(0) // clamp
1153 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1154 if (IsFirstSubreg)
1155 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1156 } else {
1157 MachineInstrBuilder Builder =
1158 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1159 if (IsFirstSubreg)
1160 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1161
1162 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1163 }
1164 }
1165}
1166
1167int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1168 int NewOpc;
1169
1170 // Try to map original to commuted opcode
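// For example (illustrative): V_SUBREV_F32 is the commuted (REV) form of
// V_SUB_F32; getCommuteRev/getCommuteOrig map between the two when the other
// form exists on this target.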
1171 NewOpc = AMDGPU::getCommuteRev(Opcode);
1172 if (NewOpc != -1)
1173 // Check if the commuted (REV) opcode exists on the target.
1174 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1175
1176 // Try to map commuted to original opcode
1177 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1178 if (NewOpc != -1)
1179 // Check if the original (non-REV) opcode exists on the target.
1180 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1181
1182 return Opcode;
1183}
1184
1185const TargetRegisterClass *
1187 return &AMDGPU::VGPR_32RegClass;
1188}
1189
1190void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1191 MachineBasicBlock::iterator I,
1192 const DebugLoc &DL, Register DstReg,
1193 ArrayRef<MachineOperand> Cond,
1194 Register TrueReg,
1195 Register FalseReg) const {
1196 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1197 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1198 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1199 "Not a VGPR32 reg");
1200
1201 if (Cond.size() == 1) {
1202 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1203 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1204 .add(Cond[0]);
1205 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1206 .addImm(0)
1207 .addReg(FalseReg)
1208 .addImm(0)
1209 .addReg(TrueReg)
1210 .addReg(SReg);
1211 } else if (Cond.size() == 2) {
1212 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1213 switch (Cond[0].getImm()) {
1214 case SIInstrInfo::SCC_TRUE: {
1215 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1216 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1217 : AMDGPU::S_CSELECT_B64), SReg)
1218 .addImm(1)
1219 .addImm(0);
1220 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1221 .addImm(0)
1222 .addReg(FalseReg)
1223 .addImm(0)
1224 .addReg(TrueReg)
1225 .addReg(SReg);
1226 break;
1227 }
1228 case SIInstrInfo::SCC_FALSE: {
1229 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1230 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1231 : AMDGPU::S_CSELECT_B64), SReg)
1232 .addImm(0)
1233 .addImm(1);
1234 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1235 .addImm(0)
1236 .addReg(FalseReg)
1237 .addImm(0)
1238 .addReg(TrueReg)
1239 .addReg(SReg);
1240 break;
1241 }
1242 case SIInstrInfo::VCCNZ: {
1243 MachineOperand RegOp = Cond[1];
1244 RegOp.setImplicit(false);
1245 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1246 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1247 .add(RegOp);
1248 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1249 .addImm(0)
1250 .addReg(FalseReg)
1251 .addImm(0)
1252 .addReg(TrueReg)
1253 .addReg(SReg);
1254 break;
1255 }
1256 case SIInstrInfo::VCCZ: {
1257 MachineOperand RegOp = Cond[1];
1258 RegOp.setImplicit(false);
1259 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1260 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1261 .add(RegOp);
1262 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1263 .addImm(0)
1264 .addReg(TrueReg)
1265 .addImm(0)
1266 .addReg(FalseReg)
1267 .addReg(SReg);
1268 break;
1269 }
1270 case SIInstrInfo::EXECNZ: {
1271 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1272 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1273 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1274 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1275 .addImm(0);
1276 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1277 : AMDGPU::S_CSELECT_B64), SReg)
1278 .addImm(1)
1279 .addImm(0);
1280 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1281 .addImm(0)
1282 .addReg(FalseReg)
1283 .addImm(0)
1284 .addReg(TrueReg)
1285 .addReg(SReg);
1286 break;
1287 }
1288 case SIInstrInfo::EXECZ: {
1289 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1290 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1291 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1292 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1293 .addImm(0);
1294 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1295 : AMDGPU::S_CSELECT_B64), SReg)
1296 .addImm(0)
1297 .addImm(1);
1298 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1299 .addImm(0)
1300 .addReg(FalseReg)
1301 .addImm(0)
1302 .addReg(TrueReg)
1303 .addReg(SReg);
1304 llvm_unreachable("Unhandled branch predicate EXECZ");
1305 break;
1306 }
1307 default:
1308 llvm_unreachable("invalid branch predicate");
1309 }
1310 } else {
1311 llvm_unreachable("Can only handle Cond size 1 or 2");
1312 }
1313}
1314
1315Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1316 MachineBasicBlock::iterator I,
1317 const DebugLoc &DL,
1318 Register SrcReg, int Value) const {
1319 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1320 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1321 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1322 .addImm(Value)
1323 .addReg(SrcReg);
1324
1325 return Reg;
1326}
1327
1328Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1329 MachineBasicBlock::iterator I,
1330 const DebugLoc &DL,
1331 Register SrcReg, int Value) const {
1332 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1333 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1334 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1335 .addImm(Value)
1336 .addReg(SrcReg);
1337
1338 return Reg;
1339}
1340
1341bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1342 const Register Reg,
1343 int64_t &ImmVal) const {
1344 switch (MI.getOpcode()) {
1345 case AMDGPU::V_MOV_B32_e32:
1346 case AMDGPU::S_MOV_B32:
1347 case AMDGPU::S_MOVK_I32:
1348 case AMDGPU::S_MOV_B64:
1349 case AMDGPU::V_MOV_B64_e32:
1350 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1351 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1352 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1353 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1354 case AMDGPU::V_MOV_B64_PSEUDO: {
1355 const MachineOperand &Src0 = MI.getOperand(1);
1356 if (Src0.isImm()) {
1357 ImmVal = Src0.getImm();
1358 return MI.getOperand(0).getReg() == Reg;
1359 }
1360
1361 return false;
1362 }
1363 case AMDGPU::S_BREV_B32:
1364 case AMDGPU::V_BFREV_B32_e32:
1365 case AMDGPU::V_BFREV_B32_e64: {
1366 const MachineOperand &Src0 = MI.getOperand(1);
1367 if (Src0.isImm()) {
1368 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1369 return MI.getOperand(0).getReg() == Reg;
1370 }
1371
1372 return false;
1373 }
1374 case AMDGPU::S_NOT_B32:
1375 case AMDGPU::V_NOT_B32_e32:
1376 case AMDGPU::V_NOT_B32_e64: {
1377 const MachineOperand &Src0 = MI.getOperand(1);
1378 if (Src0.isImm()) {
1379 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1380 return MI.getOperand(0).getReg() == Reg;
1381 }
1382
1383 return false;
1384 }
1385 default:
1386 return false;
1387 }
1388}
1389
1390unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1391
1392 if (RI.isAGPRClass(DstRC))
1393 return AMDGPU::COPY;
1394 if (RI.getRegSizeInBits(*DstRC) == 16) {
1395 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1396 // before RA.
1397 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1398 }
1399 if (RI.getRegSizeInBits(*DstRC) == 32)
1400 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1401 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1402 return AMDGPU::S_MOV_B64;
1403 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1404 return AMDGPU::V_MOV_B64_PSEUDO;
1405 return AMDGPU::COPY;
1406}
1407
1408const MCInstrDesc &
1409SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1410 bool IsIndirectSrc) const {
1411 if (IsIndirectSrc) {
1412 if (VecSize <= 32) // 4 bytes
1413 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1414 if (VecSize <= 64) // 8 bytes
1415 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1416 if (VecSize <= 96) // 12 bytes
1417 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1418 if (VecSize <= 128) // 16 bytes
1419 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1420 if (VecSize <= 160) // 20 bytes
1421 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1422 if (VecSize <= 256) // 32 bytes
1423 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1424 if (VecSize <= 288) // 36 bytes
1425 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1426 if (VecSize <= 320) // 40 bytes
1427 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1428 if (VecSize <= 352) // 44 bytes
1429 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1430 if (VecSize <= 384) // 48 bytes
1431 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1432 if (VecSize <= 512) // 64 bytes
1433 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1434 if (VecSize <= 1024) // 128 bytes
1435 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1436
1437 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1438 }
1439
1440 if (VecSize <= 32) // 4 bytes
1441 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1442 if (VecSize <= 64) // 8 bytes
1443 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1444 if (VecSize <= 96) // 12 bytes
1445 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1446 if (VecSize <= 128) // 16 bytes
1447 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1448 if (VecSize <= 160) // 20 bytes
1449 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1450 if (VecSize <= 256) // 32 bytes
1451 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1452 if (VecSize <= 288) // 36 bytes
1453 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1454 if (VecSize <= 320) // 40 bytes
1455 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1456 if (VecSize <= 352) // 44 bytes
1457 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1458 if (VecSize <= 384) // 48 bytes
1459 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1460 if (VecSize <= 512) // 64 bytes
1461 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1462 if (VecSize <= 1024) // 128 bytes
1463 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1464
1465 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1466}
1467
1468static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1469 if (VecSize <= 32) // 4 bytes
1470 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1471 if (VecSize <= 64) // 8 bytes
1472 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1473 if (VecSize <= 96) // 12 bytes
1474 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1475 if (VecSize <= 128) // 16 bytes
1476 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1477 if (VecSize <= 160) // 20 bytes
1478 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1479 if (VecSize <= 256) // 32 bytes
1480 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1481 if (VecSize <= 288) // 36 bytes
1482 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1483 if (VecSize <= 320) // 40 bytes
1484 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1485 if (VecSize <= 352) // 44 bytes
1486 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1487 if (VecSize <= 384) // 48 bytes
1488 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1489 if (VecSize <= 512) // 64 bytes
1490 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1491 if (VecSize <= 1024) // 128 bytes
1492 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1493
1494 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1495}
1496
1497static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1498 if (VecSize <= 32) // 4 bytes
1499 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1500 if (VecSize <= 64) // 8 bytes
1501 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1502 if (VecSize <= 96) // 12 bytes
1503 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1504 if (VecSize <= 128) // 16 bytes
1505 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1506 if (VecSize <= 160) // 20 bytes
1507 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1508 if (VecSize <= 256) // 32 bytes
1509 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1510 if (VecSize <= 288) // 36 bytes
1511 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1512 if (VecSize <= 320) // 40 bytes
1513 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1514 if (VecSize <= 352) // 44 bytes
1515 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1516 if (VecSize <= 384) // 48 bytes
1517 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1518 if (VecSize <= 512) // 64 bytes
1519 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1520 if (VecSize <= 1024) // 128 bytes
1521 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1522
1523 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1524}
1525
1526static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1527 if (VecSize <= 64) // 8 bytes
1528 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1529 if (VecSize <= 128) // 16 bytes
1530 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1531 if (VecSize <= 256) // 32 bytes
1532 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1533 if (VecSize <= 512) // 64 bytes
1534 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1535 if (VecSize <= 1024) // 128 bytes
1536 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1537
1538 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1539}
1540
1541const MCInstrDesc &
1542SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1543 bool IsSGPR) const {
1544 if (IsSGPR) {
1545 switch (EltSize) {
1546 case 32:
1547 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1548 case 64:
1549 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1550 default:
1551 llvm_unreachable("invalid reg indexing elt size");
1552 }
1553 }
1554
1555 assert(EltSize == 32 && "invalid reg indexing elt size");
1556 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1557}
1558
1559static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1560 switch (Size) {
1561 case 4:
1562 return AMDGPU::SI_SPILL_S32_SAVE;
1563 case 8:
1564 return AMDGPU::SI_SPILL_S64_SAVE;
1565 case 12:
1566 return AMDGPU::SI_SPILL_S96_SAVE;
1567 case 16:
1568 return AMDGPU::SI_SPILL_S128_SAVE;
1569 case 20:
1570 return AMDGPU::SI_SPILL_S160_SAVE;
1571 case 24:
1572 return AMDGPU::SI_SPILL_S192_SAVE;
1573 case 28:
1574 return AMDGPU::SI_SPILL_S224_SAVE;
1575 case 32:
1576 return AMDGPU::SI_SPILL_S256_SAVE;
1577 case 36:
1578 return AMDGPU::SI_SPILL_S288_SAVE;
1579 case 40:
1580 return AMDGPU::SI_SPILL_S320_SAVE;
1581 case 44:
1582 return AMDGPU::SI_SPILL_S352_SAVE;
1583 case 48:
1584 return AMDGPU::SI_SPILL_S384_SAVE;
1585 case 64:
1586 return AMDGPU::SI_SPILL_S512_SAVE;
1587 case 128:
1588 return AMDGPU::SI_SPILL_S1024_SAVE;
1589 default:
1590 llvm_unreachable("unknown register size");
1591 }
1592}
1593
1594static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1595 switch (Size) {
1596 case 2:
1597 return AMDGPU::SI_SPILL_V16_SAVE;
1598 case 4:
1599 return AMDGPU::SI_SPILL_V32_SAVE;
1600 case 8:
1601 return AMDGPU::SI_SPILL_V64_SAVE;
1602 case 12:
1603 return AMDGPU::SI_SPILL_V96_SAVE;
1604 case 16:
1605 return AMDGPU::SI_SPILL_V128_SAVE;
1606 case 20:
1607 return AMDGPU::SI_SPILL_V160_SAVE;
1608 case 24:
1609 return AMDGPU::SI_SPILL_V192_SAVE;
1610 case 28:
1611 return AMDGPU::SI_SPILL_V224_SAVE;
1612 case 32:
1613 return AMDGPU::SI_SPILL_V256_SAVE;
1614 case 36:
1615 return AMDGPU::SI_SPILL_V288_SAVE;
1616 case 40:
1617 return AMDGPU::SI_SPILL_V320_SAVE;
1618 case 44:
1619 return AMDGPU::SI_SPILL_V352_SAVE;
1620 case 48:
1621 return AMDGPU::SI_SPILL_V384_SAVE;
1622 case 64:
1623 return AMDGPU::SI_SPILL_V512_SAVE;
1624 case 128:
1625 return AMDGPU::SI_SPILL_V1024_SAVE;
1626 default:
1627 llvm_unreachable("unknown register size");
1628 }
1629}
1630
1631static unsigned getAVSpillSaveOpcode(unsigned Size) {
1632 switch (Size) {
1633 case 4:
1634 return AMDGPU::SI_SPILL_AV32_SAVE;
1635 case 8:
1636 return AMDGPU::SI_SPILL_AV64_SAVE;
1637 case 12:
1638 return AMDGPU::SI_SPILL_AV96_SAVE;
1639 case 16:
1640 return AMDGPU::SI_SPILL_AV128_SAVE;
1641 case 20:
1642 return AMDGPU::SI_SPILL_AV160_SAVE;
1643 case 24:
1644 return AMDGPU::SI_SPILL_AV192_SAVE;
1645 case 28:
1646 return AMDGPU::SI_SPILL_AV224_SAVE;
1647 case 32:
1648 return AMDGPU::SI_SPILL_AV256_SAVE;
1649 case 36:
1650 return AMDGPU::SI_SPILL_AV288_SAVE;
1651 case 40:
1652 return AMDGPU::SI_SPILL_AV320_SAVE;
1653 case 44:
1654 return AMDGPU::SI_SPILL_AV352_SAVE;
1655 case 48:
1656 return AMDGPU::SI_SPILL_AV384_SAVE;
1657 case 64:
1658 return AMDGPU::SI_SPILL_AV512_SAVE;
1659 case 128:
1660 return AMDGPU::SI_SPILL_AV1024_SAVE;
1661 default:
1662 llvm_unreachable("unknown register size");
1663 }
1664}
1665
1666static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1667 bool IsVectorSuperClass) {
1668 // Currently, only 32-bit WWM register spills are needed.
1669 if (Size != 4)
1670 llvm_unreachable("unknown wwm register spill size");
1671
1672 if (IsVectorSuperClass)
1673 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1674
1675 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1676}
1677
1678unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1679 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1680 const SIMachineFunctionInfo &MFI) const {
1681 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1682
1683 // Choose the right opcode if spilling a WWM register.
1684 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1685 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1686
1687 // TODO: Check if AGPRs are available
1688 if (ST.hasMAIInsts())
1689 return getAVSpillSaveOpcode(Size);
1690
1691 return getVGPRSpillSaveOpcode(Size);
1692}
1693
1694void SIInstrInfo::storeRegToStackSlot(
1695 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1696 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1697 const TargetRegisterInfo *TRI, Register VReg,
1698 MachineInstr::MIFlag Flags) const {
1701 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1702 const DebugLoc &DL = MBB.findDebugLoc(MI);
1703
1704 MachinePointerInfo PtrInfo
1705 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1706 MachineMemOperand *MMO = MF->getMachineMemOperand(
1707 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1708 FrameInfo.getObjectAlign(FrameIndex));
1709 unsigned SpillSize = TRI->getSpillSize(*RC);
1710
1711 MachineRegisterInfo &MRI = MF->getRegInfo();
1712 if (RI.isSGPRClass(RC)) {
1713 MFI->setHasSpilledSGPRs();
1714 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1715 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1716 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1717
1718 // We are only allowed to create one new instruction when spilling
1719 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1720 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1721
1722 // The SGPR spill/restore instructions only work on numbered SGPRs, so we
1723 // need to make sure we are using the correct register class.
1724 if (SrcReg.isVirtual() && SpillSize == 4) {
1725 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1726 }
1727
1728 BuildMI(MBB, MI, DL, OpDesc)
1729 .addReg(SrcReg, getKillRegState(isKill)) // data
1730 .addFrameIndex(FrameIndex) // addr
1731 .addMemOperand(MMO)
1733
1734 if (RI.spillSGPRToVGPR())
1735 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1736 return;
1737 }
1738
1739 unsigned Opcode =
1740 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1741 MFI->setHasSpilledVGPRs();
1742
1743 BuildMI(MBB, MI, DL, get(Opcode))
1744 .addReg(SrcReg, getKillRegState(isKill)) // data
1745 .addFrameIndex(FrameIndex) // addr
1746 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1747 .addImm(0) // offset
1748 .addMemOperand(MMO);
1749}
1750
1751static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1752 switch (Size) {
1753 case 4:
1754 return AMDGPU::SI_SPILL_S32_RESTORE;
1755 case 8:
1756 return AMDGPU::SI_SPILL_S64_RESTORE;
1757 case 12:
1758 return AMDGPU::SI_SPILL_S96_RESTORE;
1759 case 16:
1760 return AMDGPU::SI_SPILL_S128_RESTORE;
1761 case 20:
1762 return AMDGPU::SI_SPILL_S160_RESTORE;
1763 case 24:
1764 return AMDGPU::SI_SPILL_S192_RESTORE;
1765 case 28:
1766 return AMDGPU::SI_SPILL_S224_RESTORE;
1767 case 32:
1768 return AMDGPU::SI_SPILL_S256_RESTORE;
1769 case 36:
1770 return AMDGPU::SI_SPILL_S288_RESTORE;
1771 case 40:
1772 return AMDGPU::SI_SPILL_S320_RESTORE;
1773 case 44:
1774 return AMDGPU::SI_SPILL_S352_RESTORE;
1775 case 48:
1776 return AMDGPU::SI_SPILL_S384_RESTORE;
1777 case 64:
1778 return AMDGPU::SI_SPILL_S512_RESTORE;
1779 case 128:
1780 return AMDGPU::SI_SPILL_S1024_RESTORE;
1781 default:
1782 llvm_unreachable("unknown register size");
1783 }
1784}
1785
1786static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1787 switch (Size) {
1788 case 2:
1789 return AMDGPU::SI_SPILL_V16_RESTORE;
1790 case 4:
1791 return AMDGPU::SI_SPILL_V32_RESTORE;
1792 case 8:
1793 return AMDGPU::SI_SPILL_V64_RESTORE;
1794 case 12:
1795 return AMDGPU::SI_SPILL_V96_RESTORE;
1796 case 16:
1797 return AMDGPU::SI_SPILL_V128_RESTORE;
1798 case 20:
1799 return AMDGPU::SI_SPILL_V160_RESTORE;
1800 case 24:
1801 return AMDGPU::SI_SPILL_V192_RESTORE;
1802 case 28:
1803 return AMDGPU::SI_SPILL_V224_RESTORE;
1804 case 32:
1805 return AMDGPU::SI_SPILL_V256_RESTORE;
1806 case 36:
1807 return AMDGPU::SI_SPILL_V288_RESTORE;
1808 case 40:
1809 return AMDGPU::SI_SPILL_V320_RESTORE;
1810 case 44:
1811 return AMDGPU::SI_SPILL_V352_RESTORE;
1812 case 48:
1813 return AMDGPU::SI_SPILL_V384_RESTORE;
1814 case 64:
1815 return AMDGPU::SI_SPILL_V512_RESTORE;
1816 case 128:
1817 return AMDGPU::SI_SPILL_V1024_RESTORE;
1818 default:
1819 llvm_unreachable("unknown register size");
1820 }
1821}
1822
1823static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1824 switch (Size) {
1825 case 4:
1826 return AMDGPU::SI_SPILL_AV32_RESTORE;
1827 case 8:
1828 return AMDGPU::SI_SPILL_AV64_RESTORE;
1829 case 12:
1830 return AMDGPU::SI_SPILL_AV96_RESTORE;
1831 case 16:
1832 return AMDGPU::SI_SPILL_AV128_RESTORE;
1833 case 20:
1834 return AMDGPU::SI_SPILL_AV160_RESTORE;
1835 case 24:
1836 return AMDGPU::SI_SPILL_AV192_RESTORE;
1837 case 28:
1838 return AMDGPU::SI_SPILL_AV224_RESTORE;
1839 case 32:
1840 return AMDGPU::SI_SPILL_AV256_RESTORE;
1841 case 36:
1842 return AMDGPU::SI_SPILL_AV288_RESTORE;
1843 case 40:
1844 return AMDGPU::SI_SPILL_AV320_RESTORE;
1845 case 44:
1846 return AMDGPU::SI_SPILL_AV352_RESTORE;
1847 case 48:
1848 return AMDGPU::SI_SPILL_AV384_RESTORE;
1849 case 64:
1850 return AMDGPU::SI_SPILL_AV512_RESTORE;
1851 case 128:
1852 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1853 default:
1854 llvm_unreachable("unknown register size");
1855 }
1856}
1857
1858static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1859 bool IsVectorSuperClass) {
1860 // Currently, only 32-bit WWM register spills are needed.
1861 if (Size != 4)
1862 llvm_unreachable("unknown wwm register spill size");
1863
1864 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1865 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1866
1867 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1868}
1869
1871 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1872 const SIMachineFunctionInfo &MFI) const {
1873 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1874
1875 // Choose the right opcode if restoring a WWM register.
1876 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1877 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1878
1879 // TODO: Check if AGPRs are available
1880 if (ST.hasMAIInsts())
1881 return getAVSpillRestoreOpcode(Size);
1882
1883 assert(!RI.isAGPRClass(RC));
1884 return getVGPRSpillRestoreOpcode(Size);
1885}
1886
1889 Register DestReg, int FrameIndex,
1890 const TargetRegisterClass *RC,
1891 const TargetRegisterInfo *TRI,
1892 Register VReg,
1893 MachineInstr::MIFlag Flags) const {
1896 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1897 const DebugLoc &DL = MBB.findDebugLoc(MI);
1898 unsigned SpillSize = TRI->getSpillSize(*RC);
1899
1900 MachinePointerInfo PtrInfo
1901 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1902
1904 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1905 FrameInfo.getObjectAlign(FrameIndex));
1906
1907 if (RI.isSGPRClass(RC)) {
1908 MFI->setHasSpilledSGPRs();
1909 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1910 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1911 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1912
1913 // FIXME: Maybe this should not include a memoperand because it will be
1914 // lowered to non-memory instructions.
1915 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1916 if (DestReg.isVirtual() && SpillSize == 4) {
1918 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1919 }
1920
1921 if (RI.spillSGPRToVGPR())
1922 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1923 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1924 .addFrameIndex(FrameIndex) // addr
1925 .addMemOperand(MMO)
1927
1928 return;
1929 }
1930
1931 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1932 SpillSize, *MFI);
1933 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1934 .addFrameIndex(FrameIndex) // vaddr
1935 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1936 .addImm(0) // offset
1937 .addMemOperand(MMO);
1938}
1939
1942 insertNoops(MBB, MI, 1);
1943}
1944
1947 unsigned Quantity) const {
1949 while (Quantity > 0) {
1950 unsigned Arg = std::min(Quantity, 8u);
1951 Quantity -= Arg;
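// A single S_NOP encodes up to 8 no-ops; its immediate holds the count minus one.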
1952 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1953 }
1954}
1955
1957 auto *MF = MBB.getParent();
1959
1960 assert(Info->isEntryFunction());
1961
1962 if (MBB.succ_empty()) {
1963 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1964 if (HasNoTerminator) {
1965 if (Info->returnsVoid()) {
1966 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1967 } else {
1968 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1969 }
1970 }
1971 }
1972}
1973
1977 const DebugLoc &DL) const {
1979 constexpr unsigned DoorbellIDMask = 0x3ff;
1980 constexpr unsigned ECQueueWaveAbort = 0x400;
1981
1982 MachineBasicBlock *TrapBB = &MBB;
1983 MachineBasicBlock *ContBB = &MBB;
1984 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1985
1986 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1987 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1988 TrapBB = MF->CreateMachineBasicBlock();
1989 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1990 MF->push_back(TrapBB);
1991 MBB.addSuccessor(TrapBB);
1992 }
1993
1994 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
1995 // this will be a nop.
1996 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
1997 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
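// Tell the queue this wave aborted: read the doorbell ID, set the wave-abort
// bit, and send it back with s_sendmsg, preserving m0 in ttmp2.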
1998 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1999 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2000 DoorbellReg)
2002 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2003 .addUse(AMDGPU::M0);
2004 Register DoorbellRegMasked =
2005 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2006 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2007 .addUse(DoorbellReg)
2008 .addImm(DoorbellIDMask);
2009 Register SetWaveAbortBit =
2010 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2011 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2012 .addUse(DoorbellRegMasked)
2013 .addImm(ECQueueWaveAbort);
2014 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2015 .addUse(SetWaveAbortBit);
2016 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2018 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2019 .addUse(AMDGPU::TTMP2);
2020 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2021 TrapBB->addSuccessor(HaltLoopBB);
2022
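// Park the wave: set halt and spin on a branch back to the same block.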
2023 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2024 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2025 .addMBB(HaltLoopBB);
2026 MF->push_back(HaltLoopBB);
2027 HaltLoopBB->addSuccessor(HaltLoopBB);
2028
2029 return ContBB;
2030}
2031
2033 switch (MI.getOpcode()) {
2034 default:
2035 if (MI.isMetaInstruction())
2036 return 0;
2037 return 1; // FIXME: Do wait states equal cycles?
2038
2039 case AMDGPU::S_NOP:
2040 return MI.getOperand(0).getImm() + 1;
2041 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2042 // hazard, even if one exists, won't really be visible. Should we handle it?
2043 }
2044}
2045
2047 MachineBasicBlock &MBB = *MI.getParent();
2049 switch (MI.getOpcode()) {
2050 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2051 case AMDGPU::S_MOV_B64_term:
2052 // This is only a terminator to get the correct spill code placement during
2053 // register allocation.
2054 MI.setDesc(get(AMDGPU::S_MOV_B64));
2055 break;
2056
2057 case AMDGPU::S_MOV_B32_term:
2058 // This is only a terminator to get the correct spill code placement during
2059 // register allocation.
2060 MI.setDesc(get(AMDGPU::S_MOV_B32));
2061 break;
2062
2063 case AMDGPU::S_XOR_B64_term:
2064 // This is only a terminator to get the correct spill code placement during
2065 // register allocation.
2066 MI.setDesc(get(AMDGPU::S_XOR_B64));
2067 break;
2068
2069 case AMDGPU::S_XOR_B32_term:
2070 // This is only a terminator to get the correct spill code placement during
2071 // register allocation.
2072 MI.setDesc(get(AMDGPU::S_XOR_B32));
2073 break;
2074 case AMDGPU::S_OR_B64_term:
2075 // This is only a terminator to get the correct spill code placement during
2076 // register allocation.
2077 MI.setDesc(get(AMDGPU::S_OR_B64));
2078 break;
2079 case AMDGPU::S_OR_B32_term:
2080 // This is only a terminator to get the correct spill code placement during
2081 // register allocation.
2082 MI.setDesc(get(AMDGPU::S_OR_B32));
2083 break;
2084
2085 case AMDGPU::S_ANDN2_B64_term:
2086 // This is only a terminator to get the correct spill code placement during
2087 // register allocation.
2088 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2089 break;
2090
2091 case AMDGPU::S_ANDN2_B32_term:
2092 // This is only a terminator to get the correct spill code placement during
2093 // register allocation.
2094 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2095 break;
2096
2097 case AMDGPU::S_AND_B64_term:
2098 // This is only a terminator to get the correct spill code placement during
2099 // register allocation.
2100 MI.setDesc(get(AMDGPU::S_AND_B64));
2101 break;
2102
2103 case AMDGPU::S_AND_B32_term:
2104 // This is only a terminator to get the correct spill code placement during
2105 // register allocation.
2106 MI.setDesc(get(AMDGPU::S_AND_B32));
2107 break;
2108
2109 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2110 // This is only a terminator to get the correct spill code placement during
2111 // register allocation.
2112 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2113 break;
2114
2115 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2116 // This is only a terminator to get the correct spill code placement during
2117 // register allocation.
2118 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2119 break;
2120
2121 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2122 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2123 break;
2124
2125 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2126 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2127 MI.getMF()->getRegInfo().constrainRegClass(MI.getOperand(0).getReg(),
2128 &AMDGPU::SReg_32_XM0RegClass);
2129 break;
2130 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2131 Register Dst = MI.getOperand(0).getReg();
2132 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2133 MI.setDesc(
2134 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2135 break;
2136 }
2137 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2138 Register Dst = MI.getOperand(0).getReg();
2139 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2140 int64_t Imm = MI.getOperand(1).getImm();
2141
2142 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2143 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2144 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2145 .addImm(SignExtend64<32>(Imm))
2147 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2148 .addImm(SignExtend64<32>(Imm >> 32))
2150 MI.eraseFromParent();
2151 break;
2152 }
2153
2154 [[fallthrough]];
2155 }
2156 case AMDGPU::V_MOV_B64_PSEUDO: {
2157 Register Dst = MI.getOperand(0).getReg();
2158 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2159 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2160
2161 const MachineOperand &SrcOp = MI.getOperand(1);
2162 // FIXME: Will this work for 64-bit floating point immediates?
2163 assert(!SrcOp.isFPImm());
2164 if (ST.hasMovB64()) {
2165 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2166 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2167 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2168 break;
2169 }
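// A single 64-bit move cannot be used here; lower to two 32-bit moves, or to
// V_PK_MOV_B32 where the subtarget supports it.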
2170 if (SrcOp.isImm()) {
2171 APInt Imm(64, SrcOp.getImm());
2172 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2173 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2174 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2175 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2177 .addImm(Lo.getSExtValue())
2179 .addImm(Lo.getSExtValue())
2180 .addImm(0) // op_sel_lo
2181 .addImm(0) // op_sel_hi
2182 .addImm(0) // neg_lo
2183 .addImm(0) // neg_hi
2184 .addImm(0); // clamp
2185 } else {
2186 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2187 .addImm(Lo.getSExtValue())
2189 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2190 .addImm(Hi.getSExtValue())
2192 }
2193 } else {
2194 assert(SrcOp.isReg());
2195 if (ST.hasPkMovB32() &&
2196 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2197 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2198 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2199 .addReg(SrcOp.getReg())
2201 .addReg(SrcOp.getReg())
2202 .addImm(0) // op_sel_lo
2203 .addImm(0) // op_sel_hi
2204 .addImm(0) // neg_lo
2205 .addImm(0) // neg_hi
2206 .addImm(0); // clamp
2207 } else {
2208 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2209 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2211 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2212 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2214 }
2215 }
2216 MI.eraseFromParent();
2217 break;
2218 }
2219 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2221 break;
2222 }
2223 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2224 const MachineOperand &SrcOp = MI.getOperand(1);
2225 assert(!SrcOp.isFPImm());
2226
2227 if (ST.has64BitLiterals()) {
2228 MI.setDesc(get(AMDGPU::S_MOV_B64));
2229 break;
2230 }
2231
2232 APInt Imm(64, SrcOp.getImm());
2233 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2234 MI.setDesc(get(AMDGPU::S_MOV_B64));
2235 break;
2236 }
2237
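// The literal neither fits in 32 bits nor is an inline constant; materialize
// it with two S_MOV_B32.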
2238 Register Dst = MI.getOperand(0).getReg();
2239 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2240 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2241
2242 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2243 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2244 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2245 .addImm(Lo.getSExtValue())
2247 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2248 .addImm(Hi.getSExtValue())
2250 MI.eraseFromParent();
2251 break;
2252 }
2253 case AMDGPU::V_SET_INACTIVE_B32: {
2254 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2255 Register DstReg = MI.getOperand(0).getReg();
2256 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2257 .add(MI.getOperand(3))
2258 .add(MI.getOperand(4))
2259 .add(MI.getOperand(1))
2260 .add(MI.getOperand(2))
2261 .add(MI.getOperand(5));
2262 MI.eraseFromParent();
2263 break;
2264 }
2265 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2266 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2267 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2268 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2269 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2270 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2271 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2272 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2273 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2274 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2275 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2276 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2277 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2278 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2279 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2280 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2281 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2282 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2283 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2284 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2285 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2286 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2287 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2288 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2289 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2290 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2291 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2292 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2293 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2294 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2295
2296 unsigned Opc;
2297 if (RI.hasVGPRs(EltRC)) {
2298 Opc = AMDGPU::V_MOVRELD_B32_e32;
2299 } else {
2300 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2301 : AMDGPU::S_MOVRELD_B32;
2302 }
2303
2304 const MCInstrDesc &OpDesc = get(Opc);
2305 Register VecReg = MI.getOperand(0).getReg();
2306 bool IsUndef = MI.getOperand(1).isUndef();
2307 unsigned SubReg = MI.getOperand(3).getImm();
2308 assert(VecReg == MI.getOperand(1).getReg());
2309
2311 BuildMI(MBB, MI, DL, OpDesc)
2312 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2313 .add(MI.getOperand(2))
2315 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2316
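// Tie the implicit def of the vector register to its implicit use, modelling
// the partial write as a read-modify-write of the full register.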
2317 const int ImpDefIdx =
2318 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2319 const int ImpUseIdx = ImpDefIdx + 1;
2320 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2321 MI.eraseFromParent();
2322 break;
2323 }
2324 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2325 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2326 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2327 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2328 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2329 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2330 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2331 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2332 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2333 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2334 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2335 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2337 Register VecReg = MI.getOperand(0).getReg();
2338 bool IsUndef = MI.getOperand(1).isUndef();
2339 MachineOperand &Idx = MI.getOperand(3);
2340 Register SubReg = MI.getOperand(4).getImm();
2341
2342 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2343 .add(Idx)
2345 SetOn->getOperand(3).setIsUndef();
2346
2347 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2349 BuildMI(MBB, MI, DL, OpDesc)
2350 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2351 .add(MI.getOperand(2))
2353 .addReg(VecReg,
2354 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2355
2356 const int ImpDefIdx =
2357 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2358 const int ImpUseIdx = ImpDefIdx + 1;
2359 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2360
2361 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2362
2363 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2364
2365 MI.eraseFromParent();
2366 break;
2367 }
2368 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2369 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2370 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2371 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2372 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2373 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2374 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2375 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2376 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2377 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2378 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2379 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2381 Register Dst = MI.getOperand(0).getReg();
2382 Register VecReg = MI.getOperand(1).getReg();
2383 bool IsUndef = MI.getOperand(1).isUndef();
2384 Register Idx = MI.getOperand(2).getReg();
2385 Register SubReg = MI.getOperand(3).getImm();
2386
2387 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2388 .addReg(Idx)
2390 SetOn->getOperand(3).setIsUndef();
2391
2392 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2393 .addDef(Dst)
2394 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2395 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2396
2397 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2398
2399 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2400
2401 MI.eraseFromParent();
2402 break;
2403 }
2404 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2405 MachineFunction &MF = *MBB.getParent();
2406 Register Reg = MI.getOperand(0).getReg();
2407 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2408 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2409 MachineOperand OpLo = MI.getOperand(1);
2410 MachineOperand OpHi = MI.getOperand(2);
2411
2412 // Create a bundle so these instructions won't be re-ordered by the
2413 // post-RA scheduler.
2414 MIBundleBuilder Bundler(MBB, MI);
2415 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2416
2417 // What we want here is an offset from the value returned by s_getpc (which
2418 // is the address of the s_add_u32 instruction) to the global variable, but
2419 // since the encoding of $symbol starts 4 bytes after the start of the
2420 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2421 // small. This requires us to add 4 to the global variable offset in order
2422 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2423 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2424 // instruction.
2425
2426 int64_t Adjust = 0;
2427 if (ST.hasGetPCZeroExtension()) {
2428 // Fix up hardware that does not sign-extend the 48-bit PC value by
2429 // inserting: s_sext_i32_i16 reghi, reghi
2430 Bundler.append(
2431 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2432 Adjust += 4;
2433 }
2434
2435 if (OpLo.isGlobal())
2436 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2437 Bundler.append(
2438 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2439
2440 if (OpHi.isGlobal())
2441 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2442 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2443 .addReg(RegHi)
2444 .add(OpHi));
2445
2446 finalizeBundle(MBB, Bundler.begin());
2447
2448 MI.eraseFromParent();
2449 break;
2450 }
2451 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2452 MachineFunction &MF = *MBB.getParent();
2453 Register Reg = MI.getOperand(0).getReg();
2454 MachineOperand Op = MI.getOperand(1);
2455
2456 // Create a bundle so these instructions won't be re-ordered by the
2457 // post-RA scheduler.
2458 MIBundleBuilder Bundler(MBB, MI);
2459 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2460 if (Op.isGlobal())
2461 Op.setOffset(Op.getOffset() + 4);
2462 Bundler.append(
2463 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2464
2465 finalizeBundle(MBB, Bundler.begin());
2466
2467 MI.eraseFromParent();
2468 break;
2469 }
2470 case AMDGPU::ENTER_STRICT_WWM: {
2471 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2472 // Whole Wave Mode is entered.
2473 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2474 : AMDGPU::S_OR_SAVEEXEC_B64));
2475 break;
2476 }
2477 case AMDGPU::ENTER_STRICT_WQM: {
2478 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2479 // STRICT_WQM is entered.
2480 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2481 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2482 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2483 BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2484 BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2485
2486 MI.eraseFromParent();
2487 break;
2488 }
2489 case AMDGPU::EXIT_STRICT_WWM:
2490 case AMDGPU::EXIT_STRICT_WQM: {
2491 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2492 // WWM/STRICT_WQM is exited.
2493 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2494 break;
2495 }
2496 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2497 case AMDGPU::SI_RETURN: {
2498 const MachineFunction *MF = MBB.getParent();
2499 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2500 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2501 // Hiding the return address use with SI_RETURN may lead to extra kills in
2502 // the function and missing live-ins. We are fine in practice because callee
2503 // saved register handling ensures the register value is restored before
2504 // RET, but we need the undef flag here to appease the MachineVerifier
2505 // liveness checks.
2507 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2508 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2509
2510 MIB.copyImplicitOps(MI);
2511 MI.eraseFromParent();
2512 break;
2513 }
2514
2515 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2516 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2517 MI.setDesc(get(AMDGPU::S_MUL_U64));
2518 break;
2519
2520 case AMDGPU::S_GETPC_B64_pseudo:
2521 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2522 if (ST.hasGetPCZeroExtension()) {
2523 Register Dst = MI.getOperand(0).getReg();
2524 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2525 // Fix up hardware that does not sign-extend the 48-bit PC value by
2526 // inserting: s_sext_i32_i16 dsthi, dsthi
2527 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2528 DstHi)
2529 .addReg(DstHi);
2530 }
2531 break;
2532
2533 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2535 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2536 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2537 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2538 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2539 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2540 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2541 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2542 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2543 break;
2544 }
2545
2546 return true;
2547}
2548
2551 unsigned SubIdx, const MachineInstr &Orig,
2552 const TargetRegisterInfo &RI) const {
2553
2554 // Try shrinking the instruction to remat only the part needed for the
2555 // current context.
2556 // TODO: Handle more cases.
2557 unsigned Opcode = Orig.getOpcode();
2558 switch (Opcode) {
2559 case AMDGPU::S_LOAD_DWORDX16_IMM:
2560 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2561 if (SubIdx != 0)
2562 break;
2563
2564 if (I == MBB.end())
2565 break;
2566
2567 if (I->isBundled())
2568 break;
2569
2570 // Look for a single use of the register that is also a subreg.
2571 Register RegToFind = Orig.getOperand(0).getReg();
2572 MachineOperand *UseMO = nullptr;
2573 for (auto &CandMO : I->operands()) {
2574 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2575 continue;
2576 if (UseMO) {
2577 UseMO = nullptr;
2578 break;
2579 }
2580 UseMO = &CandMO;
2581 }
2582 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2583 break;
2584
2585 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2586 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2587
2590 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2591
2592 unsigned NewOpcode = -1;
2593 if (SubregSize == 256)
2594 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2595 else if (SubregSize == 128)
2596 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2597 else
2598 break;
2599
2600 const MCInstrDesc &TID = get(NewOpcode);
2601 const TargetRegisterClass *NewRC =
2602 RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
2603 MRI.setRegClass(DestReg, NewRC);
2604
2605 UseMO->setReg(DestReg);
2606 UseMO->setSubReg(AMDGPU::NoSubRegister);
2607
2608 // Use a smaller load with the desired size, possibly with updated offset.
2609 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2610 MI->setDesc(TID);
2611 MI->getOperand(0).setReg(DestReg);
2612 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
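// The subreg offset is in bits; convert it to bytes before adjusting the
// load's immediate offset.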
2613 if (Offset) {
2614 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2615 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2616 OffsetMO->setImm(FinalOffset);
2617 }
2619 for (const MachineMemOperand *MemOp : Orig.memoperands())
2620 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2621 SubregSize / 8));
2622 MI->setMemRefs(*MF, NewMMOs);
2623
2624 MBB.insert(I, MI);
2625 return;
2626 }
2627
2628 default:
2629 break;
2630 }
2631
2632 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2633}
2634
2635std::pair<MachineInstr*, MachineInstr*>
2637 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2638
2639 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2641 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2642 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2643 return std::pair(&MI, nullptr);
2644 }
2645
2646 MachineBasicBlock &MBB = *MI.getParent();
2650 Register Dst = MI.getOperand(0).getReg();
2651 unsigned Part = 0;
2652 MachineInstr *Split[2];
2653
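// Expand to one V_MOV_B32_dpp per 32-bit half; virtual destinations are
// recombined with a REG_SEQUENCE afterwards.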
2654 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2655 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2656 if (Dst.isPhysical()) {
2657 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2658 } else {
2659 assert(MRI.isSSA());
2660 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2661 MovDPP.addDef(Tmp);
2662 }
2663
2664 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2665 const MachineOperand &SrcOp = MI.getOperand(I);
2666 assert(!SrcOp.isFPImm());
2667 if (SrcOp.isImm()) {
2668 APInt Imm(64, SrcOp.getImm());
2669 Imm.ashrInPlace(Part * 32);
2670 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2671 } else {
2672 assert(SrcOp.isReg());
2673 Register Src = SrcOp.getReg();
2674 if (Src.isPhysical())
2675 MovDPP.addReg(RI.getSubReg(Src, Sub));
2676 else
2677 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2678 }
2679 }
2680
2681 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2682 MovDPP.addImm(MO.getImm());
2683
2684 Split[Part] = MovDPP;
2685 ++Part;
2686 }
2687
2688 if (Dst.isVirtual())
2689 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2690 .addReg(Split[0]->getOperand(0).getReg())
2691 .addImm(AMDGPU::sub0)
2692 .addReg(Split[1]->getOperand(0).getReg())
2693 .addImm(AMDGPU::sub1);
2694
2695 MI.eraseFromParent();
2696 return std::pair(Split[0], Split[1]);
2697}
2698
2699std::optional<DestSourcePair>
2701 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2702 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2703
2704 return std::nullopt;
2705}
2706
2708 AMDGPU::OpName Src0OpName,
2709 MachineOperand &Src1,
2710 AMDGPU::OpName Src1OpName) const {
2711 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2712 if (!Src0Mods)
2713 return false;
2714
2715 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2716 assert(Src1Mods &&
2717 "All commutable instructions have both src0 and src1 modifiers");
2718
2719 int Src0ModsVal = Src0Mods->getImm();
2720 int Src1ModsVal = Src1Mods->getImm();
2721
2722 Src1Mods->setImm(Src0ModsVal);
2723 Src0Mods->setImm(Src1ModsVal);
2724 return true;
2725}
2726
2728 MachineOperand &RegOp,
2729 MachineOperand &NonRegOp) {
2730 Register Reg = RegOp.getReg();
2731 unsigned SubReg = RegOp.getSubReg();
2732 bool IsKill = RegOp.isKill();
2733 bool IsDead = RegOp.isDead();
2734 bool IsUndef = RegOp.isUndef();
2735 bool IsDebug = RegOp.isDebug();
2736
2737 if (NonRegOp.isImm())
2738 RegOp.ChangeToImmediate(NonRegOp.getImm());
2739 else if (NonRegOp.isFI())
2740 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2741 else if (NonRegOp.isGlobal()) {
2742 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2743 NonRegOp.getTargetFlags());
2744 } else
2745 return nullptr;
2746
2747 // Make sure we don't reinterpret a subreg index in the target flags.
2748 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2749
2750 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2751 NonRegOp.setSubReg(SubReg);
2752
2753 return &MI;
2754}
2755
2757 MachineOperand &NonRegOp1,
2758 MachineOperand &NonRegOp2) {
2759 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2760 int64_t NonRegVal = NonRegOp1.getImm();
2761
2762 NonRegOp1.setImm(NonRegOp2.getImm());
2763 NonRegOp2.setImm(NonRegVal);
2764 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2765 NonRegOp2.setTargetFlags(TargetFlags);
2766 return &MI;
2767}
2768
2769bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2770 unsigned OpIdx1) const {
2771 const MCInstrDesc &InstDesc = MI.getDesc();
2772 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2773 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2774
2775 unsigned Opc = MI.getOpcode();
2776 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2777
2778 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2779 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2780
2781 // Check that the swap does not breach the constant bus or literal limits.
2782 // It may move a literal to a position other than src0, which is not allowed
2783 // pre-gfx10. However, most test cases need literals in Src0 for VOP.
2784 // FIXME: After gfx9, a literal can be placed somewhere other than Src0.
2785 if (isVALU(MI)) {
2786 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2787 !isInlineConstant(MO0, OpInfo1))
2788 return false;
2789 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2790 !isInlineConstant(MO1, OpInfo0))
2791 return false;
2792 }
2793
2794 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2795 if (OpInfo1.RegClass == -1)
2796 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2797 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2798 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2799 }
2800 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2801 if (OpInfo0.RegClass == -1)
2802 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2803 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2804 isLegalRegOperand(MI, OpIdx0, MO1);
2805 }
2806
2807 // No need to check 64-bit literals, since swapping does not bring new
2808 // 64-bit literals into the current instruction to fold to 32 bits.
2809
2810 return isImmOperandLegal(MI, OpIdx1, MO0);
2811}
2812
2814 unsigned Src0Idx,
2815 unsigned Src1Idx) const {
2816 assert(!NewMI && "this should never be used");
2817
2818 unsigned Opc = MI.getOpcode();
2819 int CommutedOpcode = commuteOpcode(Opc);
2820 if (CommutedOpcode == -1)
2821 return nullptr;
2822
2823 if (Src0Idx > Src1Idx)
2824 std::swap(Src0Idx, Src1Idx);
2825
2826 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2827 static_cast<int>(Src0Idx) &&
2828 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2829 static_cast<int>(Src1Idx) &&
2830 "inconsistency with findCommutedOpIndices");
2831
2832 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2833 return nullptr;
2834
2835 MachineInstr *CommutedMI = nullptr;
2836 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2837 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2838 if (Src0.isReg() && Src1.isReg()) {
2839 // Be sure to copy the source modifiers to the right place.
2840 CommutedMI =
2841 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2842 } else if (Src0.isReg() && !Src1.isReg()) {
2843 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2844 } else if (!Src0.isReg() && Src1.isReg()) {
2845 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2846 } else if (Src0.isImm() && Src1.isImm()) {
2847 CommutedMI = swapImmOperands(MI, Src0, Src1);
2848 } else {
2849 // FIXME: Found two non registers to commute. This does happen.
2850 return nullptr;
2851 }
2852
2853 if (CommutedMI) {
2854 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2855 Src1, AMDGPU::OpName::src1_modifiers);
2856
2857 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2858 AMDGPU::OpName::src1_sel);
2859
2860 CommutedMI->setDesc(get(CommutedOpcode));
2861 }
2862
2863 return CommutedMI;
2864}
2865
2866// This needs to be implemented because the source modifiers may be inserted
2867// between the true commutable operands, and the base
2868// TargetInstrInfo::commuteInstruction uses it.
2870 unsigned &SrcOpIdx0,
2871 unsigned &SrcOpIdx1) const {
2872 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2873}
2874
2876 unsigned &SrcOpIdx0,
2877 unsigned &SrcOpIdx1) const {
2878 if (!Desc.isCommutable())
2879 return false;
2880
2881 unsigned Opc = Desc.getOpcode();
2882 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2883 if (Src0Idx == -1)
2884 return false;
2885
2886 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2887 if (Src1Idx == -1)
2888 return false;
2889
2890 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2891}
2892
2894 int64_t BrOffset) const {
2895 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2896 // because its dest block is unanalyzable.
2897 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2898
2899 // Convert to dwords.
2900 BrOffset /= 4;
2901
2902 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2903 // from the next instruction.
2904 BrOffset -= 1;
2905
2906 return isIntN(BranchOffsetBits, BrOffset);
2907}
2908
2911 return MI.getOperand(0).getMBB();
2912}
2913
2915 for (const MachineInstr &MI : MBB->terminators()) {
2916 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2917 MI.getOpcode() == AMDGPU::SI_LOOP)
2918 return true;
2919 }
2920 return false;
2921}
2922
2924 MachineBasicBlock &DestBB,
2925 MachineBasicBlock &RestoreBB,
2926 const DebugLoc &DL, int64_t BrOffset,
2927 RegScavenger *RS) const {
2928 assert(MBB.empty() &&
2929 "new block should be inserted for expanding unconditional branch");
2930 assert(MBB.pred_size() == 1);
2931 assert(RestoreBB.empty() &&
2932 "restore block should be inserted for restoring clobbered registers");
2933
2937 auto I = MBB.end();
2938 auto &MCCtx = MF->getContext();
2939
2940 if (ST.hasAddPC64Inst()) {
2941 MCSymbol *Offset =
2942 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
2943 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
2945 MCSymbol *PostAddPCLabel =
2946 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
2947 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
2948 auto *OffsetExpr = MCBinaryExpr::createSub(
2949 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2950 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
2951 Offset->setVariableValue(OffsetExpr);
2952 return;
2953 }
2954
2955 assert(RS && "RegScavenger required for long branching");
2956
2957 // FIXME: Virtual register workaround for RegScavenger not working with empty
2958 // blocks.
2959 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2960
2961 // Note: as this is used after the hazard recognizer, we need to apply some
2962 // hazard workarounds directly.
2963 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2965 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2966 if (FlushSGPRWrites)
2967 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2969 };
2970
2971 // We need to compute the offset relative to the instruction immediately after
2972 // s_getpc_b64. Insert pc arithmetic code before last terminator.
2973 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2974 ApplyHazardWorkarounds();
2975
2976 MCSymbol *PostGetPCLabel =
2977 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2978 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2979
2980 MCSymbol *OffsetLo =
2981 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2982 MCSymbol *OffsetHi =
2983 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2984 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2985 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2986 .addReg(PCReg, 0, AMDGPU::sub0)
2987 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2988 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2989 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2990 .addReg(PCReg, 0, AMDGPU::sub1)
2991 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2992 ApplyHazardWorkarounds();
2993
2994 // Insert the indirect branch after the other terminator.
2995 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2996 .addReg(PCReg);
2997
2998 // If a spill is needed for the pc register pair, we need to insert a spill
2999 // restore block right before the destination block, and insert a short branch
3000 // into the old destination block's fallthrough predecessor.
3001 // e.g.:
3002 //
3003 // s_cbranch_scc0 skip_long_branch:
3004 //
3005 // long_branch_bb:
3006 // spill s[8:9]
3007 // s_getpc_b64 s[8:9]
3008 // s_add_u32 s8, s8, restore_bb
3009 // s_addc_u32 s9, s9, 0
3010 // s_setpc_b64 s[8:9]
3011 //
3012 // skip_long_branch:
3013 // foo;
3014 //
3015 // .....
3016 //
3017 // dest_bb_fallthrough_predecessor:
3018 // bar;
3019 // s_branch dest_bb
3020 //
3021 // restore_bb:
3022 // restore s[8:9]
3023 // fallthrough dest_bb
3024 //
3025 // dest_bb:
3026 // buzz;
3027
3028 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3029 Register Scav;
3030
3031 // If we've previously reserved a register for long branches,
3032 // avoid running the scavenger and just use that register.
3033 if (LongBranchReservedReg) {
3034 RS->enterBasicBlock(MBB);
3035 Scav = LongBranchReservedReg;
3036 } else {
3038 Scav = RS->scavengeRegisterBackwards(
3039 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3040 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3041 }
3042 if (Scav) {
3043 RS->setRegUsed(Scav);
3044 MRI.replaceRegWith(PCReg, Scav);
3045 MRI.clearVirtRegs();
3046 } else {
3047 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
3048 // SGPR spill.
3049 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3050 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3051 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3052 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3053 MRI.clearVirtRegs();
3054 }
3055
3056 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3057 // Now the distance can be defined.
3059 MCSymbolRefExpr::create(DestLabel, MCCtx),
3060 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3061 // Add offset assignments.
3062 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3063 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3064 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3065 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3066}
3067
3068unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3069 switch (Cond) {
3070 case SIInstrInfo::SCC_TRUE:
3071 return AMDGPU::S_CBRANCH_SCC1;
3072 case SIInstrInfo::SCC_FALSE:
3073 return AMDGPU::S_CBRANCH_SCC0;
3074 case SIInstrInfo::VCCNZ:
3075 return AMDGPU::S_CBRANCH_VCCNZ;
3076 case SIInstrInfo::VCCZ:
3077 return AMDGPU::S_CBRANCH_VCCZ;
3078 case SIInstrInfo::EXECNZ:
3079 return AMDGPU::S_CBRANCH_EXECNZ;
3080 case SIInstrInfo::EXECZ:
3081 return AMDGPU::S_CBRANCH_EXECZ;
3082 default:
3083 llvm_unreachable("invalid branch predicate");
3084 }
3085}
3086
3087SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3088 switch (Opcode) {
3089 case AMDGPU::S_CBRANCH_SCC0:
3090 return SCC_FALSE;
3091 case AMDGPU::S_CBRANCH_SCC1:
3092 return SCC_TRUE;
3093 case AMDGPU::S_CBRANCH_VCCNZ:
3094 return VCCNZ;
3095 case AMDGPU::S_CBRANCH_VCCZ:
3096 return VCCZ;
3097 case AMDGPU::S_CBRANCH_EXECNZ:
3098 return EXECNZ;
3099 case AMDGPU::S_CBRANCH_EXECZ:
3100 return EXECZ;
3101 default:
3102 return INVALID_BR;
3103 }
3104}
3105
3109 MachineBasicBlock *&FBB,
3111 bool AllowModify) const {
3112 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3113 // Unconditional Branch
3114 TBB = I->getOperand(0).getMBB();
3115 return false;
3116 }
3117
3118 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3119 if (Pred == INVALID_BR)
3120 return true;
3121
3122 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3123 Cond.push_back(MachineOperand::CreateImm(Pred));
3124 Cond.push_back(I->getOperand(1)); // Save the branch register.
3125
3126 ++I;
3127
3128 if (I == MBB.end()) {
3129 // Conditional branch followed by fall-through.
3130 TBB = CondBB;
3131 return false;
3132 }
3133
3134 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3135 TBB = CondBB;
3136 FBB = I->getOperand(0).getMBB();
3137 return false;
3138 }
3139
3140 return true;
3141}
3142
3144 MachineBasicBlock *&FBB,
3146 bool AllowModify) const {
3148 auto E = MBB.end();
3149 if (I == E)
3150 return false;
3151
3152 // Skip over the instructions that are artificial terminators for special
3153 // exec management.
3154 while (I != E && !I->isBranch() && !I->isReturn()) {
3155 switch (I->getOpcode()) {
3156 case AMDGPU::S_MOV_B64_term:
3157 case AMDGPU::S_XOR_B64_term:
3158 case AMDGPU::S_OR_B64_term:
3159 case AMDGPU::S_ANDN2_B64_term:
3160 case AMDGPU::S_AND_B64_term:
3161 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3162 case AMDGPU::S_MOV_B32_term:
3163 case AMDGPU::S_XOR_B32_term:
3164 case AMDGPU::S_OR_B32_term:
3165 case AMDGPU::S_ANDN2_B32_term:
3166 case AMDGPU::S_AND_B32_term:
3167 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3168 break;
3169 case AMDGPU::SI_IF:
3170 case AMDGPU::SI_ELSE:
3171 case AMDGPU::SI_KILL_I1_TERMINATOR:
3172 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3173 // FIXME: It's messy that these need to be considered here at all.
3174 return true;
3175 default:
3176 llvm_unreachable("unexpected non-branch terminator inst");
3177 }
3178
3179 ++I;
3180 }
3181
3182 if (I == E)
3183 return false;
3184
3185 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3186}
3187
3189 int *BytesRemoved) const {
3190 unsigned Count = 0;
3191 unsigned RemovedSize = 0;
3193 // Skip over artificial terminators when removing instructions.
3194 if (MI.isBranch() || MI.isReturn()) {
3195 RemovedSize += getInstSizeInBytes(MI);
3196 MI.eraseFromParent();
3197 ++Count;
3198 }
3199 }
3200
3201 if (BytesRemoved)
3202 *BytesRemoved = RemovedSize;
3203
3204 return Count;
3205}
3206
3207// Copy the flags onto the implicit condition register operand.
3209 const MachineOperand &OrigCond) {
3210 CondReg.setIsUndef(OrigCond.isUndef());
3211 CondReg.setIsKill(OrigCond.isKill());
3212}
3213
3216 MachineBasicBlock *FBB,
3218 const DebugLoc &DL,
3219 int *BytesAdded) const {
3220 if (!FBB && Cond.empty()) {
3221 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3222 .addMBB(TBB);
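// Allow for a possible extra workaround instruction when the subtarget has
// the offset-0x3f branch bug.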
3223 if (BytesAdded)
3224 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3225 return 1;
3226 }
3227
3228 assert(TBB && Cond[0].isImm());
3229
3230 unsigned Opcode
3231 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3232
3233 if (!FBB) {
3234 MachineInstr *CondBr =
3235 BuildMI(&MBB, DL, get(Opcode))
3236 .addMBB(TBB);
3237
3238 // Copy the flags onto the implicit condition register operand.
3239 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3240 fixImplicitOperands(*CondBr);
3241
3242 if (BytesAdded)
3243 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3244 return 1;
3245 }
3246
3247 assert(TBB && FBB);
3248
3249 MachineInstr *CondBr =
3250 BuildMI(&MBB, DL, get(Opcode))
3251 .addMBB(TBB);
3252 fixImplicitOperands(*CondBr);
3253 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3254 .addMBB(FBB);
3255
3256 MachineOperand &CondReg = CondBr->getOperand(1);
3257 CondReg.setIsUndef(Cond[1].isUndef());
3258 CondReg.setIsKill(Cond[1].isKill());
3259
3260 if (BytesAdded)
3261 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3262
3263 return 2;
3264}
3265
3268 if (Cond.size() != 2) {
3269 return true;
3270 }
3271
3272 if (Cond[0].isImm()) {
3273 Cond[0].setImm(-Cond[0].getImm());
3274 return false;
3275 }
3276
3277 return true;
3278}
3279
3282 Register DstReg, Register TrueReg,
3283 Register FalseReg, int &CondCycles,
3284 int &TrueCycles, int &FalseCycles) const {
3285 switch (Cond[0].getImm()) {
3286 case VCCNZ:
3287 case VCCZ: {
3289 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3290 if (MRI.getRegClass(FalseReg) != RC)
3291 return false;
3292
3293 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3294 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3295
3296 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3297 return RI.hasVGPRs(RC) && NumInsts <= 6;
3298 }
3299 case SCC_TRUE:
3300 case SCC_FALSE: {
3301 // FIXME: We could insert for VGPRs if we could replace the original compare
3302 // with a vector one.
3304 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3305 if (MRI.getRegClass(FalseReg) != RC)
3306 return false;
3307
3308 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3309
3310 // An even number of 32-bit pieces (a multiple of 64 bits) can use s_cselect_b64.
3311 if (NumInsts % 2 == 0)
3312 NumInsts /= 2;
3313
3314 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3315 return RI.isSGPRClass(RC);
3316 }
3317 default:
3318 return false;
3319 }
3320}
3321
3325 Register TrueReg, Register FalseReg) const {
3326 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3327 if (Pred == VCCZ || Pred == SCC_FALSE) {
3328 Pred = static_cast<BranchPredicate>(-Pred);
3329 std::swap(TrueReg, FalseReg);
3330 }
3331
3333 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3334 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3335
3336 if (DstSize == 32) {
3338 if (Pred == SCC_TRUE) {
3339 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3340 .addReg(TrueReg)
3341 .addReg(FalseReg);
3342 } else {
3343 // Instruction's operands are backwards from what is expected.
3344 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3345 .addReg(FalseReg)
3346 .addReg(TrueReg);
3347 }
3348
3349 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3350 return;
3351 }
3352
3353 if (DstSize == 64 && Pred == SCC_TRUE) {
3355 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3356 .addReg(TrueReg)
3357 .addReg(FalseReg);
3358
3359 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3360 return;
3361 }
3362
3363 static const int16_t Sub0_15[] = {
3364 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3365 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3366 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3367 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3368 };
3369
3370 static const int16_t Sub0_15_64[] = {
3371 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3372 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3373 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3374 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3375 };
3376
3377 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3378 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3379 const int16_t *SubIndices = Sub0_15;
3380 int NElts = DstSize / 32;
3381
3382 // 64-bit select is only available for SALU.
3383 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3384 if (Pred == SCC_TRUE) {
3385 if (NElts % 2) {
3386 SelOp = AMDGPU::S_CSELECT_B32;
3387 EltRC = &AMDGPU::SGPR_32RegClass;
3388 } else {
3389 SelOp = AMDGPU::S_CSELECT_B64;
3390 EltRC = &AMDGPU::SGPR_64RegClass;
3391 SubIndices = Sub0_15_64;
3392 NElts /= 2;
3393 }
3394 }
3395
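// Assemble the wide result from per-element (or per-64-bit) selects via a
// REG_SEQUENCE.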
3397 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3398
3399 I = MIB->getIterator();
3400
3402 for (int Idx = 0; Idx != NElts; ++Idx) {
3403 Register DstElt = MRI.createVirtualRegister(EltRC);
3404 Regs.push_back(DstElt);
3405
3406 unsigned SubIdx = SubIndices[Idx];
3407
3409 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3410 Select =
3411 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3412 .addReg(FalseReg, 0, SubIdx)
3413 .addReg(TrueReg, 0, SubIdx);
3414 } else {
3415 Select =
3416 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3417 .addReg(TrueReg, 0, SubIdx)
3418 .addReg(FalseReg, 0, SubIdx);
3419 }
3420
3421 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3423
3424 MIB.addReg(DstElt)
3425 .addImm(SubIdx);
3426 }
3427}
3428
3430 switch (MI.getOpcode()) {
3431 case AMDGPU::V_MOV_B16_t16_e32:
3432 case AMDGPU::V_MOV_B16_t16_e64:
3433 case AMDGPU::V_MOV_B32_e32:
3434 case AMDGPU::V_MOV_B32_e64:
3435 case AMDGPU::V_MOV_B64_PSEUDO:
3436 case AMDGPU::V_MOV_B64_e32:
3437 case AMDGPU::V_MOV_B64_e64:
3438 case AMDGPU::S_MOV_B32:
3439 case AMDGPU::S_MOV_B64:
3440 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3441 case AMDGPU::COPY:
3442 case AMDGPU::WWM_COPY:
3443 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3444 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3445 case AMDGPU::V_ACCVGPR_MOV_B32:
3446 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3447 return true;
3448 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3449 // TODO: We could fold this, but it's a strange case. The immediate value
3450 // can't be directly folded into any real use. We would have to spread new
3451 // immediate legality checks around and only accept subregister extracts for
3452 // profitability.
3453 default:
3454 return false;
3455 }
3456}
3457
3458static constexpr AMDGPU::OpName ModifierOpNames[] = {
3459 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3460 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3461 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3462
3464 unsigned Opc = MI.getOpcode();
3465 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3466 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3467 if (Idx >= 0)
3468 MI.removeOperand(Idx);
3469 }
3470}
3471
3472std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3473 unsigned SubRegIndex) {
3474 switch (SubRegIndex) {
3475 case AMDGPU::NoSubRegister:
3476 return Imm;
3477 case AMDGPU::sub0:
3478 return SignExtend64<32>(Imm);
3479 case AMDGPU::sub1:
3480 return SignExtend64<32>(Imm >> 32);
3481 case AMDGPU::lo16:
3482 return SignExtend64<16>(Imm);
3483 case AMDGPU::hi16:
3484 return SignExtend64<16>(Imm >> 16);
3485 case AMDGPU::sub1_lo16:
3486 return SignExtend64<16>(Imm >> 32);
3487 case AMDGPU::sub1_hi16:
3488 return SignExtend64<16>(Imm >> 48);
3489 default:
3490 return std::nullopt;
3491 }
3492
3493 llvm_unreachable("covered subregister switch");
3494}
3495
3496static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3497 switch (Opc) {
3498 case AMDGPU::V_MAC_F16_e32:
3499 case AMDGPU::V_MAC_F16_e64:
3500 case AMDGPU::V_MAD_F16_e64:
3501 return AMDGPU::V_MADAK_F16;
3502 case AMDGPU::V_MAC_F32_e32:
3503 case AMDGPU::V_MAC_F32_e64:
3504 case AMDGPU::V_MAD_F32_e64:
3505 return AMDGPU::V_MADAK_F32;
3506 case AMDGPU::V_FMAC_F32_e32:
3507 case AMDGPU::V_FMAC_F32_e64:
3508 case AMDGPU::V_FMA_F32_e64:
3509 return AMDGPU::V_FMAAK_F32;
3510 case AMDGPU::V_FMAC_F16_e32:
3511 case AMDGPU::V_FMAC_F16_e64:
3512 case AMDGPU::V_FMAC_F16_t16_e64:
3513 case AMDGPU::V_FMAC_F16_fake16_e64:
3514 case AMDGPU::V_FMA_F16_e64:
3515 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3516 ? AMDGPU::V_FMAAK_F16_t16
3517 : AMDGPU::V_FMAAK_F16_fake16
3518 : AMDGPU::V_FMAAK_F16;
3519 case AMDGPU::V_FMAC_F64_e32:
3520 case AMDGPU::V_FMAC_F64_e64:
3521 case AMDGPU::V_FMA_F64_e64:
3522 return AMDGPU::V_FMAAK_F64;
3523 default:
3524 llvm_unreachable("invalid instruction");
3525 }
3526}
3527
3528static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3529 switch (Opc) {
3530 case AMDGPU::V_MAC_F16_e32:
3531 case AMDGPU::V_MAC_F16_e64:
3532 case AMDGPU::V_MAD_F16_e64:
3533 return AMDGPU::V_MADMK_F16;
3534 case AMDGPU::V_MAC_F32_e32:
3535 case AMDGPU::V_MAC_F32_e64:
3536 case AMDGPU::V_MAD_F32_e64:
3537 return AMDGPU::V_MADMK_F32;
3538 case AMDGPU::V_FMAC_F32_e32:
3539 case AMDGPU::V_FMAC_F32_e64:
3540 case AMDGPU::V_FMA_F32_e64:
3541 return AMDGPU::V_FMAMK_F32;
3542 case AMDGPU::V_FMAC_F16_e32:
3543 case AMDGPU::V_FMAC_F16_e64:
3544 case AMDGPU::V_FMAC_F16_t16_e64:
3545 case AMDGPU::V_FMAC_F16_fake16_e64:
3546 case AMDGPU::V_FMA_F16_e64:
3547 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3548 ? AMDGPU::V_FMAMK_F16_t16
3549 : AMDGPU::V_FMAMK_F16_fake16
3550 : AMDGPU::V_FMAMK_F16;
3551 case AMDGPU::V_FMAC_F64_e32:
3552 case AMDGPU::V_FMAC_F64_e64:
3553 case AMDGPU::V_FMA_F64_e64:
3554 return AMDGPU::V_FMAMK_F64;
3555 default:
3556 llvm_unreachable("invalid instruction");
3557 }
3558}
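// For reference (hedged summary, not taken from the source): the *AK forms
// take the literal K as the addend, roughly d = s0 * s1 + K, while the *MK
// forms take the literal K as the multiplier, roughly d = s0 * K + s1. Which
// form is preferable depends on which source of the original MAC/FMA turned
// out to be the constant, as decided in foldImmediate below.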
3559
3560 bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3561 Register Reg, MachineRegisterInfo *MRI) const {
3562 if (!MRI->hasOneNonDBGUse(Reg))
3563 return false;
3564
3565 int64_t Imm;
3566 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3567 return false;
3568
3569 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3570
3571 unsigned Opc = UseMI.getOpcode();
3572 if (Opc == AMDGPU::COPY) {
3573 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3574
3575 Register DstReg = UseMI.getOperand(0).getReg();
3576 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3577
3578 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3579
3580 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3581 RI.getSubRegIdxSize(UseSubReg) == 16;
3582
3583 if (Is16Bit) {
3584 if (RI.hasVGPRs(DstRC))
3585 return false; // Do not clobber vgpr_hi16
3586
3587 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3588 return false;
3589 }
3590
3591 MachineFunction *MF = UseMI.getMF();
3592
3593 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3594 MCRegister MovDstPhysReg =
3595 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3596
3597 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3598
3599 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3600 for (unsigned MovOp :
3601 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3602 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3603 const MCInstrDesc &MovDesc = get(MovOp);
3604
3605 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI, *MF);
3606 if (Is16Bit) {
3607 // We just need to find a correctly sized register class, so the
3608 // subregister index compatibility doesn't matter since we're statically
3609 // extracting the immediate value.
3610 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3611 if (!MovDstRC)
3612 continue;
3613
3614 if (MovDstPhysReg) {
3615 // FIXME: We probably should not do this. If there is a live value in
3616 // the high half of the register, it will be corrupted.
3617 MovDstPhysReg =
3618 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3619 if (!MovDstPhysReg)
3620 continue;
3621 }
3622 }
3623
3624 // Result class isn't the right size, try the next instruction.
3625 if (MovDstPhysReg) {
3626 if (!MovDstRC->contains(MovDstPhysReg))
3627 return false;
3628 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3629 // TODO: This will be overly conservative in the case of 16-bit virtual
3630 // SGPRs. We could hack up the virtual register uses to use a compatible
3631 // 32-bit class.
3632 continue;
3633 }
3634
3635 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3636
3637 // Ensure the interpreted immediate value is a valid operand in the new
3638 // mov.
3639 //
3640 // FIXME: isImmOperandLegal should have form that doesn't require existing
3641 // MachineInstr or MachineOperand
3642 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3643 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3644 break;
3645
3646 NewOpc = MovOp;
3647 break;
3648 }
3649
3650 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3651 return false;
3652
3653 if (Is16Bit) {
3654 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3655 if (MovDstPhysReg)
3656 UseMI.getOperand(0).setReg(MovDstPhysReg);
3657 assert(UseMI.getOperand(1).getReg().isVirtual());
3658 }
3659
3660 const MCInstrDesc &NewMCID = get(NewOpc);
3661 UseMI.setDesc(NewMCID);
3662 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3663 UseMI.addImplicitDefUseOperands(*MF);
3664 return true;
3665 }
3666
3667 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3668 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3669 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3670 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3671 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3672 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3673 Opc == AMDGPU::V_FMAC_F64_e64) {
3674 // Don't fold if we are using source or output modifiers. The new VOP2
3675 // instructions don't have them.
3676 if (hasAnyModifiersSet(UseMI))
3677 return false;
3678
3679 // If this is a free constant, there's no reason to do this.
3680 // TODO: We could fold this here instead of letting SIFoldOperands do it
3681 // later.
3682 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3683
3684 // Any src operand can be used for the legality check.
3685 if (isInlineConstant(UseMI, Src0Idx, Imm))
3686 return false;
3687
3688 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3689
3690 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3691 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3692
3693 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3694 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3695 (Src1->isReg() && Src1->getReg() == Reg)) {
3696 MachineOperand *RegSrc =
3697 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3698 if (!RegSrc->isReg())
3699 return false;
3700 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3701 ST.getConstantBusLimit(Opc) < 2)
3702 return false;
3703
3704 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3705 return false;
3706
3707 // If src2 is also a literal constant then we have to choose which one to
3708 // fold. In general it is better to choose madak so that the other literal
3709 // can be materialized in an sgpr instead of a vgpr:
3710 // s_mov_b32 s0, literal
3711 // v_madak_f32 v0, s0, v0, literal
3712 // Instead of:
3713 // v_mov_b32 v1, literal
3714 // v_madmk_f32 v0, v0, literal, v1
3715 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3716 if (Def && Def->isMoveImmediate() &&
3717 !isInlineConstant(Def->getOperand(1)))
3718 return false;
3719
3720 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3721 if (pseudoToMCOpcode(NewOpc) == -1)
3722 return false;
3723
3724 // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3725 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3726 // restricting their register classes. For now just bail out.
3727 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3728 NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3729 return false;
3730
3731 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3732 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3733
3734 // FIXME: This would be a lot easier if we could return a new instruction
3735 // instead of having to modify in place.
3736
3737 Register SrcReg = RegSrc->getReg();
3738 unsigned SrcSubReg = RegSrc->getSubReg();
3739 Src0->setReg(SrcReg);
3740 Src0->setSubReg(SrcSubReg);
3741 Src0->setIsKill(RegSrc->isKill());
3742
3743 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3744 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3745 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3746 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3747 UseMI.untieRegOperand(
3748 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3749
3750 Src1->ChangeToImmediate(*SubRegImm);
3751
3752 removeModOperands(UseMI);
3753 UseMI.setDesc(get(NewOpc));
3754
3755 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3756 if (DeleteDef)
3757 DefMI.eraseFromParent();
3758
3759 return true;
3760 }
3761
3762 // Added part is the constant: Use v_madak_{f16, f32}.
3763 if (Src2->isReg() && Src2->getReg() == Reg) {
3764 if (ST.getConstantBusLimit(Opc) < 2) {
3765 // Not allowed to use constant bus for another operand.
3766 // We can however allow an inline immediate as src0.
3767 bool Src0Inlined = false;
3768 if (Src0->isReg()) {
3769 // Try to inline the constant if possible.
3770 // If the Def is a move-immediate and it has only this single use,
3771 // we save a VGPR here.
3772 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3773 if (Def && Def->isMoveImmediate() &&
3774 isInlineConstant(Def->getOperand(1)) &&
3775 MRI->hasOneUse(Src0->getReg())) {
3776 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3777 Src0Inlined = true;
3778 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3779 RI.isSGPRReg(*MRI, Src0->getReg())) {
3780 return false;
3781 }
3782 // VGPR is okay as Src0 - fallthrough
3783 }
3784
3785 if (Src1->isReg() && !Src0Inlined) {
3786 // We have one slot for inlinable constant so far - try to fill it
3787 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3788 if (Def && Def->isMoveImmediate() &&
3789 isInlineConstant(Def->getOperand(1)) &&
3790 MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3791 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3792 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3793 return false;
3794 // VGPR is okay as Src1 - fallthrough
3795 }
3796 }
3797
3798 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3799 if (pseudoToMCOpcode(NewOpc) == -1)
3800 return false;
3801
3802 // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3803 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3804 // restricting their register classes. For now just bail out.
3805 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3806 NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3807 return false;
3808
3809 // FIXME: This would be a lot easier if we could return a new instruction
3810 // instead of having to modify in place.
3811
3812 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3813 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3814 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3815 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3816 UseMI.untieRegOperand(
3817 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3818
3819 const std::optional<int64_t> SubRegImm =
3820 extractSubregFromImm(Imm, Src2->getSubReg());
3821
3822 // ChangingToImmediate adds Src2 back to the instruction.
3823 Src2->ChangeToImmediate(*SubRegImm);
3824
3825 // These come before src2.
3826 removeModOperands(UseMI);
3827 UseMI.setDesc(get(NewOpc));
3828 // It might happen that UseMI was commuted
3829 // and we now have an SGPR as SRC1. If so, an inlined
3830 // constant together with the SGPR is illegal.
3831 legalizeOperands(UseMI);
3832
3833 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3834 if (DeleteDef)
3835 DefMI.eraseFromParent();
3836
3837 return true;
3838 }
3839 }
3840
3841 return false;
3842}
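// Illustrative sketch of the fold above (pseudo-MIR, register names invented,
// operand layout schematic): with a single-use materialized constant feeding
// a multiply operand,
//   %k = V_MOV_B32 <bit pattern of 3.0f, 0x40400000>
//   %d = V_FMAC_F32_e64 ..., %k, ..., %b, ..., %c
// is rewritten into the VOP2-with-literal form
//   %d = V_FMAMK_F32 %b, <3.0f literal>, %c
// and the now-dead V_MOV is erased. When the constant feeds src2 instead, the
// V_FMAAK form is chosen so that any remaining literal can stay in an SGPR.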
3843
3844static bool
3845 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3846 ArrayRef<const MachineOperand *> BaseOps2) {
3847 if (BaseOps1.size() != BaseOps2.size())
3848 return false;
3849 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3850 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3851 return false;
3852 }
3853 return true;
3854}
3855
3856static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3857 LocationSize WidthB, int OffsetB) {
3858 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3859 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3860 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3861 return LowWidth.hasValue() &&
3862 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3863}
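// Illustrative check (not from the source): two 8-byte accesses at offsets 0
// and 8 do not overlap because 0 + 8 <= 8, while offsets 0 and 4 do overlap
// because 0 + 8 > 4. If the lower access has an unknown width, the function
// conservatively reports a possible overlap.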
3864
3865bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3866 const MachineInstr &MIb) const {
3867 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3868 int64_t Offset0, Offset1;
3869 LocationSize Dummy0 = LocationSize::precise(0);
3870 LocationSize Dummy1 = LocationSize::precise(0);
3871 bool Offset0IsScalable, Offset1IsScalable;
3872 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3873 Dummy0, &RI) ||
3874 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3875 Dummy1, &RI))
3876 return false;
3877
3878 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3879 return false;
3880
3881 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3882 // FIXME: Handle ds_read2 / ds_write2.
3883 return false;
3884 }
3885 LocationSize Width0 = MIa.memoperands().front()->getSize();
3886 LocationSize Width1 = MIb.memoperands().front()->getSize();
3887 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3888}
3889
3890 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3891 const MachineInstr &MIb) const {
3892 assert(MIa.mayLoadOrStore() &&
3893 "MIa must load from or modify a memory location");
3894 assert(MIb.mayLoadOrStore() &&
3895 "MIb must load from or modify a memory location");
3896
3897 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3898 return false;
3899
3900 // XXX - Can we relax this between address spaces?
3901 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3902 return false;
3903
3904 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3905 return false;
3906
3907 // TODO: Should we check the address space from the MachineMemOperand? That
3908 // would allow us to distinguish objects we know don't alias based on the
3909 // underlying address space, even if it was lowered to a different one,
3910 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3911 // buffer.
3912 if (isDS(MIa)) {
3913 if (isDS(MIb))
3914 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3915
3916 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3917 }
3918
3919 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3920 if (isMUBUF(MIb) || isMTBUF(MIb))
3921 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3922
3923 if (isFLAT(MIb))
3924 return isFLATScratch(MIb);
3925
3926 return !isSMRD(MIb);
3927 }
3928
3929 if (isSMRD(MIa)) {
3930 if (isSMRD(MIb))
3931 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3932
3933 if (isFLAT(MIb))
3934 return isFLATScratch(MIb);
3935
3936 return !isMUBUF(MIb) && !isMTBUF(MIb);
3937 }
3938
3939 if (isFLAT(MIa)) {
3940 if (isFLAT(MIb)) {
3941 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3942 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3943 return true;
3944
3945 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3946 }
3947
3948 return false;
3949 }
3950
3951 return false;
3952}
3953
3954 static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3955 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3956 if (Reg.isPhysical())
3957 return false;
3958 auto *Def = MRI.getUniqueVRegDef(Reg);
3959 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3960 Imm = Def->getOperand(1).getImm();
3961 if (DefMI)
3962 *DefMI = Def;
3963 return true;
3964 }
3965 return false;
3966}
3967
3968static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3969 MachineInstr **DefMI = nullptr) {
3970 if (!MO->isReg())
3971 return false;
3972 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3973 const MachineRegisterInfo &MRI = MF->getRegInfo();
3974 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3975}
3976
3977 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3978 MachineInstr &NewMI) {
3979 if (LV) {
3980 unsigned NumOps = MI.getNumOperands();
3981 for (unsigned I = 1; I < NumOps; ++I) {
3982 MachineOperand &Op = MI.getOperand(I);
3983 if (Op.isReg() && Op.isKill())
3984 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3985 }
3986 }
3987}
3988
3989static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
3990 switch (Opc) {
3991 case AMDGPU::V_MAC_F16_e32:
3992 case AMDGPU::V_MAC_F16_e64:
3993 return AMDGPU::V_MAD_F16_e64;
3994 case AMDGPU::V_MAC_F32_e32:
3995 case AMDGPU::V_MAC_F32_e64:
3996 return AMDGPU::V_MAD_F32_e64;
3997 case AMDGPU::V_MAC_LEGACY_F32_e32:
3998 case AMDGPU::V_MAC_LEGACY_F32_e64:
3999 return AMDGPU::V_MAD_LEGACY_F32_e64;
4000 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4001 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4002 return AMDGPU::V_FMA_LEGACY_F32_e64;
4003 case AMDGPU::V_FMAC_F16_e32:
4004 case AMDGPU::V_FMAC_F16_e64:
4005 case AMDGPU::V_FMAC_F16_t16_e64:
4006 case AMDGPU::V_FMAC_F16_fake16_e64:
4007 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4008 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4009 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4010 : AMDGPU::V_FMA_F16_gfx9_e64;
4011 case AMDGPU::V_FMAC_F32_e32:
4012 case AMDGPU::V_FMAC_F32_e64:
4013 return AMDGPU::V_FMA_F32_e64;
4014 case AMDGPU::V_FMAC_F64_e32:
4015 case AMDGPU::V_FMAC_F64_e64:
4016 return AMDGPU::V_FMA_F64_e64;
4017 default:
4018 llvm_unreachable("invalid instruction");
4019 }
4020}
4021
4022 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4023 LiveVariables *LV,
4024 LiveIntervals *LIS) const {
4025 MachineBasicBlock &MBB = *MI.getParent();
4026 unsigned Opc = MI.getOpcode();
4027
4028 // Handle MFMA.
4029 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4030 if (NewMFMAOpc != -1) {
4031 MachineInstrBuilder MIB =
4032 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4033 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4034 MIB.add(MI.getOperand(I));
4035 updateLiveVariables(LV, MI, *MIB);
4036 if (LIS) {
4037 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4038 // SlotIndex of defs needs to be updated when converting to early-clobber
4039 MachineOperand &Def = MIB->getOperand(0);
4040 if (Def.isEarlyClobber() && Def.isReg() &&
4041 LIS->hasInterval(Def.getReg())) {
4042 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
4043 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
4044 auto &LI = LIS->getInterval(Def.getReg());
4045 auto UpdateDefIndex = [&](LiveRange &LR) {
4046 auto *S = LR.find(OldIndex);
4047 if (S != LR.end() && S->start == OldIndex) {
4048 assert(S->valno && S->valno->def == OldIndex);
4049 S->start = NewIndex;
4050 S->valno->def = NewIndex;
4051 }
4052 };
4053 UpdateDefIndex(LI);
4054 for (auto &SR : LI.subranges())
4055 UpdateDefIndex(SR);
4056 }
4057 }
4058 return MIB;
4059 }
4060
4061 if (SIInstrInfo::isWMMA(MI)) {
4062 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4063 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4064 .setMIFlags(MI.getFlags());
4065 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4066 MIB->addOperand(MI.getOperand(I));
4067
4068 updateLiveVariables(LV, MI, *MIB);
4069 if (LIS)
4070 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4071
4072 return MIB;
4073 }
4074
4075 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4076 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4077 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4078 "present pre-RA");
4079
4080 // Handle MAC/FMAC.
4081 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4082 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4083 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4084 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4085 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4086 bool Src0Literal = false;
4087
4088 switch (Opc) {
4089 default:
4090 return nullptr;
4091 case AMDGPU::V_MAC_F16_e64:
4092 case AMDGPU::V_FMAC_F16_e64:
4093 case AMDGPU::V_FMAC_F16_t16_e64:
4094 case AMDGPU::V_FMAC_F16_fake16_e64:
4095 case AMDGPU::V_MAC_F32_e64:
4096 case AMDGPU::V_MAC_LEGACY_F32_e64:
4097 case AMDGPU::V_FMAC_F32_e64:
4098 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4099 case AMDGPU::V_FMAC_F64_e64:
4100 break;
4101 case AMDGPU::V_MAC_F16_e32:
4102 case AMDGPU::V_FMAC_F16_e32:
4103 case AMDGPU::V_MAC_F32_e32:
4104 case AMDGPU::V_MAC_LEGACY_F32_e32:
4105 case AMDGPU::V_FMAC_F32_e32:
4106 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4107 case AMDGPU::V_FMAC_F64_e32: {
4108 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4109 AMDGPU::OpName::src0);
4110 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4111 if (!Src0->isReg() && !Src0->isImm())
4112 return nullptr;
4113
4114 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4115 Src0Literal = true;
4116
4117 break;
4118 }
4119 }
4120
4121 MachineInstrBuilder MIB;
4122 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4123 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4124 const MachineOperand *Src0Mods =
4125 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4126 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4127 const MachineOperand *Src1Mods =
4128 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4129 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4130 const MachineOperand *Src2Mods =
4131 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4132 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4133 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4134 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4135
4136 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4137 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4138 // If we have an SGPR input, we will violate the constant bus restriction.
4139 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4140 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4141 MachineInstr *DefMI;
4142 const auto killDef = [&]() -> void {
4143 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4144 // The only user is the instruction which will be killed.
4145 Register DefReg = DefMI->getOperand(0).getReg();
4146
4147 if (MRI.hasOneNonDBGUse(DefReg)) {
4148 // We cannot just remove the DefMI here, calling pass will crash.
4149 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
4150 DefMI->getOperand(0).setIsDead(true);
4151 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
4152 DefMI->removeOperand(I);
4153 if (LV)
4154 LV->getVarInfo(DefReg).AliveBlocks.clear();
4155 }
4156
4157 if (LIS) {
4158 LiveInterval &DefLI = LIS->getInterval(DefReg);
4159
4160 // We cannot delete the original instruction here, so hack out the use
4161 // in the original instruction with a dummy register so we can use
4162 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4163 // not have the complexity of deleting a use to consider here.
4164 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4165 for (MachineOperand &MIOp : MI.uses()) {
4166 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4167 MIOp.setIsUndef(true);
4168 MIOp.setReg(DummyReg);
4169 }
4170 }
4171
4172 LIS->shrinkToUses(&DefLI);
4173 }
4174 };
4175
4176 int64_t Imm;
4177 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4178 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4179 if (pseudoToMCOpcode(NewOpc) != -1) {
4180 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4181 .add(*Dst)
4182 .add(*Src0)
4183 .add(*Src1)
4184 .addImm(Imm)
4185 .setMIFlags(MI.getFlags());
4186 updateLiveVariables(LV, MI, *MIB);
4187 if (LIS)
4188 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4189 killDef();
4190 return MIB;
4191 }
4192 }
4193 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4194 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4195 if (pseudoToMCOpcode(NewOpc) != -1) {
4196 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4197 .add(*Dst)
4198 .add(*Src0)
4199 .addImm(Imm)
4200 .add(*Src2)
4201 .setMIFlags(MI.getFlags());
4202 updateLiveVariables(LV, MI, *MIB);
4203
4204 if (LIS)
4205 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4206 killDef();
4207 return MIB;
4208 }
4209 }
4210 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4211 if (Src0Literal) {
4212 Imm = Src0->getImm();
4213 DefMI = nullptr;
4214 }
4215 if (pseudoToMCOpcode(NewOpc) != -1 &&
4216 isOperandLegal(
4217 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4218 Src1)) {
4219 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4220 .add(*Dst)
4221 .add(*Src1)
4222 .addImm(Imm)
4223 .add(*Src2)
4224 .setMIFlags(MI.getFlags());
4225 updateLiveVariables(LV, MI, *MIB);
4226
4227 if (LIS)
4228 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4229 if (DefMI)
4230 killDef();
4231 return MIB;
4232 }
4233 }
4234 }
4235
4236 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4237 // if VOP3 does not allow a literal operand.
4238 if (Src0Literal && !ST.hasVOP3Literal())
4239 return nullptr;
4240
4241 unsigned NewOpc = getNewFMAInst(ST, Opc);
4242
4243 if (pseudoToMCOpcode(NewOpc) == -1)
4244 return nullptr;
4245
4246 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4247 .add(*Dst)
4248 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4249 .add(*Src0)
4250 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4251 .add(*Src1)
4252 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4253 .add(*Src2)
4254 .addImm(Clamp ? Clamp->getImm() : 0)
4255 .addImm(Omod ? Omod->getImm() : 0)
4256 .setMIFlags(MI.getFlags());
4257 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4258 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4259 updateLiveVariables(LV, MI, *MIB);
4260 if (LIS)
4261 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4262 return MIB;
4263}
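// Illustrative summary (schematic, not from the source): a tied two-address
// MAC/FMAC such as
//   $v0 = V_FMAC_F32_e32 $v1, $v2, $v0(tied)
// is rewritten either to the untied VOP3 form
//   $v0 = V_FMA_F32_e64 0, $v1, 0, $v2, 0, $v0, 0, 0
// or, when one source is a foldable immediate, directly to a V_FMAAK/V_FMAMK
// literal form as handled above. MFMA and WMMA opcodes are simply remapped to
// their early-clobber / 3-address variants.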
4264
4265// It's not generally safe to move VALU instructions across these since it will
4266// start using the register as a base index rather than directly.
4267// XXX - Why isn't hasSideEffects sufficient for these?
4268 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4269 switch (MI.getOpcode()) {
4270 case AMDGPU::S_SET_GPR_IDX_ON:
4271 case AMDGPU::S_SET_GPR_IDX_MODE:
4272 case AMDGPU::S_SET_GPR_IDX_OFF:
4273 return true;
4274 default:
4275 return false;
4276 }
4277}
4278
4279 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4280 const MachineBasicBlock *MBB,
4281 const MachineFunction &MF) const {
4282 // Skipping the check for SP writes in the base implementation. The reason it
4283 // was added was apparently due to compile time concerns.
4284 //
4285 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4286 // but is probably avoidable.
4287
4288 // Copied from base implementation.
4289 // Terminators and labels can't be scheduled around.
4290 if (MI.isTerminator() || MI.isPosition())
4291 return true;
4292
4293 // INLINEASM_BR can jump to another block
4294 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4295 return true;
4296
4297 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4298 return true;
4299
4300 // Target-independent instructions do not have an implicit-use of EXEC, even
4301 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4302 // boundaries prevents incorrect movements of such instructions.
4303 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4304 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4305 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4306 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4307 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4308 changesVGPRIndexingMode(MI);
4309}
4310
4311 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4312 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4313 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4314 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4315}
4316
4317 bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
4318 if (!isFLAT(MI) || isFLATGlobal(MI))
4319 return false;
4320
4321 // If scratch is not initialized, we can never access it.
4322 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4323 return false;
4324
4325 // SCRATCH instructions always access scratch.
4326 if (isFLATScratch(MI))
4327 return true;
4328
4329 // If there are no memory operands then conservatively assume the flat
4330 // operation may access scratch.
4331 if (MI.memoperands_empty())
4332 return true;
4333
4334 // See if any memory operand specifies an address space that involves scratch.
4335 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4336 unsigned AS = Memop->getAddrSpace();
4337 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4338 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4339 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4340 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4341 }
4342 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4343 });
4344}
4345
4346 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4347 // Skip the full operand and register alias search modifiesRegister
4348 // does. There's only a handful of instructions that touch this, it's only an
4349 // implicit def, and doesn't alias any other registers.
4350 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4351}
4352
4353 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4354 unsigned Opcode = MI.getOpcode();
4355
4356 if (MI.mayStore() && isSMRD(MI))
4357 return true; // scalar store or atomic
4358
4359 // This will terminate the function when other lanes may need to continue.
4360 if (MI.isReturn())
4361 return true;
4362
4363 // These instructions cause shader I/O that may cause hardware lockups
4364 // when executed with an empty EXEC mask.
4365 //
4366 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4367 // EXEC = 0, but checking for that case here seems not worth it
4368 // given the typical code patterns.
4369 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4370 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4371 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4372 return true;
4373
4374 if (MI.isCall() || MI.isInlineAsm())
4375 return true; // conservative assumption
4376
4377 // Assume that barrier interactions are only intended with active lanes.
4378 if (isBarrier(Opcode))
4379 return true;
4380
4381 // A mode change is a scalar operation that influences vector instructions.
4383 return true;
4384
4385 // These are like SALU instructions in terms of effects, so it's questionable
4386 // whether we should return true for those.
4387 //
4388 // However, executing them with EXEC = 0 causes them to operate on undefined
4389 // data, which we avoid by returning true here.
4390 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4391 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4392 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4393 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4394 return true;
4395
4396 return false;
4397}
4398
4399 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4400 const MachineInstr &MI) const {
4401 if (MI.isMetaInstruction())
4402 return false;
4403
4404 // This won't read exec if this is an SGPR->SGPR copy.
4405 if (MI.isCopyLike()) {
4406 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4407 return true;
4408
4409 // Make sure this isn't copying exec as a normal operand
4410 return MI.readsRegister(AMDGPU::EXEC, &RI);
4411 }
4412
4413 // Make a conservative assumption about the callee.
4414 if (MI.isCall())
4415 return true;
4416
4417 // Be conservative with any unhandled generic opcodes.
4418 if (!isTargetSpecificOpcode(MI.getOpcode()))
4419 return true;
4420
4421 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4422}
4423
4424bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4425 switch (Imm.getBitWidth()) {
4426 case 1: // This likely will be a condition code mask.
4427 return true;
4428
4429 case 32:
4430 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4431 ST.hasInv2PiInlineImm());
4432 case 64:
4433 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4434 ST.hasInv2PiInlineImm());
4435 case 16:
4436 return ST.has16BitInsts() &&
4437 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4438 ST.hasInv2PiInlineImm());
4439 default:
4440 llvm_unreachable("invalid bitwidth");
4441 }
4442}
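// For reference (loose summary of AMDGPU::isInlinableLiteral*, not from the
// source): the inline constants are the integers -16..64 plus a small set of
// FP values (+-0.5, +-1.0, +-2.0, +-4.0, 0.0 and, when the subtarget supports
// it, 1/(2*pi)) reinterpreted at the operand's width; anything else must be
// encoded as a literal, which costs an extra dword in the encoding.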
4443
4444 bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4445 APInt IntImm = Imm.bitcastToAPInt();
4446 int64_t IntImmVal = IntImm.getSExtValue();
4447 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4448 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4449 default:
4450 llvm_unreachable("invalid fltSemantics");
4451 case APFloat::S_IEEEsingle:
4452 case APFloat::S_IEEEdouble:
4453 return isInlineConstant(IntImm);
4454 case APFloat::S_BFloat:
4455 return ST.has16BitInsts() &&
4456 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4457 case APFloat::S_IEEEhalf:
4458 return ST.has16BitInsts() &&
4459 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4460 }
4461}
4462
4463bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4464 // MachineOperand provides no way to tell the true operand size, since it only
4465 // records a 64-bit value. We need to know the size to determine if a 32-bit
4466 // floating point immediate bit pattern is legal for an integer immediate. It
4467 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4468 switch (OperandType) {
4478 int32_t Trunc = static_cast<int32_t>(Imm);
4479 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4480 }
4489 // We would expect inline immediates to not be concerned with an integer/fp
4490 // distinction. However, in the case of 16-bit integer operations, the
4491 // "floating point" values appear to not work. It seems read the low 16-bits
4492 // of 32-bit immediates, which happens to always work for the integer
4493 // values.
4494 //
4495 // See llvm bugzilla 46302.
4496 //
4497 // TODO: Theoretically we could use op-sel to use the high bits of the
4498 // 32-bit FP values.
4510 return false;
4513 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4514 // A few special case instructions have 16-bit operands on subtargets
4515 // where 16-bit instructions are not legal.
4516 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4517 // constants in these cases
4518 int16_t Trunc = static_cast<int16_t>(Imm);
4519 return ST.has16BitInsts() &&
4521 }
4522
4523 return false;
4524 }
4527 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4528 int16_t Trunc = static_cast<int16_t>(Imm);
4529 return ST.has16BitInsts() &&
4531 }
4532 return false;
4533 }
4537 return false;
4539 return isLegalAV64PseudoImm(Imm);
4542 // Always embedded in the instruction for free.
4543 return true;
4553 // Just ignore anything else.
4554 return true;
4555 default:
4556 llvm_unreachable("invalid operand type");
4557 }
4558}
4559
4560static bool compareMachineOp(const MachineOperand &Op0,
4561 const MachineOperand &Op1) {
4562 if (Op0.getType() != Op1.getType())
4563 return false;
4564
4565 switch (Op0.getType()) {
4566 case MachineOperand::MO_Register:
4567 return Op0.getReg() == Op1.getReg();
4568 case MachineOperand::MO_Immediate:
4569 return Op0.getImm() == Op1.getImm();
4570 default:
4571 llvm_unreachable("Didn't expect to be comparing these operand types");
4572 }
4573}
4574
4575 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
4576 const MachineOperand &MO) const {
4577 const MCInstrDesc &InstDesc = MI.getDesc();
4578 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4579
4580 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4581
4582 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4583 return true;
4584
4585 if (OpInfo.RegClass < 0)
4586 return false;
4587
4588 if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
4589 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
4590 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4591 AMDGPU::OpName::src2))
4592 return false;
4593 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4594 }
4595
4596 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4597 return false;
4598
4599 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
4600 return true;
4601
4602 return ST.hasVOP3Literal();
4603}
4604
4606 // 2 32-bit inline constants packed into one.
4609}
4610
4611bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4612 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4613 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4614 return false;
4615
4616 int Op32 = AMDGPU::getVOPe32(Opcode);
4617 if (Op32 == -1)
4618 return false;
4619
4620 return pseudoToMCOpcode(Op32) != -1;
4621}
4622
4623bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4624 // The src0_modifier operand is present on all instructions
4625 // that have modifiers.
4626
4627 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4628}
4629
4630 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4631 AMDGPU::OpName OpName) const {
4632 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4633 return Mods && Mods->getImm();
4634}
4635
4636 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4637 return any_of(ModifierOpNames,
4638 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4639}
4640
4641 bool SIInstrInfo::canShrink(const MachineInstr &MI,
4642 const MachineRegisterInfo &MRI) const {
4643 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4644 // Can't shrink instruction with three operands.
4645 if (Src2) {
4646 switch (MI.getOpcode()) {
4647 default: return false;
4648
4649 case AMDGPU::V_ADDC_U32_e64:
4650 case AMDGPU::V_SUBB_U32_e64:
4651 case AMDGPU::V_SUBBREV_U32_e64: {
4652 const MachineOperand *Src1
4653 = getNamedOperand(MI, AMDGPU::OpName::src1);
4654 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4655 return false;
4656 // Additional verification is needed for sdst/src2.
4657 return true;
4658 }
4659 case AMDGPU::V_MAC_F16_e64:
4660 case AMDGPU::V_MAC_F32_e64:
4661 case AMDGPU::V_MAC_LEGACY_F32_e64:
4662 case AMDGPU::V_FMAC_F16_e64:
4663 case AMDGPU::V_FMAC_F16_t16_e64:
4664 case AMDGPU::V_FMAC_F16_fake16_e64:
4665 case AMDGPU::V_FMAC_F32_e64:
4666 case AMDGPU::V_FMAC_F64_e64:
4667 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4668 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4669 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4670 return false;
4671 break;
4672
4673 case AMDGPU::V_CNDMASK_B32_e64:
4674 break;
4675 }
4676 }
4677
4678 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4679 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4680 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4681 return false;
4682
4683 // We don't need to check src0, all input types are legal, so just make sure
4684 // src0 isn't using any modifiers.
4685 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4686 return false;
4687
4688 // Can it be shrunk to a valid 32 bit opcode?
4689 if (!hasVALU32BitEncoding(MI.getOpcode()))
4690 return false;
4691
4692 // Check output modifiers
4693 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4694 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4695 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4696 // TODO: Can we avoid checking bound_ctrl/fi here?
4697 // They are only used by permlane*_swap special case.
4698 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4699 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4700}
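// Illustrative example (schematic): a VOP3-encoded
//   %d = V_ADD_F32_e64 0, %a, 0, %b, 0, 0
// with no source/output modifiers and a VGPR src1 satisfies the checks above
// and can be shrunk to the 32-bit encoding
//   %d = V_ADD_F32_e32 %a, %b
// via buildShrunkInst below.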
4701
4702// Set VCC operand with all flags from \p Orig, except for setting it as
4703// implicit.
4704 static void copyFlagsToImplicitVCC(MachineInstr &MI,
4705 const MachineOperand &Orig) {
4706
4707 for (MachineOperand &Use : MI.implicit_operands()) {
4708 if (Use.isUse() &&
4709 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4710 Use.setIsUndef(Orig.isUndef());
4711 Use.setIsKill(Orig.isKill());
4712 return;
4713 }
4714 }
4715}
4716
4717 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4718 unsigned Op32) const {
4719 MachineBasicBlock *MBB = MI.getParent();
4720
4721 const MCInstrDesc &Op32Desc = get(Op32);
4722 MachineInstrBuilder Inst32 =
4723 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4724 .setMIFlags(MI.getFlags());
4725
4726 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4727 // For VOPC instructions, this is replaced by an implicit def of vcc.
4728
4729 // We assume the defs of the shrunk opcode are in the same order, and the
4730 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4731 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4732 Inst32.add(MI.getOperand(I));
4733
4734 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4735
4736 int Idx = MI.getNumExplicitDefs();
4737 for (const MachineOperand &Use : MI.explicit_uses()) {
4738 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4739 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4740 continue;
4741
4742 if (&Use == Src2) {
4743 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4744 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4745 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4746 // of vcc was already added during the initial BuildMI, but we
4747 // 1) may need to change vcc to vcc_lo to preserve the original register
4748 // 2) have to preserve the original flags.
4749 copyFlagsToImplicitVCC(*Inst32, *Src2);
4750 continue;
4751 }
4752 }
4753
4754 Inst32.add(Use);
4755 }
4756
4757 // FIXME: Losing implicit operands
4758 fixImplicitOperands(*Inst32);
4759 return Inst32;
4760}
4761
4762 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4763 const MachineOperand &MO,
4764 const MCOperandInfo &OpInfo) const {
4765 // Literal constants use the constant bus.
4766 if (!MO.isReg())
4767 return !isInlineConstant(MO, OpInfo);
4768
4769 if (!MO.isUse())
4770 return false;
4771
4772 if (MO.getReg().isVirtual())
4773 return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
4774
4775 // Null is free
4776 if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
4777 return false;
4778
4779 // SGPRs use the constant bus
4780 if (MO.isImplicit()) {
4781 return MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
4782 MO.getReg() == AMDGPU::VCC_LO;
4783 }
4784 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
4785 AMDGPU::SReg_64RegClass.contains(MO.getReg());
4786}
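// For reference (loose summary): each distinct SGPR read, each literal, and
// an implicit read of m0/vcc count as a constant-bus use; null and inline
// constants are free. verifyInstruction below compares the resulting count
// against ST.getConstantBusLimit() for the opcode.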
4787
4788 static Register findImplicitSGPRRead(const MachineInstr &MI) {
4789 for (const MachineOperand &MO : MI.implicit_operands()) {
4790 // We only care about reads.
4791 if (MO.isDef())
4792 continue;
4793
4794 switch (MO.getReg()) {
4795 case AMDGPU::VCC:
4796 case AMDGPU::VCC_LO:
4797 case AMDGPU::VCC_HI:
4798 case AMDGPU::M0:
4799 case AMDGPU::FLAT_SCR:
4800 return MO.getReg();
4801
4802 default:
4803 break;
4804 }
4805 }
4806
4807 return Register();
4808}
4809
4810static bool shouldReadExec(const MachineInstr &MI) {
4811 if (SIInstrInfo::isVALU(MI)) {
4812 switch (MI.getOpcode()) {
4813 case AMDGPU::V_READLANE_B32:
4814 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4815 case AMDGPU::V_WRITELANE_B32:
4816 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4817 return false;
4818 }
4819
4820 return true;
4821 }
4822
4823 if (MI.isPreISelOpcode() ||
4824 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4825 SIInstrInfo::isSALU(MI) ||
4826 SIInstrInfo::isSMRD(MI))
4827 return false;
4828
4829 return true;
4830}
4831
4832static bool isRegOrFI(const MachineOperand &MO) {
4833 return MO.isReg() || MO.isFI();
4834}
4835
4836static bool isSubRegOf(const SIRegisterInfo &TRI,
4837 const MachineOperand &SuperVec,
4838 const MachineOperand &SubReg) {
4839 if (SubReg.getReg().isPhysical())
4840 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4841
4842 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4843 SubReg.getReg() == SuperVec.getReg();
4844}
4845
4846// Verify the illegal copy from vector register to SGPR for generic opcode COPY
4847bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
4848 const MachineRegisterInfo &MRI,
4849 StringRef &ErrInfo) const {
4850 Register DstReg = MI.getOperand(0).getReg();
4851 Register SrcReg = MI.getOperand(1).getReg();
4852 // This is a check for copy from vector register to SGPR
4853 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
4854 ErrInfo = "illegal copy from vector register to SGPR";
4855 return false;
4856 }
4857 return true;
4858}
4859
4860 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4861 StringRef &ErrInfo) const {
4862 uint16_t Opcode = MI.getOpcode();
4863 const MachineFunction *MF = MI.getParent()->getParent();
4864 const MachineRegisterInfo &MRI = MF->getRegInfo();
4865
4866 // FIXME: At this point the COPY verify is done only for non-ssa forms.
4867 // Find a better property to recognize the point where instruction selection
4868 // is just done.
4869 // We can only enforce this check after SIFixSGPRCopies pass so that the
4870 // illegal copies are legalized and thereafter we don't expect a pass
4871 // inserting similar copies.
4872 if (!MRI.isSSA() && MI.isCopy())
4873 return verifyCopy(MI, MRI, ErrInfo);
4874
4875 if (SIInstrInfo::isGenericOpcode(Opcode))
4876 return true;
4877
4878 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4879 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4880 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4881 int Src3Idx = -1;
4882 if (Src0Idx == -1) {
4883 // VOPD V_DUAL_* instructions use different operand names.
4884 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4885 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4886 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4887 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4888 }
4889
4890 // Make sure the number of operands is correct.
4891 const MCInstrDesc &Desc = get(Opcode);
4892 if (!Desc.isVariadic() &&
4893 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4894 ErrInfo = "Instruction has wrong number of operands.";
4895 return false;
4896 }
4897
4898 if (MI.isInlineAsm()) {
4899 // Verify register classes for inlineasm constraints.
4900 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4901 I != E; ++I) {
4902 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4903 if (!RC)
4904 continue;
4905
4906 const MachineOperand &Op = MI.getOperand(I);
4907 if (!Op.isReg())
4908 continue;
4909
4910 Register Reg = Op.getReg();
4911 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4912 ErrInfo = "inlineasm operand has incorrect register class.";
4913 return false;
4914 }
4915 }
4916
4917 return true;
4918 }
4919
4920 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4921 ErrInfo = "missing memory operand from image instruction.";
4922 return false;
4923 }
4924
4925 // Make sure the register classes are correct.
4926 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4927 const MachineOperand &MO = MI.getOperand(i);
4928 if (MO.isFPImm()) {
4929 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4930 "all fp values to integers.";
4931 return false;
4932 }
4933
4934 int RegClass = Desc.operands()[i].RegClass;
4935
4936 const MCOperandInfo &OpInfo = Desc.operands()[i];
4937 switch (OpInfo.OperandType) {
4939 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4940 ErrInfo = "Illegal immediate value for operand.";
4941 return false;
4942 }
4943 break;
4956 break;
4958 break;
4959 break;
4973 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
4974 ErrInfo = "Illegal immediate value for operand.";
4975 return false;
4976 }
4977 break;
4978 }
4980 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
4981 ErrInfo = "Expected inline constant for operand.";
4982 return false;
4983 }
4984 break;
4988 break;
4993 // Check if this operand is an immediate.
4994 // FrameIndex operands will be replaced by immediates, so they are
4995 // allowed.
4996 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
4997 ErrInfo = "Expected immediate, but got non-immediate";
4998 return false;
4999 }
5000 break;
5004 break;
5005 default:
5006 if (OpInfo.isGenericType())
5007 continue;
5008 break;
5009 }
5010
5011 if (!MO.isReg())
5012 continue;
5013 Register Reg = MO.getReg();
5014 if (!Reg)
5015 continue;
5016
5017 // FIXME: Ideally we would have separate instruction definitions with the
5018 // aligned register constraint.
5019 // FIXME: We do not verify inline asm operands, but custom inline asm
5020 // verification is broken anyway
5021 if (ST.needsAlignedVGPRs()) {
5022 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5023 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5024 if (const TargetRegisterClass *SubRC =
5025 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5026 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5027 if (RC)
5028 RC = SubRC;
5029 }
5030 }
5031
5032 // Check that this is the aligned version of the class.
5033 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5034 ErrInfo = "Subtarget requires even aligned vector registers";
5035 return false;
5036 }
5037 }
5038
5039 if (RegClass != -1) {
5040 if (Reg.isVirtual())
5041 continue;
5042
5043 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5044 if (!RC->contains(Reg)) {
5045 ErrInfo = "Operand has incorrect register class.";
5046 return false;
5047 }
5048 }
5049 }
5050
5051 // Verify SDWA
5052 if (isSDWA(MI)) {
5053 if (!ST.hasSDWA()) {
5054 ErrInfo = "SDWA is not supported on this target";
5055 return false;
5056 }
5057
5058 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5059 AMDGPU::OpName::dst_sel}) {
5060 const MachineOperand *MO = getNamedOperand(MI, Op);
5061 if (!MO)
5062 continue;
5063 int64_t Imm = MO->getImm();
5064 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5065 ErrInfo = "Invalid SDWA selection";
5066 return false;
5067 }
5068 }
5069
5070 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5071
5072 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5073 if (OpIdx == -1)
5074 continue;
5075 const MachineOperand &MO = MI.getOperand(OpIdx);
5076
5077 if (!ST.hasSDWAScalar()) {
5078 // Only VGPRS on VI
5079 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5080 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5081 return false;
5082 }
5083 } else {
5084 // No immediates on GFX9
5085 if (!MO.isReg()) {
5086 ErrInfo =
5087 "Only reg allowed as operands in SDWA instructions on GFX9+";
5088 return false;
5089 }
5090 }
5091 }
5092
5093 if (!ST.hasSDWAOmod()) {
5094 // No omod allowed on VI
5095 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5096 if (OMod != nullptr &&
5097 (!OMod->isImm() || OMod->getImm() != 0)) {
5098 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5099 return false;
5100 }
5101 }
5102
5103 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5104 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5105 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5106 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5107 const MachineOperand *Src0ModsMO =
5108 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5109 unsigned Mods = Src0ModsMO->getImm();
5110 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5111 Mods & SISrcMods::SEXT) {
5112 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5113 return false;
5114 }
5115 }
5116
5117 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5118 if (isVOPC(BasicOpcode)) {
5119 if (!ST.hasSDWASdst() && DstIdx != -1) {
5120 // Only vcc allowed as dst on VI for VOPC
5121 const MachineOperand &Dst = MI.getOperand(DstIdx);
5122 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5123 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5124 return false;
5125 }
5126 } else if (!ST.hasSDWAOutModsVOPC()) {
5127 // No clamp allowed on GFX9 for VOPC
5128 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5129 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5130 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5131 return false;
5132 }
5133
5134 // No omod allowed on GFX9 for VOPC
5135 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5136 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5137 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5138 return false;
5139 }
5140 }
5141 }
5142
5143 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5144 if (DstUnused && DstUnused->isImm() &&
5145 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5146 const MachineOperand &Dst = MI.getOperand(DstIdx);
5147 if (!Dst.isReg() || !Dst.isTied()) {
5148 ErrInfo = "Dst register should have tied register";
5149 return false;
5150 }
5151
5152 const MachineOperand &TiedMO =
5153 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5154 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5155 ErrInfo =
5156 "Dst register should be tied to implicit use of preserved register";
5157 return false;
5158 }
5159 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5160 ErrInfo = "Dst register should use same physical register as preserved";
5161 return false;
5162 }
5163 }
5164 }
5165
5166 // Verify MIMG / VIMAGE / VSAMPLE
5167 if (isImage(Opcode) && !MI.mayStore()) {
5168 // Ensure that the return type used is large enough for all the options
5169 // being used. TFE/LWE require an extra result register.
5170 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5171 if (DMask) {
5172 uint64_t DMaskImm = DMask->getImm();
5173 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5174 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5175 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5176 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5177
5178 // Adjust for packed 16 bit values
5179 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5180 RegCount = divideCeil(RegCount, 2);
5181
5182 // Adjust if using LWE or TFE
5183 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5184 RegCount += 1;
5185
5186 const uint32_t DstIdx =
5187 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5188 const MachineOperand &Dst = MI.getOperand(DstIdx);
5189 if (Dst.isReg()) {
5190 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5191 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5192 if (RegCount > DstSize) {
5193 ErrInfo = "Image instruction returns too many registers for dst "
5194 "register class";
5195 return false;
5196 }
5197 }
5198 }
5199 }
5200
5201 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5202 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5203 unsigned ConstantBusCount = 0;
5204 bool UsesLiteral = false;
5205 const MachineOperand *LiteralVal = nullptr;
5206
5207 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5208 if (ImmIdx != -1) {
5209 ++ConstantBusCount;
5210 UsesLiteral = true;
5211 LiteralVal = &MI.getOperand(ImmIdx);
5212 }
5213
5214 SmallVector<Register, 2> SGPRsUsed;
5215 Register SGPRUsed;
5216
5217 // Only look at the true operands. Only a real operand can use the constant
5218 // bus, and we don't want to check pseudo-operands like the source modifier
5219 // flags.
5220 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5221 if (OpIdx == -1)
5222 continue;
5223 const MachineOperand &MO = MI.getOperand(OpIdx);
5224 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5225 if (MO.isReg()) {
5226 SGPRUsed = MO.getReg();
5227 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5228 ++ConstantBusCount;
5229 SGPRsUsed.push_back(SGPRUsed);
5230 }
5231 } else if (!MO.isFI()) { // Treat FI like a register.
5232 if (!UsesLiteral) {
5233 ++ConstantBusCount;
5234 UsesLiteral = true;
5235 LiteralVal = &MO;
5236 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5237 assert(isVOP2(MI) || isVOP3(MI));
5238 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5239 return false;
5240 }
5241 }
5242 }
5243 }
5244
5245 SGPRUsed = findImplicitSGPRRead(MI);
5246 if (SGPRUsed) {
5247 // Implicit uses may safely overlap true operands
5248 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5249 return !RI.regsOverlap(SGPRUsed, SGPR);
5250 })) {
5251 ++ConstantBusCount;
5252 SGPRsUsed.push_back(SGPRUsed);
5253 }
5254 }
5255
5256 // v_writelane_b32 is an exception from constant bus restriction:
5257 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5258 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5259 Opcode != AMDGPU::V_WRITELANE_B32) {
5260 ErrInfo = "VOP* instruction violates constant bus restriction";
5261 return false;
5262 }
5263
5264 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5265 ErrInfo = "VOP3 instruction uses literal";
5266 return false;
5267 }
5268 }
5269
5270 // Special case for writelane - this can break the multiple constant bus rule,
5271 // but still can't use more than one SGPR register
5272 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5273 unsigned SGPRCount = 0;
5274 Register SGPRUsed;
5275
5276 for (int OpIdx : {Src0Idx, Src1Idx}) {
5277 if (OpIdx == -1)
5278 break;
5279
5280 const MachineOperand &MO = MI.getOperand(OpIdx);
5281
5282 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5283 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5284 if (MO.getReg() != SGPRUsed)
5285 ++SGPRCount;
5286 SGPRUsed = MO.getReg();
5287 }
5288 }
5289 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5290 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5291 return false;
5292 }
5293 }
5294 }
5295
5296 // Verify misc. restrictions on specific instructions.
5297 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5298 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5299 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5300 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5301 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5302 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5303 if (!compareMachineOp(Src0, Src1) &&
5304 !compareMachineOp(Src0, Src2)) {
5305 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5306 return false;
5307 }
5308 }
5309 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5310 SISrcMods::ABS) ||
5311 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5312 SISrcMods::ABS) ||
5313 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5314 SISrcMods::ABS)) {
5315 ErrInfo = "ABS not allowed in VOP3B instructions";
5316 return false;
5317 }
5318 }
5319
5320 if (isSOP2(MI) || isSOPC(MI)) {
5321 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5322 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5323
5324 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5325 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5326 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5327 !Src0.isIdenticalTo(Src1)) {
5328 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5329 return false;
5330 }
5331 }
5332
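 // SOPK instructions carry a 16-bit immediate; the zero-extended forms
 // (sopkIsZext) take an unsigned value, the others a signed one, and the
 // branch forms take a basic-block target instead.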
5333 if (isSOPK(MI)) {
5334 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5335 if (Desc.isBranch()) {
5336 if (!Op->isMBB()) {
5337 ErrInfo = "invalid branch target for SOPK instruction";
5338 return false;
5339 }
5340 } else {
5341 uint64_t Imm = Op->getImm();
5342 if (sopkIsZext(Opcode)) {
5343 if (!isUInt<16>(Imm)) {
5344 ErrInfo = "invalid immediate for SOPK instruction";
5345 return false;
5346 }
5347 } else {
5348 if (!isInt<16>(Imm)) {
5349 ErrInfo = "invalid immediate for SOPK instruction";
5350 return false;
5351 }
5352 }
5353 }
5354 }
5355
5356 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5357 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5358 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5359 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5360 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5361 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5362
5363 const unsigned StaticNumOps =
5364 Desc.getNumOperands() + Desc.implicit_uses().size();
5365 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5366
5367 // Allow additional implicit operands. This allows a fixup done by the post
5368 // RA scheduler where the main implicit operand is killed and implicit-defs
5369 // are added for sub-registers that remain live after this instruction.
5370 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5371 ErrInfo = "missing implicit register operands";
5372 return false;
5373 }
5374
5375 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5376 if (IsDst) {
5377 if (!Dst->isUse()) {
5378 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5379 return false;
5380 }
5381
5382 unsigned UseOpIdx;
5383 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5384 UseOpIdx != StaticNumOps + 1) {
5385 ErrInfo = "movrel implicit operands should be tied";
5386 return false;
5387 }
5388 }
5389
5390 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5391 const MachineOperand &ImpUse
5392 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5393 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5394 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5395 ErrInfo = "src0 should be subreg of implicit vector use";
5396 return false;
5397 }
5398 }
5399
5400 // Make sure we aren't losing exec uses in the td files. This mostly requires
5401 // being careful when using let Uses to try to add other use registers.
5402 if (shouldReadExec(MI)) {
5403 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5404 ErrInfo = "VALU instruction does not implicitly read exec mask";
5405 return false;
5406 }
5407 }
5408
5409 if (isSMRD(MI)) {
5410 if (MI.mayStore() &&
5411 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5412 // The register offset form of scalar stores may only use m0 as the
5413 // soffset register.
5414 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5415 if (Soff && Soff->getReg() != AMDGPU::M0) {
5416 ErrInfo = "scalar stores must use m0 as offset register";
5417 return false;
5418 }
5419 }
5420 }
5421
5422 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5423 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5424 if (Offset->getImm() != 0) {
5425 ErrInfo = "subtarget does not support offsets in flat instructions";
5426 return false;
5427 }
5428 }
5429
5430 if (isDS(MI) && !ST.hasGDS()) {
5431 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5432 if (GDSOp && GDSOp->getImm() != 0) {
5433 ErrInfo = "GDS is not supported on this subtarget";
5434 return false;
5435 }
5436 }
5437
5438 if (isImage(MI)) {
5439 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5440 if (DimOp) {
5441 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5442 AMDGPU::OpName::vaddr0);
5443 AMDGPU::OpName RSrcOpName =
5444 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5445 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5446 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5447 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5448 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5449 const AMDGPU::MIMGDimInfo *Dim =
5450 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5451
5452 if (!Dim) {
5453 ErrInfo = "dim is out of range";
5454 return false;
5455 }
5456
5457 bool IsA16 = false;
5458 if (ST.hasR128A16()) {
5459 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5460 IsA16 = R128A16->getImm() != 0;
5461 } else if (ST.hasA16()) {
5462 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5463 IsA16 = A16->getImm() != 0;
5464 }
5465
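 // In NSA encodings each address component occupies its own operand, so the
 // distance from vaddr0 to the resource operand gives the address size;
 // otherwise the whole address lives in a single vaddr0 register tuple.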
5466 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5467
5468 unsigned AddrWords =
5469 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5470
5471 unsigned VAddrWords;
5472 if (IsNSA) {
5473 VAddrWords = RsrcIdx - VAddr0Idx;
5474 if (ST.hasPartialNSAEncoding() &&
5475 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5476 unsigned LastVAddrIdx = RsrcIdx - 1;
5477 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5478 }
5479 } else {
5480 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5481 if (AddrWords > 12)
5482 AddrWords = 16;
5483 }
5484
5485 if (VAddrWords != AddrWords) {
5486 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5487 << " but got " << VAddrWords << "\n");
5488 ErrInfo = "bad vaddr size";
5489 return false;
5490 }
5491 }
5492 }
5493
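 // Validate the dpp_ctrl immediate: reject reserved encodings and any
 // row/wave controls that this generation does not support.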
5494 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5495 if (DppCt) {
5496 using namespace AMDGPU::DPP;
5497
5498 unsigned DC = DppCt->getImm();
5499 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5500 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5501 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5502 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5503 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5504 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5505 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5506 ErrInfo = "Invalid dpp_ctrl value";
5507 return false;
5508 }
5509 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5510 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5511 ErrInfo = "Invalid dpp_ctrl value: "
5512 "wavefront shifts are not supported on GFX10+";
5513 return false;
5514 }
5515 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5516 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5517 ErrInfo = "Invalid dpp_ctrl value: "
5518 "broadcasts are not supported on GFX10+";
5519 return false;
5520 }
5521 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5522 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5523 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5524 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5525 !ST.hasGFX90AInsts()) {
5526 ErrInfo = "Invalid dpp_ctrl value: "
5527 "row_newbroadcast/row_share is not supported before "
5528 "GFX90A/GFX10";
5529 return false;
5530 }
5531 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5532 ErrInfo = "Invalid dpp_ctrl value: "
5533 "row_share and row_xmask are not supported before GFX10";
5534 return false;
5535 }
5536 }
5537
5538 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5541 ErrInfo = "Invalid dpp_ctrl value: "
5542 "DP ALU dpp only support row_newbcast";
5543 return false;
5544 }
5545 }
5546
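 // For memory instructions the data and destination operands must agree on
 // AGPR vs. VGPR usage: gfx90a allows either class as long as it is
 // consistent, while older subtargets cannot use AGPRs here at all.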
5547 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5548 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5549 AMDGPU::OpName DataName =
5550 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5551 const MachineOperand *Data = getNamedOperand(MI, DataName);
5552 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5553 if (Data && !Data->isReg())
5554 Data = nullptr;
5555
5556 if (ST.hasGFX90AInsts()) {
5557 if (Dst && Data &&
5558 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5559 ErrInfo = "Invalid register class: "
5560 "vdata and vdst should be both VGPR or AGPR";
5561 return false;
5562 }
5563 if (Data && Data2 &&
5564 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5565 ErrInfo = "Invalid register class: "
5566 "both data operands should be VGPR or AGPR";
5567 return false;
5568 }
5569 } else {
5570 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5571 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5572 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5573 ErrInfo = "Invalid register class: "
5574 "agpr loads and stores not supported on this GPU";
5575 return false;
5576 }
5577 }
5578 }
5579
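 // Some subtargets require wide VGPR tuples to start at an even register;
 // check the DS_GWS data operand and the image vaddr operand, which must
 // honor this alignment.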
5580 if (ST.needsAlignedVGPRs()) {
5581 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5582 const MachineOperand *Op = getNamedOperand(MI, OpName);
5583 if (!Op)
5584 return true;
5585 Register Reg = Op->getReg();
5586 if (Reg.isPhysical())
5587 return !(RI.getHWRegIndex(Reg) & 1);
5588 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5589 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5590 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5591 };
5592
5593 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5594 Opcode == AMDGPU::DS_GWS_BARRIER) {
5595
5596 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5597 ErrInfo = "Subtarget requires even aligned vector registers "
5598 "for DS_GWS instructions";
5599 return false;
5600 }
5601 }
5602
5603 if (isMIMG(MI)) {
5604 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5605 ErrInfo = "Subtarget requires even aligned vector registers "
5606 "for vaddr operand of image instructions";
5607 return false;
5608 }
5609 }
5610 }
5611
5612 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5613 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5614 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5615 ErrInfo = "Invalid register class: "
5616 "v_accvgpr_write with an SGPR is not supported on this GPU";
5617 return false;
5618 }
5619 }
5620
5621 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5622 const MachineOperand &SrcOp = MI.getOperand(1);
5623 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5624 ErrInfo = "pseudo expects only physical SGPRs";
5625 return false;
5626 }
5627 }
5628
5629 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5630 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5631 if (!ST.hasScaleOffset()) {
5632 ErrInfo = "Subtarget does not support offset scaling";
5633 return false;
5634 }
5635 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5636 ErrInfo = "Instruction does not support offset scaling";
5637 return false;
5638 }
5639 }
5640 }
5641
5642 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5643 // information.
5644 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5645 for (unsigned I = 0; I < 3; ++I) {
5646 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
5647 return false;
5648 }
5649 }
5650
5651 return true;
5652}
5653
5654// It is more readable to list mapped opcodes on the same line.
5655// clang-format off
5656
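 // Map a scalar ALU opcode to the VALU opcode to use when the instruction
 // must be moved to the VALU; returns INSTRUCTION_LIST_END if there is no
 // direct equivalent.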
5657 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5658 switch (MI.getOpcode()) {
5659 default: return AMDGPU::INSTRUCTION_LIST_END;
5660 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5661 case AMDGPU::COPY: return AMDGPU::COPY;
5662 case AMDGPU::PHI: return AMDGPU::PHI;
5663 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5664 case AMDGPU::WQM: return AMDGPU::WQM;
5665 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5666 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5667 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5668 case AMDGPU::S_MOV_B32: {
5669 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5670 return MI.getOperand(1).isReg() ||
5671 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5672 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5673 }
5674 case AMDGPU::S_ADD_I32:
5675 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5676 case AMDGPU::S_ADDC_U32:
5677 return AMDGPU::V_ADDC_U32_e32;
5678 case AMDGPU::S_SUB_I32:
5679 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5680 // FIXME: These are not consistently handled, and selected when the carry is
5681 // used.
5682 case AMDGPU::S_ADD_U32:
5683 return AMDGPU::V_ADD_CO_U32_e32;
5684 case AMDGPU::S_SUB_U32:
5685 return AMDGPU::V_SUB_CO_U32_e32;
5686 case AMDGPU::S_ADD_U64_PSEUDO:
5687 return AMDGPU::V_ADD_U64_PSEUDO;
5688 case AMDGPU::S_SUB_U64_PSEUDO:
5689 return AMDGPU::V_SUB_U64_PSEUDO;
5690 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5691 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5692 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5693 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5694 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5695 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5696 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5697 case AMDGPU::S_XNOR_B32:
5698 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5699 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5700 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5701 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5702 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5703 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5704 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5705 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5706 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5707 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5708 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5709 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5710 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5711 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5712 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5713 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5714 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5715 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5716 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5717 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5718 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5719 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5720 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5721 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5722 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5723 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5724 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5725 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5726 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5727 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5728 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5729 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5730 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5731 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5732 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5733 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5734 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5735 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5736 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5737 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5738 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5739 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5740 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5741 case AMDGPU::S_CVT_F32_F16:
5742 case AMDGPU::S_CVT_HI_F32_F16:
5743 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5744 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5745 case AMDGPU::S_CVT_F16_F32:
5746 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5747 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5748 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5749 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5750 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5751 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5752 case AMDGPU::S_CEIL_F16:
5753 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5754 : AMDGPU::V_CEIL_F16_fake16_e64;
5755 case AMDGPU::S_FLOOR_F16:
5756 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5757 : AMDGPU::V_FLOOR_F16_fake16_e64;
5758 case AMDGPU::S_TRUNC_F16:
5759 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
5760 : AMDGPU::V_TRUNC_F16_fake16_e64;
5761 case AMDGPU::S_RNDNE_F16:
5762 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
5763 : AMDGPU::V_RNDNE_F16_fake16_e64;
5764 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5765 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5766 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5767 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5768 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5769 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5770 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5771 case AMDGPU::S_ADD_F16:
5772 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
5773 : AMDGPU::V_ADD_F16_fake16_e64;
5774 case AMDGPU::S_SUB_F16:
5775 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
5776 : AMDGPU::V_SUB_F16_fake16_e64;
5777 case AMDGPU::S_MIN_F16:
5778 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
5779 : AMDGPU::V_MIN_F16_fake16_e64;
5780 case AMDGPU::S_MAX_F16:
5781 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
5782 : AMDGPU::V_MAX_F16_fake16_e64;
5783 case AMDGPU::S_MINIMUM_F16:
5784 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
5785 : AMDGPU::V_MINIMUM_F16_fake16_e64;
5786 case AMDGPU::S_MAXIMUM_F16:
5787 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
5788 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
5789 case AMDGPU::S_MUL_F16:
5790 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
5791 : AMDGPU::V_MUL_F16_fake16_e64;
5792 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5793 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5794 case AMDGPU::S_FMAC_F16:
5795 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
5796 : AMDGPU::V_FMAC_F16_fake16_e64;
5797 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5798 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5799 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5800 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5801 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5802 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5803 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5804 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5805 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5806 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5807 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5808 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5809 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5810 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5811 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5812 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5813 case AMDGPU::S_CMP_LT_F16:
5814 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
5815 : AMDGPU::V_CMP_LT_F16_fake16_e64;
5816 case AMDGPU::S_CMP_EQ_F16:
5817 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
5818 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
5819 case AMDGPU::S_CMP_LE_F16:
5820 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
5821 : AMDGPU::V_CMP_LE_F16_fake16_e64;
5822 case AMDGPU::S_CMP_GT_F16:
5823 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
5824 : AMDGPU::V_CMP_GT_F16_fake16_e64;
5825 case AMDGPU::S_CMP_LG_F16:
5826 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
5827 : AMDGPU::V_CMP_LG_F16_fake16_e64;
5828 case AMDGPU::S_CMP_GE_F16:
5829 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
5830 : AMDGPU::V_CMP_GE_F16_fake16_e64;
5831 case AMDGPU::S_CMP_O_F16:
5832 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
5833 : AMDGPU::V_CMP_O_F16_fake16_e64;
5834 case AMDGPU::S_CMP_U_F16:
5835 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
5836 : AMDGPU::V_CMP_U_F16_fake16_e64;
5837 case AMDGPU::S_CMP_NGE_F16:
5838 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
5839 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
5840 case AMDGPU::S_CMP_NLG_F16:
5841 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
5842 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
5843 case AMDGPU::S_CMP_NGT_F16:
5844 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
5845 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
5846 case AMDGPU::S_CMP_NLE_F16:
5847 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
5848 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
5849 case AMDGPU::S_CMP_NEQ_F16:
5850 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
5851 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
5852 case AMDGPU::S_CMP_NLT_F16:
5853 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
5854 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
5855 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5856 case AMDGPU::V_S_EXP_F16_e64:
5857 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
5858 : AMDGPU::V_EXP_F16_fake16_e64;
5859 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5860 case AMDGPU::V_S_LOG_F16_e64:
5861 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
5862 : AMDGPU::V_LOG_F16_fake16_e64;
5863 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5864 case AMDGPU::V_S_RCP_F16_e64:
5865 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
5866 : AMDGPU::V_RCP_F16_fake16_e64;
5867 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5868 case AMDGPU::V_S_RSQ_F16_e64:
5869 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
5870 : AMDGPU::V_RSQ_F16_fake16_e64;
5871 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5872 case AMDGPU::V_S_SQRT_F16_e64:
5873 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
5874 : AMDGPU::V_SQRT_F16_fake16_e64;
5875 }
5876 llvm_unreachable(
5877 "Unexpected scalar opcode without corresponding vector one!");
5878}
5879
5880// clang-format on
5881
5882 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5883 MachineBasicBlock &MBB,
5884 MachineBasicBlock::iterator MBBI,
5885 const DebugLoc &DL, Register Reg,
5886 bool IsSCCLive,
5887 SlotIndexes *Indexes) const {
5888 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5889 const SIInstrInfo *TII = ST.getInstrInfo();
5890 bool IsWave32 = ST.isWave32();
5891 if (IsSCCLive) {
5892 // Insert two move instructions, one to save the original value of EXEC and
5893 // the other to turn on all bits in EXEC. This is required as we can't use
5894 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
5895 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5896 MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5897 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
5898 .addReg(Exec, RegState::Kill);
5899 auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
5900 if (Indexes) {
5901 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5902 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5903 }
5904 } else {
5905 const unsigned OrSaveExec =
5906 IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
5907 auto SaveExec =
5908 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
5909 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5910 if (Indexes)
5911 Indexes->insertMachineInstrInMaps(*SaveExec);
5912 }
5913}
5914
5915 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
5916 MachineBasicBlock::iterator MBBI,
5917 const DebugLoc &DL, Register Reg,
5918 SlotIndexes *Indexes) const {
5919 unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5920 MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5921 auto ExecRestoreMI =
5922 BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
5923 if (Indexes)
5924 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
5925}
5926
5927 MachineInstr *
5928 SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const {
5929 assert(MF.getInfo<SIMachineFunctionInfo>()->isWholeWaveFunction() &&
5930 "Not a whole wave func");
5931 MachineBasicBlock &MBB = *MF.begin();
5932 for (MachineInstr &MI : MBB)
5933 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
5934 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
5935 return &MI;
5936
5937 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
5938}
5939
5940static const TargetRegisterClass *
5941 adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
5942 const MCInstrDesc &TID, unsigned RCID,
5943 bool IsAllocatable) {
5944 if ((IsAllocatable || !ST.hasGFX90AInsts()) &&
5945 (((TID.mayLoad() || TID.mayStore()) &&
5946 !(TID.TSFlags & SIInstrFlags::Spill)) ||
5947 (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
5948 switch (RCID) {
5949 case AMDGPU::AV_32RegClassID:
5950 RCID = AMDGPU::VGPR_32RegClassID;
5951 break;
5952 case AMDGPU::AV_64RegClassID:
5953 RCID = AMDGPU::VReg_64RegClassID;
5954 break;
5955 case AMDGPU::AV_96RegClassID:
5956 RCID = AMDGPU::VReg_96RegClassID;
5957 break;
5958 case AMDGPU::AV_128RegClassID:
5959 RCID = AMDGPU::VReg_128RegClassID;
5960 break;
5961 case AMDGPU::AV_160RegClassID:
5962 RCID = AMDGPU::VReg_160RegClassID;
5963 break;
5964 case AMDGPU::AV_512RegClassID:
5965 RCID = AMDGPU::VReg_512RegClassID;
5966 break;
5967 default:
5968 break;
5969 }
5970 }
5971
5972 return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
5973}
5974
5975 const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
5976 unsigned OpNum, const TargetRegisterInfo *TRI,
5977 const MachineFunction &MF)
5978 const {
5979 if (OpNum >= TID.getNumOperands())
5980 return nullptr;
5981 auto RegClass = TID.operands()[OpNum].RegClass;
5982 bool IsAllocatable = false;
5983 if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
5984 // vdst and vdata should be both VGPR or AGPR, same for the DS instructions
5985 // with two data operands. Request register class constrained to VGPR only
5986 // if both operands are present, as Machine Copy Propagation cannot check this
5987 // constraint and possibly other passes too.
5988 //
5989 // The check is limited to FLAT and DS because atomics in non-flat encoding
5990 // have their vdst and vdata tied to be the same register.
5991 const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5992 AMDGPU::OpName::vdst);
5993 const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5994 (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
5995 : AMDGPU::OpName::vdata);
5996 if (DataIdx != -1) {
5997 IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
5998 TID.Opcode, AMDGPU::OpName::data1);
5999 }
6000 }
6001 return adjustAllocatableRegClass(ST, RI, TID, RegClass, IsAllocatable);
6002}
6003
6004 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6005 unsigned OpNo) const {
6006 const MCInstrDesc &Desc = get(MI.getOpcode());
6007 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6008 Desc.operands()[OpNo].RegClass == -1) {
6009 Register Reg = MI.getOperand(OpNo).getReg();
6010
6011 if (Reg.isVirtual()) {
6012 const MachineRegisterInfo &MRI =
6013 MI.getParent()->getParent()->getRegInfo();
6014 return MRI.getRegClass(Reg);
6015 }
6016 return RI.getPhysRegBaseClass(Reg);
6017 }
6018
6019 unsigned RCID = Desc.operands()[OpNo].RegClass;
6020 return adjustAllocatableRegClass(ST, RI, Desc, RCID, true);
6021}
6022
6023 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
6024 MachineBasicBlock::iterator I = MI;
6025 MachineBasicBlock *MBB = MI.getParent();
6026 MachineOperand &MO = MI.getOperand(OpIdx);
6027 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6028 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
6029 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6030 unsigned Size = RI.getRegSizeInBits(*RC);
6031 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6032 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6033 : AMDGPU::V_MOV_B32_e32;
6034 if (MO.isReg())
6035 Opcode = AMDGPU::COPY;
6036 else if (RI.isSGPRClass(RC))
6037 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6038
6039 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6040 Register Reg = MRI.createVirtualRegister(VRC);
6041 DebugLoc DL = MBB->findDebugLoc(I);
6042 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6043 MO.ChangeToRegister(Reg, false);
6044}
6045
6046 Register SIInstrInfo::buildExtractSubReg(
6047 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
6048 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6049 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6050 if (!SuperReg.getReg().isVirtual())
6051 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6052
6053 MachineBasicBlock *MBB = MI->getParent();
6054 const DebugLoc &DL = MI->getDebugLoc();
6055 Register SubReg = MRI.createVirtualRegister(SubRC);
6056
6057 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6058 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6059 .addReg(SuperReg.getReg(), 0, NewSubIdx);
6060 return SubReg;
6061}
6062
6063 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
6064 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
6065 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6066 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6067 if (Op.isImm()) {
6068 if (SubIdx == AMDGPU::sub0)
6069 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6070 if (SubIdx == AMDGPU::sub1)
6071 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6072
6073 llvm_unreachable("Unhandled register index for immediate");
6074 }
6075
6076 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6077 SubIdx, SubRC);
6078 return MachineOperand::CreateReg(SubReg, false);
6079}
6080
6081// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6082void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6083 assert(Inst.getNumExplicitOperands() == 3);
6084 MachineOperand Op1 = Inst.getOperand(1);
6085 Inst.removeOperand(1);
6086 Inst.addOperand(Op1);
6087}
6088
6089 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6090 const MCOperandInfo &OpInfo,
6091 const MachineOperand &MO) const {
6092 if (!MO.isReg())
6093 return false;
6094
6095 Register Reg = MO.getReg();
6096
6097 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
6098 if (Reg.isPhysical())
6099 return DRC->contains(Reg);
6100
6101 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6102
6103 if (MO.getSubReg()) {
6104 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
6105 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6106 if (!SuperRC)
6107 return false;
6108
6109 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
6110 if (!DRC)
6111 return false;
6112 }
6113 return RC->hasSuperClassEq(DRC);
6114}
6115
6116 bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6117 const MachineOperand &MO) const {
6118 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6119 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6120 unsigned Opc = MI.getOpcode();
6121
6122 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6123 // information.
6124 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6125 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6126 constexpr const AMDGPU::OpName OpNames[] = {
6127 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6128
6129 for (auto [I, OpName] : enumerate(OpNames)) {
6130 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6131 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6132 !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
6133 return false;
6134 }
6135 }
6136
6137 if (!isLegalRegOperand(MRI, OpInfo, MO))
6138 return false;
6139
6140 // check Accumulate GPR operand
6141 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6142 if (IsAGPR && !ST.hasMAIInsts())
6143 return false;
6144 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6145 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6146 return false;
6147 // Atomics should have both vdst and vdata either vgpr or agpr.
6148 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6149 const int DataIdx = AMDGPU::getNamedOperandIdx(
6150 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6151 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6152 MI.getOperand(DataIdx).isReg() &&
6153 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6154 return false;
6155 if ((int)OpIdx == DataIdx) {
6156 if (VDstIdx != -1 &&
6157 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6158 return false;
6159 // DS instructions with 2 src operands also must have tied RC.
6160 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6161 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6162 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6163 return false;
6164 }
6165
6166 // Check V_ACCVGPR_WRITE_B32_e64
6167 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6168 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6169 RI.isSGPRReg(MRI, MO.getReg()))
6170 return false;
6171 return true;
6172}
6173
6174 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6175 const MCOperandInfo &OpInfo,
6176 const MachineOperand &MO) const {
6177 if (MO.isReg())
6178 return isLegalRegOperand(MRI, OpInfo, MO);
6179
6180 // Handle non-register types that are treated like immediates.
6181 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6182 return true;
6183}
6184
6185 bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6186 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6187 const MachineOperand *MO) const {
6188 constexpr const unsigned NumOps = 3;
6189 constexpr const AMDGPU::OpName OpNames[NumOps * 2] = {
6190 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6191 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6192 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6193
6194 assert(SrcN < NumOps);
6195
6196 if (!MO) {
6197 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6198 if (SrcIdx == -1)
6199 return true;
6200 MO = &MI.getOperand(SrcIdx);
6201 }
6202
6203 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6204 return true;
6205
6206 int ModsIdx =
6207 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6208 if (ModsIdx == -1)
6209 return true;
6210
6211 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6212 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6213 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6214
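 // The SGPR source is legal only if neither op_sel nor op_sel_hi is set for
 // it.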
6215 return !OpSel && !OpSelHi;
6216}
6217
6218 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6219 const MachineOperand *MO) const {
6220 const MachineFunction &MF = *MI.getParent()->getParent();
6221 const MachineRegisterInfo &MRI = MF.getRegInfo();
6222 const MCInstrDesc &InstDesc = MI.getDesc();
6223 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6224 const TargetRegisterClass *DefinedRC =
6225 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
6226 if (!MO)
6227 MO = &MI.getOperand(OpIdx);
6228
6229 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6230
6231 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6232 const MachineOperand *UsedLiteral = nullptr;
6233
6234 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6235 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6236
6237 // TODO: Be more permissive with frame indexes.
6238 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6239 if (!LiteralLimit--)
6240 return false;
6241
6242 UsedLiteral = MO;
6243 }
6244
6245 SmallDenseSet<RegSubRegPair> SGPRsUsed;
6246 if (MO->isReg())
6247 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6248
6249 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6250 if (i == OpIdx)
6251 continue;
6252 const MachineOperand &Op = MI.getOperand(i);
6253 if (Op.isReg()) {
6254 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6255 if (!SGPRsUsed.count(SGPR) &&
6256 // FIXME: This can access off the end of the operands() array.
6257 usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) {
6258 if (--ConstantBusLimit <= 0)
6259 return false;
6260 SGPRsUsed.insert(SGPR);
6261 }
6262 } else if (AMDGPU::isSISrcOperand(InstDesc, i) &&
6263 !isInlineConstant(Op, InstDesc.operands()[i])) {
6264 // The same literal may be used multiple times.
6265 if (!UsedLiteral)
6266 UsedLiteral = &Op;
6267 else if (UsedLiteral->isIdenticalTo(Op))
6268 continue;
6269
6270 if (!LiteralLimit--)
6271 return false;
6272 if (--ConstantBusLimit <= 0)
6273 return false;
6274 }
6275 }
6276 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6277 // There can be at most one literal operand, but it can be repeated.
6278 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6279 if (i == OpIdx)
6280 continue;
6281 const MachineOperand &Op = MI.getOperand(i);
6282 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6283 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6284 !Op.isIdenticalTo(*MO))
6285 return false;
6286
6287 // Do not fold a non-inlineable and non-register operand into an
6288 // instruction that already has a frame index. The frame index handling
6289 // code could not handle well when a frame index co-exists with another
6290 // non-register operand, unless that operand is an inlineable immediate.
6291 if (Op.isFI())
6292 return false;
6293 }
6294 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6295 isF16PseudoScalarTrans(MI.getOpcode())) {
6296 return false;
6297 }
6298
6299 if (MO->isReg()) {
6300 if (!DefinedRC)
6301 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6302 return isLegalRegOperand(MI, OpIdx, *MO);
6303 }
6304
6305 if (MO->isImm()) {
6306 uint64_t Imm = MO->getImm();
6307 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6308 bool Is64BitOp = Is64BitFPOp ||
6309 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6310 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6311 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6312 if (Is64BitOp &&
6313 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6314 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6315 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6316 return false;
6317
6318 // FIXME: We can use sign extended 64-bit literals, but only for signed
6319 // operands. At the moment we do not know if an operand is signed.
6320 // Such operand will be encoded as its low 32 bits and then either
6321 // correctly sign extended or incorrectly zero extended by HW.
6322 // If 64-bit literals are supported and the literal will be encoded
6323 // as full 64 bit we still can use it.
6324 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6325 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6326 return false;
6327 }
6328 }
6329
6330 // Handle non-register types that are treated like immediates.
6331 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6332
6333 if (!DefinedRC) {
6334 // This operand expects an immediate.
6335 return true;
6336 }
6337
6338 return isImmOperandLegal(MI, OpIdx, *MO);
6339}
6340
6341 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6342 MachineInstr &MI) const {
6343 unsigned Opc = MI.getOpcode();
6344 const MCInstrDesc &InstrDesc = get(Opc);
6345
6346 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6347 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6348
6349 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6350 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6351
6352 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6353 // we need to only have one constant bus use before GFX10.
6354 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6355 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6356 RI.isSGPRReg(MRI, Src0.getReg()))
6357 legalizeOpWithMove(MI, Src0Idx);
6358
6359 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6360 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6361 // src0/src1 with V_READFIRSTLANE.
6362 if (Opc == AMDGPU::V_WRITELANE_B32) {
6363 const DebugLoc &DL = MI.getDebugLoc();
6364 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6365 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6366 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6367 .add(Src0);
6368 Src0.ChangeToRegister(Reg, false);
6369 }
6370 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6371 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6372 const DebugLoc &DL = MI.getDebugLoc();
6373 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6374 .add(Src1);
6375 Src1.ChangeToRegister(Reg, false);
6376 }
6377 return;
6378 }
6379
6380 // No VOP2 instructions support AGPRs.
6381 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
6382 legalizeOpWithMove(MI, Src0Idx);
6383
6384 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
6385 legalizeOpWithMove(MI, Src1Idx);
6386
6387 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6388 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6389 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6390 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6391 legalizeOpWithMove(MI, Src2Idx);
6392 }
6393
6394 // VOP2 src0 instructions support all operand types, so we don't need to check
6395 // their legality. If src1 is already legal, we don't need to do anything.
6396 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6397 return;
6398
6399 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6400 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6401 // select is uniform.
6402 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6403 RI.isVGPR(MRI, Src1.getReg())) {
6404 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6405 const DebugLoc &DL = MI.getDebugLoc();
6406 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6407 .add(Src1);
6408 Src1.ChangeToRegister(Reg, false);
6409 return;
6410 }
6411
6412 // We do not use commuteInstruction here because it is too aggressive and will
6413 // commute if it is possible. We only want to commute here if it improves
6414 // legality. This can be called a fairly large number of times so don't waste
6415 // compile time pointlessly swapping and checking legality again.
6416 if (HasImplicitSGPR || !MI.isCommutable()) {
6417 legalizeOpWithMove(MI, Src1Idx);
6418 return;
6419 }
6420
6421 // If src0 can be used as src1, commuting will make the operands legal.
6422 // Otherwise we have to give up and insert a move.
6423 //
6424 // TODO: Other immediate-like operand kinds could be commuted if there was a
6425 // MachineOperand::ChangeTo* for them.
6426 if ((!Src1.isImm() && !Src1.isReg()) ||
6427 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6428 legalizeOpWithMove(MI, Src1Idx);
6429 return;
6430 }
6431
6432 int CommutedOpc = commuteOpcode(MI);
6433 if (CommutedOpc == -1) {
6434 legalizeOpWithMove(MI, Src1Idx);
6435 return;
6436 }
6437
6438 MI.setDesc(get(CommutedOpc));
6439
6440 Register Src0Reg = Src0.getReg();
6441 unsigned Src0SubReg = Src0.getSubReg();
6442 bool Src0Kill = Src0.isKill();
6443
6444 if (Src1.isImm())
6445 Src0.ChangeToImmediate(Src1.getImm());
6446 else if (Src1.isReg()) {
6447 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6448 Src0.setSubReg(Src1.getSubReg());
6449 } else
6450 llvm_unreachable("Should only have register or immediate operands");
6451
6452 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6453 Src1.setSubReg(Src0SubReg);
6454 fixImplicitOperands(MI);
6455}
6456
6457// Legalize VOP3 operands. All operand types are supported for any operand
6458// but only one literal constant and only starting from GFX10.
6459 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6460 MachineInstr &MI) const {
6461 unsigned Opc = MI.getOpcode();
6462
6463 int VOP3Idx[3] = {
6464 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6465 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6466 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6467 };
6468
6469 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6470 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6471 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6472 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6473 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6474 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6475 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6476 // src1 and src2 must be scalar
6477 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6478 const DebugLoc &DL = MI.getDebugLoc();
6479 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6480 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6481 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6482 .add(Src1);
6483 Src1.ChangeToRegister(Reg, false);
6484 }
6485 if (VOP3Idx[2] != -1) {
6486 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6487 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6488 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6489 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6490 .add(Src2);
6491 Src2.ChangeToRegister(Reg, false);
6492 }
6493 }
6494 }
6495
6496 // Find the one SGPR operand we are allowed to use.
6497 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6498 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6499 SmallDenseSet<unsigned> SGPRsUsed;
6500 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6501 if (SGPRReg) {
6502 SGPRsUsed.insert(SGPRReg);
6503 --ConstantBusLimit;
6504 }
6505
6506 for (int Idx : VOP3Idx) {
6507 if (Idx == -1)
6508 break;
6509 MachineOperand &MO = MI.getOperand(Idx);
6510
6511 if (!MO.isReg()) {
6512 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6513 continue;
6514
6515 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6516 --LiteralLimit;
6517 --ConstantBusLimit;
6518 continue;
6519 }
6520
6521 --LiteralLimit;
6522 --ConstantBusLimit;
6523 legalizeOpWithMove(MI, Idx);
6524 continue;
6525 }
6526
6527 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
6528 !isOperandLegal(MI, Idx, &MO)) {
6529 legalizeOpWithMove(MI, Idx);
6530 continue;
6531 }
6532
6533 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6534 continue; // VGPRs are legal
6535
6536 // We can use one SGPR in each VOP3 instruction prior to GFX10
6537 // and two starting from GFX10.
6538 if (SGPRsUsed.count(MO.getReg()))
6539 continue;
6540 if (ConstantBusLimit > 0) {
6541 SGPRsUsed.insert(MO.getReg());
6542 --ConstantBusLimit;
6543 continue;
6544 }
6545
6546 // If we make it this far, then the operand is not legal and we must
6547 // legalize it.
6548 legalizeOpWithMove(MI, Idx);
6549 }
6550
6551 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6552 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6553 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6554 legalizeOpWithMove(MI, VOP3Idx[2]);
6555
6556 // Fix the register class of packed FP32 instructions on gfx12+. See
6557 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6558 if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
6559 for (unsigned I = 0; I < 3; ++I) {
6560 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
6561 legalizeOpWithMove(MI, VOP3Idx[I]);
6562 }
6563 }
6564}
6565
6566 Register SIInstrInfo::readlaneVGPRToSGPR(
6567 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6568 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6569 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6570 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6571 if (DstRC)
6572 SRC = RI.getCommonSubClass(SRC, DstRC);
6573
6574 Register DstReg = MRI.createVirtualRegister(SRC);
6575 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6576
6577 if (RI.hasAGPRs(VRC)) {
6578 VRC = RI.getEquivalentVGPRClass(VRC);
6579 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6580 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6581 get(TargetOpcode::COPY), NewSrcReg)
6582 .addReg(SrcReg);
6583 SrcReg = NewSrcReg;
6584 }
6585
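 // A single 32-bit source can be read directly; wider sources are read 32
 // bits at a time and reassembled with a REG_SEQUENCE.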
6586 if (SubRegs == 1) {
6587 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6588 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6589 .addReg(SrcReg);
6590 return DstReg;
6591 }
6592
6593 SmallVector<Register, 8> SRegs;
6594 for (unsigned i = 0; i < SubRegs; ++i) {
6595 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6596 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6597 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6598 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6599 SRegs.push_back(SGPR);
6600 }
6601
6602 MachineInstrBuilder MIB =
6603 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6604 get(AMDGPU::REG_SEQUENCE), DstReg);
6605 for (unsigned i = 0; i < SubRegs; ++i) {
6606 MIB.addReg(SRegs[i]);
6607 MIB.addImm(RI.getSubRegFromChannel(i));
6608 }
6609 return DstReg;
6610}
6611
6612 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6613 MachineInstr &MI) const {
6614
6615 // If the pointer is stored in VGPRs, then we need to move it to
6616 // SGPRs using v_readfirstlane. This is safe because we only select
6617 // loads with uniform pointers to SMRD instruction so we know the
6618 // pointer value is uniform.
6619 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6620 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6621 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6622 SBase->setReg(SGPR);
6623 }
6624 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6625 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6626 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6627 SOff->setReg(SGPR);
6628 }
6629}
6630
6631 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6632 unsigned Opc = Inst.getOpcode();
6633 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6634 if (OldSAddrIdx < 0)
6635 return false;
6636
6637 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6638
6639 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6640 if (NewOpc < 0)
6641 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6642 if (NewOpc < 0)
6643 return false;
6644
6645 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6646 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6647 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6648 return false;
6649
6650 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6651 if (NewVAddrIdx < 0)
6652 return false;
6653
6654 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6655
6656 // Check vaddr, it shall be zero or absent.
6657 MachineInstr *VAddrDef = nullptr;
6658 if (OldVAddrIdx >= 0) {
6659 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6660 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6661 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6662 !VAddrDef->getOperand(1).isImm() ||
6663 VAddrDef->getOperand(1).getImm() != 0)
6664 return false;
6665 }
6666
6667 const MCInstrDesc &NewDesc = get(NewOpc);
6668 Inst.setDesc(NewDesc);
6669
6670 // Callers expect iterator to be valid after this call, so modify the
6671 // instruction in place.
6672 if (OldVAddrIdx == NewVAddrIdx) {
6673 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6674 // Clear use list from the old vaddr holding a zero register.
6675 MRI.removeRegOperandFromUseList(&NewVAddr);
6676 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6677 Inst.removeOperand(OldSAddrIdx);
6678 // Update the use list with the pointer we have just moved from vaddr to
6679 // saddr position. Otherwise new vaddr will be missing from the use list.
6680 MRI.removeRegOperandFromUseList(&NewVAddr);
6681 MRI.addRegOperandToUseList(&NewVAddr);
6682 } else {
6683 assert(OldSAddrIdx == NewVAddrIdx);
6684
6685 if (OldVAddrIdx >= 0) {
6686 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6687 AMDGPU::OpName::vdst_in);
6688
6689 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6690 // it asserts. Untie the operands for now and retie them afterwards.
6691 if (NewVDstIn != -1) {
6692 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6693 Inst.untieRegOperand(OldVDstIn);
6694 }
6695
6696 Inst.removeOperand(OldVAddrIdx);
6697
6698 if (NewVDstIn != -1) {
6699 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6700 Inst.tieOperands(NewVDst, NewVDstIn);
6701 }
6702 }
6703 }
6704
6705 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6706 VAddrDef->eraseFromParent();
6707
6708 return true;
6709}
6710
6711// FIXME: Remove this when SelectionDAG is obsoleted.
6712 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6713 MachineInstr &MI) const {
6715 return;
6716
6717 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
6718 // thinks they are uniform, so a readfirstlane should be valid.
6719 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6720 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6721 return;
6722
6723 if (moveFlatAddrToVGPR(MI))
6724 return;
6725
6726 const TargetRegisterClass *DeclaredRC = getRegClass(
6727 MI.getDesc(), SAddr->getOperandNo(), &RI, *MI.getParent()->getParent());
6728
6729 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6730 SAddr->setReg(ToSGPR);
6731}
6732
6733 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6734 MachineBasicBlock::iterator I,
6735 const TargetRegisterClass *DstRC,
6736 MachineOperand &Op,
6737 MachineRegisterInfo &MRI,
6738 const DebugLoc &DL) const {
6739 Register OpReg = Op.getReg();
6740 unsigned OpSubReg = Op.getSubReg();
6741
6742 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6743 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6744
6745 // Check if operand is already the correct register class.
6746 if (DstRC == OpRC)
6747 return;
6748
6749 Register DstReg = MRI.createVirtualRegister(DstRC);
6750 auto Copy =
6751 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6752 Op.setReg(DstReg);
6753
6754 MachineInstr *Def = MRI.getVRegDef(OpReg);
6755 if (!Def)
6756 return;
6757
6758 // Try to eliminate the copy if it is copying an immediate value.
6759 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6760 foldImmediate(*Copy, *Def, OpReg, &MRI);
6761
6762 bool ImpDef = Def->isImplicitDef();
6763 while (!ImpDef && Def && Def->isCopy()) {
6764 if (Def->getOperand(1).getReg().isPhysical())
6765 break;
6766 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6767 ImpDef = Def && Def->isImplicitDef();
6768 }
6769 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6770 !ImpDef)
6771 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6772}
6773
6774// Emit the actual waterfall loop, executing the wrapped instruction for each
6775// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6776// iteration, in the worst case we execute 64 (once per lane).
6777static void
6778 emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
6779 MachineRegisterInfo &MRI,
6780 MachineBasicBlock &LoopBB,
6781 MachineBasicBlock &BodyBB,
6782 const DebugLoc &DL,
6783 ArrayRef<MachineOperand *> ScalarOps) {
6784 MachineFunction &MF = *LoopBB.getParent();
6785 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6786 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6787 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6788 unsigned SaveExecOpc =
6789 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
6790 unsigned XorTermOpc =
6791 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
6792 unsigned AndOpc =
6793 ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6794 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6795
6797 Register CondReg;
6798
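 // For each divergent scalar operand, read the current lane's value with
 // v_readfirstlane and compare it against the VGPR to build a mask of
 // matching lanes; the masks for all scalar operands are ANDed together.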
6799 for (MachineOperand *ScalarOp : ScalarOps) {
6800 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6801 unsigned NumSubRegs = RegSize / 32;
6802 Register VScalarOp = ScalarOp->getReg();
6803
6804 if (NumSubRegs == 1) {
6805 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6806
6807 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6808 .addReg(VScalarOp);
6809
6810 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6811
6812 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6813 .addReg(CurReg)
6814 .addReg(VScalarOp);
6815
6816 // Combine the comparison results with AND.
6817 if (!CondReg) // First.
6818 CondReg = NewCondReg;
6819 else { // If not the first, we create an AND.
6820 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6821 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6822 .addReg(CondReg)
6823 .addReg(NewCondReg);
6824 CondReg = AndReg;
6825 }
6826
6827 // Update ScalarOp operand to use the SGPR ScalarOp.
6828 ScalarOp->setReg(CurReg);
6829 ScalarOp->setIsKill();
6830 } else {
6831 SmallVector<Register, 8> ReadlanePieces;
6832 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6833 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6834 "Unhandled register size");
6835
6836 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6837 Register CurRegLo =
6838 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6839 Register CurRegHi =
6840 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6841
6842 // Read the next variant <- also loop target.
6843 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6844 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6845
6846 // Read the next variant <- also loop target.
6847 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6848 .addReg(VScalarOp, VScalarOpUndef,
6849 TRI->getSubRegFromChannel(Idx + 1));
6850
6851 ReadlanePieces.push_back(CurRegLo);
6852 ReadlanePieces.push_back(CurRegHi);
6853
6854 // Comparison is to be done as 64-bit.
6855 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6856 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6857 .addReg(CurRegLo)
6858 .addImm(AMDGPU::sub0)
6859 .addReg(CurRegHi)
6860 .addImm(AMDGPU::sub1);
6861
6862 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6863 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6864 NewCondReg)
6865 .addReg(CurReg);
6866 if (NumSubRegs <= 2)
6867 Cmp.addReg(VScalarOp);
6868 else
6869 Cmp.addReg(VScalarOp, VScalarOpUndef,
6870 TRI->getSubRegFromChannel(Idx, 2));
6871
6872 // Combine the comparison results with AND.
6873 if (!CondReg) // First.
6874 CondReg = NewCondReg;
6875 else { // If not the first, we create an AND.
6876 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6877 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6878 .addReg(CondReg)
6879 .addReg(NewCondReg);
6880 CondReg = AndReg;
6881 }
6882 } // End for loop.
6883
6884 const auto *SScalarOpRC =
6885 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6886 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6887
6888 // Build scalar ScalarOp.
6889 auto Merge =
6890 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6891 unsigned Channel = 0;
6892 for (Register Piece : ReadlanePieces) {
6893 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6894 }
6895
6896 // Update ScalarOp operand to use the SGPR ScalarOp.
6897 ScalarOp->setReg(SScalarOp);
6898 ScalarOp->setIsKill();
6899 }
6900 }
6901
6902 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6903 MRI.setSimpleHint(SaveExec, CondReg);
6904
6905 // Update EXEC to matching lanes, saving original to SaveExec.
6906 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
6907 .addReg(CondReg, RegState::Kill);
6908
6909 // The original instruction is here; we insert the terminators after it.
6910 I = BodyBB.end();
6911
6912 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6913 BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
6914 .addReg(Exec)
6915 .addReg(SaveExec);
6916
6917 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6918}
6919
6920// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6921// with SGPRs by iterating over all unique values across all lanes.
6922// Returns the loop basic block that now contains \p MI.
6923static MachineBasicBlock *
6924loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
6925 ArrayRef<MachineOperand *> ScalarOps,
6926 MachineDominatorTree *MDT,
6927 MachineBasicBlock::iterator Begin = nullptr,
6928 MachineBasicBlock::iterator End = nullptr) {
6929 MachineBasicBlock &MBB = *MI.getParent();
6930 MachineFunction &MF = *MBB.getParent();
6931 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6932 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6933 MachineRegisterInfo &MRI = MF.getRegInfo();
6934 if (!Begin.isValid())
6935 Begin = &MI;
6936 if (!End.isValid()) {
6937 End = &MI;
6938 ++End;
6939 }
6940 const DebugLoc &DL = MI.getDebugLoc();
6941 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6942 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6943 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6944
6945 // Save SCC. Waterfall Loop may overwrite SCC.
6946 Register SaveSCCReg;
6947
6948 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
6949 // rather than doing an unlimited scan everywhere.
6950 bool SCCNotDead =
6951 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
6952 std::numeric_limits<unsigned>::max()) !=
6953 MachineBasicBlock::LQR_Dead;
6954 if (SCCNotDead) {
6955 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6956 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
6957 .addImm(1)
6958 .addImm(0);
6959 }
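// S_CSELECT_B32 1, 0 materializes the current SCC value into SaveSCCReg;
// SCC is re-established after the loop with S_CMP_LG_U32 SaveSCCReg, 0
// (see the restore in the remainder block below).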
6960
6961 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6962
6963 // Save the EXEC mask
6964 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
6965
6966 // Killed uses in the instruction we are waterfalling around will be
6967 // incorrect due to the added control-flow.
6968 MachineBasicBlock::iterator AfterMI = MI;
6969 ++AfterMI;
6970 for (auto I = Begin; I != AfterMI; I++) {
6971 for (auto &MO : I->all_uses())
6972 MRI.clearKillFlags(MO.getReg());
6973 }
6974
6975 // To insert the loop we need to split the block. Move everything after this
6976 // point to a new block, and insert a new empty block between the two.
6977 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
6978 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
6979 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
6980 MachineFunction::iterator MBBI(MBB);
6981 ++MBBI;
6982
6983 MF.insert(MBBI, LoopBB);
6984 MF.insert(MBBI, BodyBB);
6985 MF.insert(MBBI, RemainderBB);
6986
6987 LoopBB->addSuccessor(BodyBB);
6988 BodyBB->addSuccessor(LoopBB);
6989 BodyBB->addSuccessor(RemainderBB);
6990
6991 // Move the instructions from Begin up to MI into BodyBB, and the remainder
6992 // of the block into RemainderBB.
6993 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
6994 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
6995 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
6996
6997 MBB.addSuccessor(LoopBB);
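// Resulting control flow: MBB falls through to LoopBB, LoopBB and BodyBB form
// the waterfall loop (BodyBB branches back to LoopBB until all lanes are
// done), and BodyBB finally falls through to RemainderBB, which received the
// rest of the original block.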
6998
6999 // Update dominators. We know that MBB immediately dominates LoopBB, that
7000 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7001 // RemainderBB. RemainderBB immediately dominates all of the successors
7002 // transferred to it from MBB that MBB used to properly dominate.
7003 if (MDT) {
7004 MDT->addNewBlock(LoopBB, &MBB);
7005 MDT->addNewBlock(BodyBB, LoopBB);
7006 MDT->addNewBlock(RemainderBB, BodyBB);
7007 for (auto &Succ : RemainderBB->successors()) {
7008 if (MDT->properlyDominates(&MBB, Succ)) {
7009 MDT->changeImmediateDominator(Succ, RemainderBB);
7010 }
7011 }
7012 }
7013
7014 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
7015
7016 MachineBasicBlock::iterator First = RemainderBB->begin();
7017 // Restore SCC
7018 if (SCCNotDead) {
7019 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7020 .addReg(SaveSCCReg, RegState::Kill)
7021 .addImm(0);
7022 }
7023
7024 // Restore the EXEC mask
7025 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
7026 return BodyBB;
7027}
7028
7029// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
7030static std::tuple<unsigned, unsigned>
7031extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
7032 MachineBasicBlock &MBB = *MI.getParent();
7033 MachineFunction &MF = *MBB.getParent();
7034 MachineRegisterInfo &MRI = MF.getRegInfo();
7035
7036 // Extract the ptr from the resource descriptor.
7037 unsigned RsrcPtr =
7038 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7039 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7040
7041 // Create an empty resource descriptor
7042 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7043 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7044 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7045 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7046 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7047
7048 // Zero64 = 0
7049 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7050 .addImm(0);
7051
7052 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7053 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7054 .addImm(Lo_32(RsrcDataFormat));
7055
7056 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7057 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7058 .addImm(Hi_32(RsrcDataFormat));
7059
7060 // NewSRsrc = {Zero64, SRsrcFormat}
7061 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7062 .addReg(Zero64)
7063 .addImm(AMDGPU::sub0_sub1)
7064 .addReg(SRsrcFormatLo)
7065 .addImm(AMDGPU::sub2)
7066 .addReg(SRsrcFormatHi)
7067 .addImm(AMDGPU::sub3);
7068
7069 return std::tuple(RsrcPtr, NewSRsrc);
7070}
7071
7072MachineBasicBlock *
7073SIInstrInfo::legalizeOperands(MachineInstr &MI,
7074 MachineDominatorTree *MDT) const {
7075 MachineFunction &MF = *MI.getParent()->getParent();
7076 MachineRegisterInfo &MRI = MF.getRegInfo();
7077 MachineBasicBlock *CreatedBB = nullptr;
7078
7079 // Legalize VOP2
7080 if (isVOP2(MI) || isVOPC(MI)) {
7081 legalizeOperandsVOP2(MRI, MI);
7082 return CreatedBB;
7083 }
7084
7085 // Legalize VOP3
7086 if (isVOP3(MI)) {
7087 legalizeOperandsVOP3(MRI, MI);
7088 return CreatedBB;
7089 }
7090
7091 // Legalize SMRD
7092 if (isSMRD(MI)) {
7093 legalizeOperandsSMRD(MRI, MI);
7094 return CreatedBB;
7095 }
7096
7097 // Legalize FLAT
7098 if (isFLAT(MI)) {
7099 legalizeOperandsFLAT(MRI, MI);
7100 return CreatedBB;
7101 }
7102
7103 // Legalize REG_SEQUENCE and PHI
7104 // The register class of the operands must be the same type as the register
7105 // class of the output.
7106 if (MI.getOpcode() == AMDGPU::PHI) {
7107 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
7108 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
7109 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
7110 continue;
7111 const TargetRegisterClass *OpRC =
7112 MRI.getRegClass(MI.getOperand(i).getReg());
7113 if (RI.hasVectorRegisters(OpRC)) {
7114 VRC = OpRC;
7115 } else {
7116 SRC = OpRC;
7117 }
7118 }
7119
7120 // If any of the operands are VGPR registers, then they all must be,
7121 // otherwise we will create illegal VGPR->SGPR copies when legalizing
7122 // them.
7123 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
7124 if (!VRC) {
7125 assert(SRC);
7126 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
7127 VRC = &AMDGPU::VReg_1RegClass;
7128 } else
7129 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7130 ? RI.getEquivalentAGPRClass(SRC)
7131 : RI.getEquivalentVGPRClass(SRC);
7132 } else {
7133 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7134 ? RI.getEquivalentAGPRClass(VRC)
7135 : RI.getEquivalentVGPRClass(VRC);
7136 }
7137 RC = VRC;
7138 } else {
7139 RC = SRC;
7140 }
7141
7142 // Update all the operands so they have the same type.
7143 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7144 MachineOperand &Op = MI.getOperand(I);
7145 if (!Op.isReg() || !Op.getReg().isVirtual())
7146 continue;
7147
7148 // MI is a PHI instruction.
7149 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7150 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
7151
7152 // Avoid creating no-op copies with the same src and dst reg class. These
7153 // confuse some of the machine passes.
7154 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
7155 }
7156 }
7157
7158 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7159 // VGPR dest type and SGPR sources, insert copies so all operands are
7160 // VGPRs. This seems to help operand folding / the register coalescer.
7161 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7162 MachineBasicBlock *MBB = MI.getParent();
7163 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7164 if (RI.hasVGPRs(DstRC)) {
7165 // Update all the operands so they are VGPR register classes. These may
7166 // not be the same register class because REG_SEQUENCE supports mixing
7167 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7168 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7169 MachineOperand &Op = MI.getOperand(I);
7170 if (!Op.isReg() || !Op.getReg().isVirtual())
7171 continue;
7172
7173 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7174 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7175 if (VRC == OpRC)
7176 continue;
7177
7178 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7179 Op.setIsKill();
7180 }
7181 }
7182
7183 return CreatedBB;
7184 }
7185
7186 // Legalize INSERT_SUBREG
7187 // src0 must have the same register class as dst
7188 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7189 Register Dst = MI.getOperand(0).getReg();
7190 Register Src0 = MI.getOperand(1).getReg();
7191 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7192 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7193 if (DstRC != Src0RC) {
7194 MachineBasicBlock *MBB = MI.getParent();
7195 MachineOperand &Op = MI.getOperand(1);
7196 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7197 }
7198 return CreatedBB;
7199 }
7200
7201 // Legalize SI_INIT_M0
7202 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7203 MachineOperand &Src = MI.getOperand(0);
7204 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7205 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7206 return CreatedBB;
7207 }
7208
7209 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7210 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7211 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7212 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7213 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7214 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7215 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7216 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7217 MachineOperand &Src = MI.getOperand(1);
7218 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7219 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7220 return CreatedBB;
7221 }
7222
7223 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7224 //
7225 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7226 // scratch memory access. In both cases, the legalization never involves
7227 // conversion to the addr64 form.
7228 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
7229 (isMUBUF(MI) || isMTBUF(MI)))) {
7230 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7231 ? AMDGPU::OpName::rsrc
7232 : AMDGPU::OpName::srsrc;
7233 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7234 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7235 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7236
7237 AMDGPU::OpName SampOpName =
7238 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7239 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7240 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7241 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7242
7243 return CreatedBB;
7244 }
7245
7246 // Legalize SI_CALL
7247 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7248 MachineOperand *Dest = &MI.getOperand(0);
7249 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7250 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, along with
7251 // the following copies; copies from and to physical registers also need
7252 // to be moved into the loop block.
7253 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7254 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7255
7256 // Also move the copies to physical registers into the loop block
7257 MachineBasicBlock &MBB = *MI.getParent();
7258 MachineBasicBlock::iterator Start(&MI);
7259 while (Start->getOpcode() != FrameSetupOpcode)
7260 --Start;
7261 MachineBasicBlock::iterator End(&MI);
7262 while (End->getOpcode() != FrameDestroyOpcode)
7263 ++End;
7264 // Also include following copies of the return value
7265 ++End;
7266 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7267 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7268 ++End;
7269 CreatedBB =
7270 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7271 }
7272 }
7273
7274 // Legalize s_sleep_var.
7275 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7276 const DebugLoc &DL = MI.getDebugLoc();
7277 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7278 int Src0Idx =
7279 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7280 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7281 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7282 .add(Src0);
7283 Src0.ChangeToRegister(Reg, false);
7284 return nullptr;
7285 }
7286
7287 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7288 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7289 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7290 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7291 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7292 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7293 for (MachineOperand &Src : MI.explicit_operands()) {
7294 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7295 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7296 }
7297 return CreatedBB;
7298 }
7299
7300 // Legalize MUBUF instructions.
7301 bool isSoffsetLegal = true;
7302 int SoffsetIdx =
7303 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7304 if (SoffsetIdx != -1) {
7305 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7306 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7307 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7308 isSoffsetLegal = false;
7309 }
7310 }
7311
7312 bool isRsrcLegal = true;
7313 int RsrcIdx =
7314 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7315 if (RsrcIdx != -1) {
7316 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7317 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7318 isRsrcLegal = false;
7319 }
7320
7321 // The operands are legal.
7322 if (isRsrcLegal && isSoffsetLegal)
7323 return CreatedBB;
7324
7325 if (!isRsrcLegal) {
7326 // Legalize a VGPR Rsrc
7327 //
7328 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7329 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7330 // a zero-value SRsrc.
7331 //
7332 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7333 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7334 // above.
7335 //
7336 // Otherwise we are on non-ADDR64 hardware, and/or we have
7337 // idxen/offen/bothen and we fall back to a waterfall loop.
7338
7339 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7340 MachineBasicBlock &MBB = *MI.getParent();
7341
7342 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7343 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7344 // This is already an ADDR64 instruction so we need to add the pointer
7345 // extracted from the resource descriptor to the current value of VAddr.
7346 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7347 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7348 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7349
7350 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7351 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7352 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7353
7354 unsigned RsrcPtr, NewSRsrc;
7355 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7356
7357 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7358 const DebugLoc &DL = MI.getDebugLoc();
7359 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7360 .addDef(CondReg0)
7361 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7362 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
7363 .addImm(0);
7364
7365 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7366 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7367 .addDef(CondReg1, RegState::Dead)
7368 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7369 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
7370 .addReg(CondReg0, RegState::Kill)
7371 .addImm(0);
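// The V_ADD_CO_U32 / V_ADDC_U32 pair above performs a full 64-bit add of the
// extracted resource pointer and VAddr, with CondReg0 carrying between the
// low and high halves.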
7372
7373 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7374 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7375 .addReg(NewVAddrLo)
7376 .addImm(AMDGPU::sub0)
7377 .addReg(NewVAddrHi)
7378 .addImm(AMDGPU::sub1);
7379
7380 VAddr->setReg(NewVAddr);
7381 Rsrc->setReg(NewSRsrc);
7382 } else if (!VAddr && ST.hasAddr64()) {
7383 // This instruction is the _OFFSET variant, so we need to convert it to
7384 // ADDR64.
7386 "FIXME: Need to emit flat atomics here");
7387
7388 unsigned RsrcPtr, NewSRsrc;
7389 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7390
7391 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7392 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7393 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7394 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7395 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7396
7397 // Atomics with return have an additional tied operand and are
7398 // missing some of the special bits.
7399 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7400 MachineInstr *Addr64;
7401
7402 if (!VDataIn) {
7403 // Regular buffer load / store.
7404 MachineInstrBuilder MIB =
7405 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7406 .add(*VData)
7407 .addReg(NewVAddr)
7408 .addReg(NewSRsrc)
7409 .add(*SOffset)
7410 .add(*Offset);
7411
7412 if (const MachineOperand *CPol =
7413 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7414 MIB.addImm(CPol->getImm());
7415 }
7416
7417 if (const MachineOperand *TFE =
7418 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7419 MIB.addImm(TFE->getImm());
7420 }
7421
7422 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7423
7424 MIB.cloneMemRefs(MI);
7425 Addr64 = MIB;
7426 } else {
7427 // Atomics with return.
7428 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7429 .add(*VData)
7430 .add(*VDataIn)
7431 .addReg(NewVAddr)
7432 .addReg(NewSRsrc)
7433 .add(*SOffset)
7434 .add(*Offset)
7435 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7436 .cloneMemRefs(MI);
7437 }
7438
7439 MI.removeFromParent();
7440
7441 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7442 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7443 NewVAddr)
7444 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7445 .addImm(AMDGPU::sub0)
7446 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7447 .addImm(AMDGPU::sub1);
7448 } else {
7449 // Legalize a VGPR Rsrc and soffset together.
7450 if (!isSoffsetLegal) {
7451 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7452 CreatedBB =
7453 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7454 return CreatedBB;
7455 }
7456 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7457 return CreatedBB;
7458 }
7459 }
7460
7461 // Legalize a VGPR soffset.
7462 if (!isSoffsetLegal) {
7463 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7464 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7465 return CreatedBB;
7466 }
7467 return CreatedBB;
7468}
7469
7470void SIInstrWorklist::insert(MachineInstr *MI) {
7471 InstrList.insert(MI);
7472 // Add MBUF instructions to the deferred list.
7473 int RsrcIdx =
7474 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7475 if (RsrcIdx != -1) {
7476 DeferredList.insert(MI);
7477 }
7478}
7479
7480bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7481 return DeferredList.contains(MI);
7482}
7483
7484// Legalize size mismatches between 16bit and 32bit registers in v2s copy
7485// lowering (change sgpr to vgpr).
7486// This is mainly caused by 16bit SALU and 16bit VALU using regs with different
7487// sizes. We need to legalize the sizes of the operands during the vgpr lowering
7488// chain. This can be removed after we have sgpr16 in place.
7489void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
7490 MachineRegisterInfo &MRI) const {
7491 if (!ST.useRealTrue16Insts())
7492 return;
7493
7494 unsigned Opcode = MI.getOpcode();
7495 MachineBasicBlock *MBB = MI.getParent();
7496 // Legalize operands and check for size mismatch
7497 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7498 OpIdx >= get(Opcode).getNumOperands() ||
7499 get(Opcode).operands()[OpIdx].RegClass == -1)
7500 return;
7501
7502 MachineOperand &Op = MI.getOperand(OpIdx);
7503 if (!Op.isReg() || !Op.getReg().isVirtual())
7504 return;
7505
7506 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7507 if (!RI.isVGPRClass(CurrRC))
7508 return;
7509
7510 unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
7511 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7512 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7513 Op.setSubReg(AMDGPU::lo16);
7514 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7515 const DebugLoc &DL = MI.getDebugLoc();
7516 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7517 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7518 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7519 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7520 .addReg(Op.getReg())
7521 .addImm(AMDGPU::lo16)
7522 .addReg(Undef)
7523 .addImm(AMDGPU::hi16);
7524 Op.setReg(NewDstReg);
7525 }
7526}
7527void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7528 MachineRegisterInfo &MRI) const {
7529 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7530 legalizeOperandsVALUt16(MI, OpIdx, MRI);
7531}
7532
7533void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7534 MachineDominatorTree *MDT) const {
7535
7536 while (!Worklist.empty()) {
7537 MachineInstr &Inst = *Worklist.top();
7538 Worklist.erase_top();
7539 // Skip MachineInstr in the deferred list.
7540 if (Worklist.isDeferred(&Inst))
7541 continue;
7542 moveToVALUImpl(Worklist, MDT, Inst);
7543 }
7544
7545 // Deferred list of instructions will be processed once
7546 // all the MachineInstr in the worklist are done.
7547 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7548 moveToVALUImpl(Worklist, MDT, *Inst);
7549 assert(Worklist.empty() &&
7550 "Deferred MachineInstr are not supposed to re-populate worklist");
7551 }
7552}
7553
7554void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7555 MachineDominatorTree *MDT,
7556 MachineInstr &Inst) const {
7557
7558 MachineBasicBlock *MBB = Inst.getParent();
7559 if (!MBB)
7560 return;
7561 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7562 unsigned Opcode = Inst.getOpcode();
7563 unsigned NewOpcode = getVALUOp(Inst);
7564 // Handle some special cases
7565 switch (Opcode) {
7566 default:
7567 break;
7568 case AMDGPU::S_ADD_I32:
7569 case AMDGPU::S_SUB_I32: {
7570 // FIXME: The u32 versions currently selected use the carry.
7571 bool Changed;
7572 MachineBasicBlock *CreatedBBTmp = nullptr;
7573 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7574 if (Changed)
7575 return;
7576
7577 // Default handling
7578 break;
7579 }
7580
7581 case AMDGPU::S_MUL_U64:
7582 if (ST.hasVectorMulU64()) {
7583 NewOpcode = AMDGPU::V_MUL_U64_e64;
7584 break;
7585 }
7586 // Split s_mul_u64 into 32-bit vector multiplications.
7587 splitScalarSMulU64(Worklist, Inst, MDT);
7588 Inst.eraseFromParent();
7589 return;
7590
7591 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7592 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7593 // This is a special case of s_mul_u64 where all the operands are either
7594 // zero extended or sign extended.
7595 splitScalarSMulPseudo(Worklist, Inst, MDT);
7596 Inst.eraseFromParent();
7597 return;
7598
7599 case AMDGPU::S_AND_B64:
7600 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7601 Inst.eraseFromParent();
7602 return;
7603
7604 case AMDGPU::S_OR_B64:
7605 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7606 Inst.eraseFromParent();
7607 return;
7608
7609 case AMDGPU::S_XOR_B64:
7610 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7611 Inst.eraseFromParent();
7612 return;
7613
7614 case AMDGPU::S_NAND_B64:
7615 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7616 Inst.eraseFromParent();
7617 return;
7618
7619 case AMDGPU::S_NOR_B64:
7620 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7621 Inst.eraseFromParent();
7622 return;
7623
7624 case AMDGPU::S_XNOR_B64:
7625 if (ST.hasDLInsts())
7626 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7627 else
7628 splitScalar64BitXnor(Worklist, Inst, MDT);
7629 Inst.eraseFromParent();
7630 return;
7631
7632 case AMDGPU::S_ANDN2_B64:
7633 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7634 Inst.eraseFromParent();
7635 return;
7636
7637 case AMDGPU::S_ORN2_B64:
7638 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7639 Inst.eraseFromParent();
7640 return;
7641
7642 case AMDGPU::S_BREV_B64:
7643 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7644 Inst.eraseFromParent();
7645 return;
7646
7647 case AMDGPU::S_NOT_B64:
7648 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7649 Inst.eraseFromParent();
7650 return;
7651
7652 case AMDGPU::S_BCNT1_I32_B64:
7653 splitScalar64BitBCNT(Worklist, Inst);
7654 Inst.eraseFromParent();
7655 return;
7656
7657 case AMDGPU::S_BFE_I64:
7658 splitScalar64BitBFE(Worklist, Inst);
7659 Inst.eraseFromParent();
7660 return;
7661
7662 case AMDGPU::S_FLBIT_I32_B64:
7663 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7664 Inst.eraseFromParent();
7665 return;
7666 case AMDGPU::S_FF1_I32_B64:
7667 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7668 Inst.eraseFromParent();
7669 return;
7670
7671 case AMDGPU::S_LSHL_B32:
7672 if (ST.hasOnlyRevVALUShifts()) {
7673 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7674 swapOperands(Inst);
7675 }
7676 break;
7677 case AMDGPU::S_ASHR_I32:
7678 if (ST.hasOnlyRevVALUShifts()) {
7679 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7680 swapOperands(Inst);
7681 }
7682 break;
7683 case AMDGPU::S_LSHR_B32:
7684 if (ST.hasOnlyRevVALUShifts()) {
7685 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7686 swapOperands(Inst);
7687 }
7688 break;
7689 case AMDGPU::S_LSHL_B64:
7690 if (ST.hasOnlyRevVALUShifts()) {
7691 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7692 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7693 : AMDGPU::V_LSHLREV_B64_e64;
7694 swapOperands(Inst);
7695 }
7696 break;
7697 case AMDGPU::S_ASHR_I64:
7698 if (ST.hasOnlyRevVALUShifts()) {
7699 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7700 swapOperands(Inst);
7701 }
7702 break;
7703 case AMDGPU::S_LSHR_B64:
7704 if (ST.hasOnlyRevVALUShifts()) {
7705 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7706 swapOperands(Inst);
7707 }
7708 break;
7709
7710 case AMDGPU::S_ABS_I32:
7711 lowerScalarAbs(Worklist, Inst);
7712 Inst.eraseFromParent();
7713 return;
7714
7715 case AMDGPU::S_CBRANCH_SCC0:
7716 case AMDGPU::S_CBRANCH_SCC1: {
7717 // Clear unused bits of vcc
7718 Register CondReg = Inst.getOperand(1).getReg();
7719 bool IsSCC = CondReg == AMDGPU::SCC;
7720 Register VCC = RI.getVCC();
7721 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
7722 unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
7723 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
7724 .addReg(EXEC)
7725 .addReg(IsSCC ? VCC : CondReg);
7726 Inst.removeOperand(1);
7727 } break;
7728
7729 case AMDGPU::S_BFE_U64:
7730 case AMDGPU::S_BFM_B64:
7731 llvm_unreachable("Moving this op to VALU not implemented");
7732
7733 case AMDGPU::S_PACK_LL_B32_B16:
7734 case AMDGPU::S_PACK_LH_B32_B16:
7735 case AMDGPU::S_PACK_HL_B32_B16:
7736 case AMDGPU::S_PACK_HH_B32_B16:
7737 movePackToVALU(Worklist, MRI, Inst);
7738 Inst.eraseFromParent();
7739 return;
7740
7741 case AMDGPU::S_XNOR_B32:
7742 lowerScalarXnor(Worklist, Inst);
7743 Inst.eraseFromParent();
7744 return;
7745
7746 case AMDGPU::S_NAND_B32:
7747 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7748 Inst.eraseFromParent();
7749 return;
7750
7751 case AMDGPU::S_NOR_B32:
7752 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7753 Inst.eraseFromParent();
7754 return;
7755
7756 case AMDGPU::S_ANDN2_B32:
7757 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7758 Inst.eraseFromParent();
7759 return;
7760
7761 case AMDGPU::S_ORN2_B32:
7762 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7763 Inst.eraseFromParent();
7764 return;
7765
7766 // TODO: remove as soon as everything is ready
7767 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7768 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7769 // can only be selected from the uniform SDNode.
7770 case AMDGPU::S_ADD_CO_PSEUDO:
7771 case AMDGPU::S_SUB_CO_PSEUDO: {
7772 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7773 ? AMDGPU::V_ADDC_U32_e64
7774 : AMDGPU::V_SUBB_U32_e64;
7775 const auto *CarryRC = RI.getWaveMaskRegClass();
7776
7777 Register CarryInReg = Inst.getOperand(4).getReg();
7778 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7779 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7780 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7781 .addReg(CarryInReg);
7782 }
7783
7784 Register CarryOutReg = Inst.getOperand(1).getReg();
7785
7786 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7787 MRI.getRegClass(Inst.getOperand(0).getReg())));
7788 MachineInstr *CarryOp =
7789 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7790 .addReg(CarryOutReg, RegState::Define)
7791 .add(Inst.getOperand(2))
7792 .add(Inst.getOperand(3))
7793 .addReg(CarryInReg)
7794 .addImm(0);
7795 legalizeOperands(*CarryOp);
7796 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7797 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7798 Inst.eraseFromParent();
7799 }
7800 return;
7801 case AMDGPU::S_UADDO_PSEUDO:
7802 case AMDGPU::S_USUBO_PSEUDO: {
7803 const DebugLoc &DL = Inst.getDebugLoc();
7804 MachineOperand &Dest0 = Inst.getOperand(0);
7805 MachineOperand &Dest1 = Inst.getOperand(1);
7806 MachineOperand &Src0 = Inst.getOperand(2);
7807 MachineOperand &Src1 = Inst.getOperand(3);
7808
7809 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7810 ? AMDGPU::V_ADD_CO_U32_e64
7811 : AMDGPU::V_SUB_CO_U32_e64;
7812 const TargetRegisterClass *NewRC =
7813 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7814 Register DestReg = MRI.createVirtualRegister(NewRC);
7815 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7816 .addReg(Dest1.getReg(), RegState::Define)
7817 .add(Src0)
7818 .add(Src1)
7819 .addImm(0); // clamp bit
7820
7821 legalizeOperands(*NewInstr, MDT);
7822 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7823 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7824 Worklist);
7825 Inst.eraseFromParent();
7826 }
7827 return;
7828
7829 case AMDGPU::S_CSELECT_B32:
7830 case AMDGPU::S_CSELECT_B64:
7831 lowerSelect(Worklist, Inst, MDT);
7832 Inst.eraseFromParent();
7833 return;
7834 case AMDGPU::S_CMP_EQ_I32:
7835 case AMDGPU::S_CMP_LG_I32:
7836 case AMDGPU::S_CMP_GT_I32:
7837 case AMDGPU::S_CMP_GE_I32:
7838 case AMDGPU::S_CMP_LT_I32:
7839 case AMDGPU::S_CMP_LE_I32:
7840 case AMDGPU::S_CMP_EQ_U32:
7841 case AMDGPU::S_CMP_LG_U32:
7842 case AMDGPU::S_CMP_GT_U32:
7843 case AMDGPU::S_CMP_GE_U32:
7844 case AMDGPU::S_CMP_LT_U32:
7845 case AMDGPU::S_CMP_LE_U32:
7846 case AMDGPU::S_CMP_EQ_U64:
7847 case AMDGPU::S_CMP_LG_U64:
7848 case AMDGPU::S_CMP_LT_F32:
7849 case AMDGPU::S_CMP_EQ_F32:
7850 case AMDGPU::S_CMP_LE_F32:
7851 case AMDGPU::S_CMP_GT_F32:
7852 case AMDGPU::S_CMP_LG_F32:
7853 case AMDGPU::S_CMP_GE_F32:
7854 case AMDGPU::S_CMP_O_F32:
7855 case AMDGPU::S_CMP_U_F32:
7856 case AMDGPU::S_CMP_NGE_F32:
7857 case AMDGPU::S_CMP_NLG_F32:
7858 case AMDGPU::S_CMP_NGT_F32:
7859 case AMDGPU::S_CMP_NLE_F32:
7860 case AMDGPU::S_CMP_NEQ_F32:
7861 case AMDGPU::S_CMP_NLT_F32: {
7862 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7863 auto NewInstr =
7864 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7865 .setMIFlags(Inst.getFlags());
7866 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
7867 0) {
7868 NewInstr
7869 .addImm(0) // src0_modifiers
7870 .add(Inst.getOperand(0)) // src0
7871 .addImm(0) // src1_modifiers
7872 .add(Inst.getOperand(1)) // src1
7873 .addImm(0); // clamp
7874 } else {
7875 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
7876 }
7877 legalizeOperands(*NewInstr, MDT);
7878 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7879 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7880 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7881 Inst.eraseFromParent();
7882 return;
7883 }
7884 case AMDGPU::S_CMP_LT_F16:
7885 case AMDGPU::S_CMP_EQ_F16:
7886 case AMDGPU::S_CMP_LE_F16:
7887 case AMDGPU::S_CMP_GT_F16:
7888 case AMDGPU::S_CMP_LG_F16:
7889 case AMDGPU::S_CMP_GE_F16:
7890 case AMDGPU::S_CMP_O_F16:
7891 case AMDGPU::S_CMP_U_F16:
7892 case AMDGPU::S_CMP_NGE_F16:
7893 case AMDGPU::S_CMP_NLG_F16:
7894 case AMDGPU::S_CMP_NGT_F16:
7895 case AMDGPU::S_CMP_NLE_F16:
7896 case AMDGPU::S_CMP_NEQ_F16:
7897 case AMDGPU::S_CMP_NLT_F16: {
7898 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7899 auto NewInstr =
7900 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7901 .setMIFlags(Inst.getFlags());
7902 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
7903 NewInstr
7904 .addImm(0) // src0_modifiers
7905 .add(Inst.getOperand(0)) // src0
7906 .addImm(0) // src1_modifiers
7907 .add(Inst.getOperand(1)) // src1
7908 .addImm(0); // clamp
7909 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
7910 NewInstr.addImm(0); // op_sel0
7911 } else {
7912 NewInstr
7913 .add(Inst.getOperand(0))
7914 .add(Inst.getOperand(1));
7915 }
7916 legalizeOperandsVALUt16(*NewInstr, MRI);
7917 legalizeOperands(*NewInstr, MDT);
7918 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7919 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7920 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7921 Inst.eraseFromParent();
7922 return;
7923 }
7924 case AMDGPU::S_CVT_HI_F32_F16: {
7925 const DebugLoc &DL = Inst.getDebugLoc();
7926 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7927 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7928 if (ST.useRealTrue16Insts()) {
7929 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
7930 .add(Inst.getOperand(1));
7931 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7932 .addImm(0) // src0_modifiers
7933 .addReg(TmpReg, 0, AMDGPU::hi16)
7934 .addImm(0) // clamp
7935 .addImm(0) // omod
7936 .addImm(0); // op_sel0
7937 } else {
7938 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7939 .addImm(16)
7940 .add(Inst.getOperand(1));
7941 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7942 .addImm(0) // src0_modifiers
7943 .addReg(TmpReg)
7944 .addImm(0) // clamp
7945 .addImm(0); // omod
7946 }
7947
7948 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7949 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7950 Inst.eraseFromParent();
7951 return;
7952 }
7953 case AMDGPU::S_MINIMUM_F32:
7954 case AMDGPU::S_MAXIMUM_F32: {
7955 const DebugLoc &DL = Inst.getDebugLoc();
7956 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7957 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7958 .addImm(0) // src0_modifiers
7959 .add(Inst.getOperand(1))
7960 .addImm(0) // src1_modifiers
7961 .add(Inst.getOperand(2))
7962 .addImm(0) // clamp
7963 .addImm(0); // omod
7964 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7965
7966 legalizeOperands(*NewInstr, MDT);
7967 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7968 Inst.eraseFromParent();
7969 return;
7970 }
7971 case AMDGPU::S_MINIMUM_F16:
7972 case AMDGPU::S_MAXIMUM_F16: {
7973 const DebugLoc &DL = Inst.getDebugLoc();
7974 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
7975 ? &AMDGPU::VGPR_16RegClass
7976 : &AMDGPU::VGPR_32RegClass);
7977 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7978 .addImm(0) // src0_modifiers
7979 .add(Inst.getOperand(1))
7980 .addImm(0) // src1_modifiers
7981 .add(Inst.getOperand(2))
7982 .addImm(0) // clamp
7983 .addImm(0) // omod
7984 .addImm(0); // opsel0
7985 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7986 legalizeOperandsVALUt16(*NewInstr, MRI);
7987 legalizeOperands(*NewInstr, MDT);
7988 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7989 Inst.eraseFromParent();
7990 return;
7991 }
7992 case AMDGPU::V_S_EXP_F16_e64:
7993 case AMDGPU::V_S_LOG_F16_e64:
7994 case AMDGPU::V_S_RCP_F16_e64:
7995 case AMDGPU::V_S_RSQ_F16_e64:
7996 case AMDGPU::V_S_SQRT_F16_e64: {
7997 const DebugLoc &DL = Inst.getDebugLoc();
7998 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
7999 ? &AMDGPU::VGPR_16RegClass
8000 : &AMDGPU::VGPR_32RegClass);
8001 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8002 .add(Inst.getOperand(1)) // src0_modifiers
8003 .add(Inst.getOperand(2))
8004 .add(Inst.getOperand(3)) // clamp
8005 .add(Inst.getOperand(4)) // omod
8006 .setMIFlags(Inst.getFlags());
8007 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8008 NewInstr.addImm(0); // opsel0
8009 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8010 legalizeOperandsVALUt16(*NewInstr, MRI);
8011 legalizeOperands(*NewInstr, MDT);
8012 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8013 Inst.eraseFromParent();
8014 return;
8015 }
8016 }
8017
8018 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8019 // We cannot move this instruction to the VALU, so we should try to
8020 // legalize its operands instead.
8021 legalizeOperands(Inst, MDT);
8022 return;
8023 }
8024 // Handle converting generic instructions like COPY-to-SGPR into
8025 // COPY-to-VGPR.
8026 if (NewOpcode == Opcode) {
8027 Register DstReg = Inst.getOperand(0).getReg();
8028 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8029
8030 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8031 // hope for the best.
8032 if (Inst.isCopy() && DstReg.isPhysical() &&
8033 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8034 // TODO: Only works for 32 bit registers.
8035 if (MRI.constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass)) {
8036 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8037 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
8038 .add(Inst.getOperand(1));
8039 } else {
8040 Register NewDst =
8041 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8042 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8043 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
8044 .add(Inst.getOperand(1));
8045 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
8046 DstReg)
8047 .addReg(NewDst);
8048 }
8049 Inst.eraseFromParent();
8050 return;
8051 }
8052
8053 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
8054 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
8055 // Instead of creating a copy where src and dst are the same register
8056 // class, we just replace all uses of dst with src. These kinds of
8057 // copies interfere with the heuristics MachineSink uses to decide
8058 // whether or not to split a critical edge, since the pass assumes
8059 // that copies will end up as machine instructions and not be
8060 // eliminated.
8061 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8062 Register NewDstReg = Inst.getOperand(1).getReg();
8063 MRI.replaceRegWith(DstReg, NewDstReg);
8064 MRI.clearKillFlags(NewDstReg);
8065 Inst.getOperand(0).setReg(DstReg);
8066 Inst.eraseFromParent();
8067 // Legalize t16 operand since replaceReg is called after addUsersToVALU
8068 for (MachineOperand &MO :
8069 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8070 legalizeOperandsVALUt16(*MO.getParent(), MRI);
8071 }
8072 return;
8073 }
8074
8075 // If this is a v2s copy between a 16bit and a 32bit reg,
8076 // replace the vgpr copy with a reg_sequence/extract_subreg.
8077 // This can be removed after we have sgpr16 in place.
8078 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8079 Inst.getOperand(1).getReg().isVirtual() &&
8080 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8081 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8082 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8083 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8084 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8085 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8086 get(AMDGPU::IMPLICIT_DEF), Undef);
8087 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8088 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8089 .addReg(Inst.getOperand(1).getReg())
8090 .addImm(AMDGPU::lo16)
8091 .addReg(Undef)
8092 .addImm(AMDGPU::hi16);
8093 Inst.eraseFromParent();
8094 MRI.replaceRegWith(DstReg, NewDstReg);
8095 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8096 return;
8097 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8098 AMDGPU::lo16)) {
8099 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8100 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8101 MRI.replaceRegWith(DstReg, NewDstReg);
8102 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8103 return;
8104 }
8105 }
8106
8107 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8108 MRI.replaceRegWith(DstReg, NewDstReg);
8109 legalizeOperands(Inst, MDT);
8110 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8111 return;
8112 }
8113
8114 // Use the new VALU Opcode.
8115 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8116 .setMIFlags(Inst.getFlags());
8117 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8118 // Intersperse VOP3 modifiers among the SALU operands.
8119 NewInstr->addOperand(Inst.getOperand(0));
8120 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8121 AMDGPU::OpName::src0_modifiers) >= 0)
8122 NewInstr.addImm(0);
8123 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8124 MachineOperand Src = Inst.getOperand(1);
8125 NewInstr->addOperand(Src);
8126 }
8127
8128 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8129 // We are converting these to a BFE, so we need to add the missing
8130 // operands for the size and offset.
8131 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8132 NewInstr.addImm(0);
8133 NewInstr.addImm(Size);
8134 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8135 // The VALU version adds the second operand to the result, so insert an
8136 // extra 0 operand.
8137 NewInstr.addImm(0);
8138 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8139 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8140 // If we need to move this to VGPRs, we need to unpack the second
8141 // operand back into the 2 separate ones for bit offset and width.
8142 assert(OffsetWidthOp.isImm() &&
8143 "Scalar BFE is only implemented for constant width and offset");
8144 uint32_t Imm = OffsetWidthOp.getImm();
8145
8146 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8147 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8148 NewInstr.addImm(Offset);
8149 NewInstr.addImm(BitWidth);
8150 } else {
8151 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8152 AMDGPU::OpName::src1_modifiers) >= 0)
8153 NewInstr.addImm(0);
8154 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8155 NewInstr->addOperand(Inst.getOperand(2));
8156 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8157 AMDGPU::OpName::src2_modifiers) >= 0)
8158 NewInstr.addImm(0);
8159 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8160 NewInstr->addOperand(Inst.getOperand(3));
8161 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8162 NewInstr.addImm(0);
8163 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8164 NewInstr.addImm(0);
8165 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8166 NewInstr.addImm(0);
8167 }
8168 } else {
8169 // Just copy the SALU operands.
8170 for (const MachineOperand &Op : Inst.explicit_operands())
8171 NewInstr->addOperand(Op);
8172 }
8173
8174 // Remove any references to SCC. Vector instructions can't read from it, and
8175 // we're just about to add the implicit use / defs of VCC, and we don't want
8176 // both.
8177 for (MachineOperand &Op : Inst.implicit_operands()) {
8178 if (Op.getReg() == AMDGPU::SCC) {
8179 // Only propagate through live-def of SCC.
8180 if (Op.isDef() && !Op.isDead())
8181 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8182 if (Op.isUse())
8183 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8184 }
8185 }
8186 Inst.eraseFromParent();
8187 Register NewDstReg;
8188 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8189 Register DstReg = NewInstr->getOperand(0).getReg();
8190 assert(DstReg.isVirtual());
8191 // Update the destination register class.
8192 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8193 assert(NewDstRC);
8194 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8195 MRI.replaceRegWith(DstReg, NewDstReg);
8196 }
8197 fixImplicitOperands(*NewInstr);
8198
8199 legalizeOperandsVALUt16(*NewInstr, MRI);
8200
8201 // Legalize the operands
8202 legalizeOperands(*NewInstr, MDT);
8203 if (NewDstReg)
8204 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8205}
8206
8207// Add/sub require special handling to deal with carry outs.
8208std::pair<bool, MachineBasicBlock *>
8209SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8210 MachineDominatorTree *MDT) const {
8211 if (ST.hasAddNoCarry()) {
8212 // Assume there is no user of scc since we don't select this in that case.
8213 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8214 // is used.
8215
8216 MachineBasicBlock &MBB = *Inst.getParent();
8217 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8218
8219 Register OldDstReg = Inst.getOperand(0).getReg();
8220 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8221
8222 unsigned Opc = Inst.getOpcode();
8223 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8224
8225 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8226 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8227
8228 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8229 Inst.removeOperand(3);
8230
8231 Inst.setDesc(get(NewOpc));
8232 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8234 MRI.replaceRegWith(OldDstReg, ResultReg);
8235 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8236
8237 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8238 return std::pair(true, NewBB);
8239 }
8240
8241 return std::pair(false, nullptr);
8242}
8243
8244void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8245 MachineDominatorTree *MDT) const {
8246
8247 MachineBasicBlock &MBB = *Inst.getParent();
8248 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8249 MachineBasicBlock::iterator MII = Inst;
8250 DebugLoc DL = Inst.getDebugLoc();
8251
8252 MachineOperand &Dest = Inst.getOperand(0);
8253 MachineOperand &Src0 = Inst.getOperand(1);
8254 MachineOperand &Src1 = Inst.getOperand(2);
8255 MachineOperand &Cond = Inst.getOperand(3);
8256
8257 Register CondReg = Cond.getReg();
8258 bool IsSCC = (CondReg == AMDGPU::SCC);
8259
8260 // If this is a trivial select where the condition is effectively not SCC
8261 // (CondReg is a source of copy to SCC), then the select is semantically
8262 // equivalent to copying CondReg. Hence, there is no need to create
8263 // V_CNDMASK, we can just use that and bail out.
8264 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8265 (Src1.getImm() == 0)) {
8266 MRI.replaceRegWith(Dest.getReg(), CondReg);
8267 return;
8268 }
8269
8270 Register NewCondReg = CondReg;
8271 if (IsSCC) {
8272 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8273 NewCondReg = MRI.createVirtualRegister(TC);
8274
8275 // Now look for the closest SCC def; if it is a copy,
8276 // replace CondReg with the COPY's source register.
8277 bool CopyFound = false;
8278 for (MachineInstr &CandI :
8279 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
8280 Inst.getParent()->rend())) {
8281 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8282 -1) {
8283 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8284 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8285 .addReg(CandI.getOperand(1).getReg());
8286 CopyFound = true;
8287 }
8288 break;
8289 }
8290 }
8291 if (!CopyFound) {
8292 // SCC def is not a copy
8293 // Insert a trivial select instead of creating a copy, because a copy from
8294 // SCC would semantically mean just copying a single bit, but we may need
8295 // the result to be a vector condition mask that needs preserving.
8296 unsigned Opcode =
8297 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8298 auto NewSelect =
8299 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8300 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8301 }
8302 }
8303
8304 Register NewDestReg = MRI.createVirtualRegister(
8305 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8306 MachineInstr *NewInst;
8307 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8308 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8309 .addImm(0)
8310 .add(Src1) // False
8311 .addImm(0)
8312 .add(Src0) // True
8313 .addReg(NewCondReg);
8314 } else {
8315 NewInst =
8316 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8317 .add(Src1) // False
8318 .add(Src0) // True
8319 .addReg(NewCondReg);
8320 }
8321 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8322 legalizeOperands(*NewInst, MDT);
8323 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8324}
8325
8326void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8327 MachineInstr &Inst) const {
8328 MachineBasicBlock &MBB = *Inst.getParent();
8329 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8330 MachineBasicBlock::iterator MII = Inst;
8331 DebugLoc DL = Inst.getDebugLoc();
8332
8333 MachineOperand &Dest = Inst.getOperand(0);
8334 MachineOperand &Src = Inst.getOperand(1);
8335 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8336 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8337
8338 unsigned SubOp = ST.hasAddNoCarry() ?
8339 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8340
8341 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8342 .addImm(0)
8343 .addReg(Src.getReg());
8344
8345 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8346 .addReg(Src.getReg())
8347 .addReg(TmpReg);
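// abs(x) is computed here as max(x, 0 - x): TmpReg holds the negation and
// V_MAX_I32 selects whichever of the two is non-negative.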
8348
8349 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8350 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8351}
8352
8353void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8354 MachineInstr &Inst) const {
8355 MachineBasicBlock &MBB = *Inst.getParent();
8356 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8357 MachineBasicBlock::iterator MII = Inst;
8358 const DebugLoc &DL = Inst.getDebugLoc();
8359
8360 MachineOperand &Dest = Inst.getOperand(0);
8361 MachineOperand &Src0 = Inst.getOperand(1);
8362 MachineOperand &Src1 = Inst.getOperand(2);
8363
8364 if (ST.hasDLInsts()) {
8365 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8366 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8367 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8368
8369 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8370 .add(Src0)
8371 .add(Src1);
8372
8373 MRI.replaceRegWith(Dest.getReg(), NewDest);
8374 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8375 } else {
8376 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8377 // invert either source and then perform the XOR. If either source is a
8378 // scalar register, then we can leave the inversion on the scalar unit to
8379 // achieve a better distribution of scalar and vector instructions.
8380 bool Src0IsSGPR = Src0.isReg() &&
8381 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8382 bool Src1IsSGPR = Src1.isReg() &&
8383 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8384 MachineInstr *Xor;
8385 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8386 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8387
8388 // Build a pair of scalar instructions and add them to the work list.
8389 // The next iteration over the work list will lower these to the vector
8390 // unit as necessary.
8391 if (Src0IsSGPR) {
8392 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8393 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8394 .addReg(Temp)
8395 .add(Src1);
8396 } else if (Src1IsSGPR) {
8397 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8398 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8399 .add(Src0)
8400 .addReg(Temp);
8401 } else {
8402 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8403 .add(Src0)
8404 .add(Src1);
8405 MachineInstr *Not =
8406 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8407 Worklist.insert(Not);
8408 }
8409
8410 MRI.replaceRegWith(Dest.getReg(), NewDest);
8411
8412 Worklist.insert(Xor);
8413
8414 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8415 }
8416}
8417
8418void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8419 MachineInstr &Inst,
8420 unsigned Opcode) const {
8421 MachineBasicBlock &MBB = *Inst.getParent();
8422 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8423 MachineBasicBlock::iterator MII = Inst;
8424 const DebugLoc &DL = Inst.getDebugLoc();
8425
8426 MachineOperand &Dest = Inst.getOperand(0);
8427 MachineOperand &Src0 = Inst.getOperand(1);
8428 MachineOperand &Src1 = Inst.getOperand(2);
8429
8430 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8431 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8432
8433 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8434 .add(Src0)
8435 .add(Src1);
8436
8437 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8438 .addReg(Interm);
8439
8440 Worklist.insert(&Op);
8441 Worklist.insert(&Not);
8442
8443 MRI.replaceRegWith(Dest.getReg(), NewDest);
8444 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8445}
8446
8447void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8448 MachineInstr &Inst,
8449 unsigned Opcode) const {
8450 MachineBasicBlock &MBB = *Inst.getParent();
8451 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8452 MachineBasicBlock::iterator MII = Inst;
8453 const DebugLoc &DL = Inst.getDebugLoc();
8454
8455 MachineOperand &Dest = Inst.getOperand(0);
8456 MachineOperand &Src0 = Inst.getOperand(1);
8457 MachineOperand &Src1 = Inst.getOperand(2);
8458
8459 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8460 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8461
8462 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8463 .add(Src1);
8464
8465 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8466 .add(Src0)
8467 .addReg(Interm);
8468
8469 Worklist.insert(&Not);
8470 Worklist.insert(&Op);
8471
8472 MRI.replaceRegWith(Dest.getReg(), NewDest);
8473 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8474}
8475
8476void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8477 MachineInstr &Inst, unsigned Opcode,
8478 bool Swap) const {
8479 MachineBasicBlock &MBB = *Inst.getParent();
8481
8482 MachineOperand &Dest = Inst.getOperand(0);
8483 MachineOperand &Src0 = Inst.getOperand(1);
8484 DebugLoc DL = Inst.getDebugLoc();
8485
8486 MachineBasicBlock::iterator MII = Inst;
8487
8488 const MCInstrDesc &InstDesc = get(Opcode);
8489 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8490 MRI.getRegClass(Src0.getReg()) :
8491 &AMDGPU::SGPR_32RegClass;
8492
8493 const TargetRegisterClass *Src0SubRC =
8494 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8495
8496 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8497 AMDGPU::sub0, Src0SubRC);
8498
8499 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8500 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8501 const TargetRegisterClass *NewDestSubRC =
8502 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8503
8504 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8505 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8506
8507 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8508 AMDGPU::sub1, Src0SubRC);
8509
8510 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8511 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8512
8513 if (Swap)
8514 std::swap(DestSub0, DestSub1);
8515
8516 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8517 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8518 .addReg(DestSub0)
8519 .addImm(AMDGPU::sub0)
8520 .addReg(DestSub1)
8521 .addImm(AMDGPU::sub1);
8522
8523 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8524
8525 Worklist.insert(&LoHalf);
8526 Worklist.insert(&HiHalf);
8527
8528 // We don't need to legalizeOperands here because for a single operand, src0
8529 // will support any kind of input.
8530
8531 // Move all users of this moved value.
8532 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8533}
8534
8535// There is not a vector equivalent of s_mul_u64. For this reason, we need to
8536// split the s_mul_u64 into 32-bit vector multiplications.
8537void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8538 MachineInstr &Inst,
8539 MachineDominatorTree *MDT) const {
8540 MachineBasicBlock &MBB = *Inst.getParent();
8542
8543 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8544 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8545 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8546
8547 MachineOperand &Dest = Inst.getOperand(0);
8548 MachineOperand &Src0 = Inst.getOperand(1);
8549 MachineOperand &Src1 = Inst.getOperand(2);
8550 const DebugLoc &DL = Inst.getDebugLoc();
8551 MachineBasicBlock::iterator MII = Inst;
8552
8553 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8554 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8555 const TargetRegisterClass *Src0SubRC =
8556 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8557 if (RI.isSGPRClass(Src0SubRC))
8558 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8559 const TargetRegisterClass *Src1SubRC =
8560 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8561 if (RI.isSGPRClass(Src1SubRC))
8562 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8563
8564 // First, we extract the low 32-bit and high 32-bit values from each of the
8565 // operands.
8566 MachineOperand Op0L =
8567 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8568 MachineOperand Op1L =
8569 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8570 MachineOperand Op0H =
8571 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8572 MachineOperand Op1H =
8573 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8574
8575 // The multiplication is done as follows:
8576 //
8577 // Op1H Op1L
8578 // * Op0H Op0L
8579 // --------------------
8580 // Op1H*Op0L Op1L*Op0L
8581 // + Op1H*Op0H Op1L*Op0H
8582 // -----------------------------------------
8583 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8584 //
8585 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
8586 // value and that would overflow.
8587 // The low 32-bit value is Op1L*Op0L.
8588 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
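 //
 // Worked example (illustration only): for Op0 = 0x0000000200000003 and
 // Op1 = 0x0000000400000005:
 //   Op1L*Op0L = 5*3 = 15, and mul_hi(5, 3) = 0 supplies the carry term;
 //   Op1L*Op0H + Op1H*Op0L = 5*2 + 4*3 = 22.
 // The result is therefore 0x000000160000000F, i.e. the full product
 // truncated to 64 bits.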
8589
8590 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8591 MachineInstr *Op1L_Op0H =
8592 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8593 .add(Op1L)
8594 .add(Op0H);
8595
8596 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8597 MachineInstr *Op1H_Op0L =
8598 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8599 .add(Op1H)
8600 .add(Op0L);
8601
8602 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8603 MachineInstr *Carry =
8604 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8605 .add(Op1L)
8606 .add(Op0L);
8607
8608 MachineInstr *LoHalf =
8609 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8610 .add(Op1L)
8611 .add(Op0L);
8612
8613 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8614 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8615 .addReg(Op1L_Op0H_Reg)
8616 .addReg(Op1H_Op0L_Reg);
8617
8618 MachineInstr *HiHalf =
8619 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8620 .addReg(AddReg)
8621 .addReg(CarryReg);
8622
8623 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8624 .addReg(DestSub0)
8625 .addImm(AMDGPU::sub0)
8626 .addReg(DestSub1)
8627 .addImm(AMDGPU::sub1);
8628
8629 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8630
8631 // Try to legalize the operands in case we need to swap the order to keep it
8632 // valid.
8633 legalizeOperands(*Op1L_Op0H, MDT);
8634 legalizeOperands(*Op1H_Op0L, MDT);
8635 legalizeOperands(*Carry, MDT);
8636 legalizeOperands(*LoHalf, MDT);
8637 legalizeOperands(*Add, MDT);
8638 legalizeOperands(*HiHalf, MDT);
8639
8640 // Move all users of this moved value.
8641 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8642}
8643
8644// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8645// multiplications.
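// By construction these pseudos are only formed when the high 32 bits of both
// operands are just the zero/sign extension of their low 32 bits, so the full
// 64-bit product reduces to a single 32x32 multiply:
//   lo = V_MUL_LO_U32(lo0, lo1), hi = V_MUL_HI_{U32|I32}(lo0, lo1).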
8646void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8647 MachineInstr &Inst,
8648 MachineDominatorTree *MDT) const {
8649 MachineBasicBlock &MBB = *Inst.getParent();
8651
8652 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8653 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8654 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8655
8656 MachineOperand &Dest = Inst.getOperand(0);
8657 MachineOperand &Src0 = Inst.getOperand(1);
8658 MachineOperand &Src1 = Inst.getOperand(2);
8659 const DebugLoc &DL = Inst.getDebugLoc();
8660 MachineBasicBlock::iterator MII = Inst;
8661
8662 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8663 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8664 const TargetRegisterClass *Src0SubRC =
8665 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8666 if (RI.isSGPRClass(Src0SubRC))
8667 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8668 const TargetRegisterClass *Src1SubRC =
8669 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8670 if (RI.isSGPRClass(Src1SubRC))
8671 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8672
8673 // First, we extract the low 32-bit and high 32-bit values from each of the
8674 // operands.
8675 MachineOperand Op0L =
8676 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8677 MachineOperand Op1L =
8678 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8679
8680 unsigned Opc = Inst.getOpcode();
8681 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8682 ? AMDGPU::V_MUL_HI_U32_e64
8683 : AMDGPU::V_MUL_HI_I32_e64;
8684 MachineInstr *HiHalf =
8685 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8686
8687 MachineInstr *LoHalf =
8688 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8689 .add(Op1L)
8690 .add(Op0L);
8691
8692 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8693 .addReg(DestSub0)
8694 .addImm(AMDGPU::sub0)
8695 .addReg(DestSub1)
8696 .addImm(AMDGPU::sub1);
8697
8698 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8699
8700 // Try to legalize the operands in case we need to swap the order to keep it
8701 // valid.
8702 legalizeOperands(*HiHalf, MDT);
8703 legalizeOperands(*LoHalf, MDT);
8704
8705 // Move all users of this moved value.
8706 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8707}
8708
8709void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8710 MachineInstr &Inst, unsigned Opcode,
8711 MachineDominatorTree *MDT) const {
8712 MachineBasicBlock &MBB = *Inst.getParent();
8714
8715 MachineOperand &Dest = Inst.getOperand(0);
8716 MachineOperand &Src0 = Inst.getOperand(1);
8717 MachineOperand &Src1 = Inst.getOperand(2);
8718 DebugLoc DL = Inst.getDebugLoc();
8719
8720 MachineBasicBlock::iterator MII = Inst;
8721
8722 const MCInstrDesc &InstDesc = get(Opcode);
8723 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8724 MRI.getRegClass(Src0.getReg()) :
8725 &AMDGPU::SGPR_32RegClass;
8726
8727 const TargetRegisterClass *Src0SubRC =
8728 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8729 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8730 MRI.getRegClass(Src1.getReg()) :
8731 &AMDGPU::SGPR_32RegClass;
8732
8733 const TargetRegisterClass *Src1SubRC =
8734 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8735
8736 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8737 AMDGPU::sub0, Src0SubRC);
8738 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8739 AMDGPU::sub0, Src1SubRC);
8740 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8741 AMDGPU::sub1, Src0SubRC);
8742 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8743 AMDGPU::sub1, Src1SubRC);
8744
8745 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8746 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8747 const TargetRegisterClass *NewDestSubRC =
8748 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8749
8750 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8751 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8752 .add(SrcReg0Sub0)
8753 .add(SrcReg1Sub0);
8754
8755 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8756 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8757 .add(SrcReg0Sub1)
8758 .add(SrcReg1Sub1);
8759
8760 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8761 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8762 .addReg(DestSub0)
8763 .addImm(AMDGPU::sub0)
8764 .addReg(DestSub1)
8765 .addImm(AMDGPU::sub1);
8766
8767 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8768
8769 Worklist.insert(&LoHalf);
8770 Worklist.insert(&HiHalf);
8771
8772 // Move all users of this moved value.
8773 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8774}
8775
8776void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8777 MachineInstr &Inst,
8778 MachineDominatorTree *MDT) const {
8779 MachineBasicBlock &MBB = *Inst.getParent();
8781
8782 MachineOperand &Dest = Inst.getOperand(0);
8783 MachineOperand &Src0 = Inst.getOperand(1);
8784 MachineOperand &Src1 = Inst.getOperand(2);
8785 const DebugLoc &DL = Inst.getDebugLoc();
8786
8787 MachineBasicBlock::iterator MII = Inst;
8788
8789 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8790
8791 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8792
8793 MachineOperand* Op0;
8794 MachineOperand* Op1;
8795
8796 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8797 Op0 = &Src0;
8798 Op1 = &Src1;
8799 } else {
8800 Op0 = &Src1;
8801 Op1 = &Src0;
8802 }
8803
8804 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8805 .add(*Op0);
8806
8807 Register NewDest = MRI.createVirtualRegister(DestRC);
8808
8809 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8810 .addReg(Interm)
8811 .add(*Op1);
8812
8813 MRI.replaceRegWith(Dest.getReg(), NewDest);
8814
8815 Worklist.insert(&Xor);
8816}
8817
8818void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8819 MachineInstr &Inst) const {
8820 MachineBasicBlock &MBB = *Inst.getParent();
8822
8823 MachineBasicBlock::iterator MII = Inst;
8824 const DebugLoc &DL = Inst.getDebugLoc();
8825
8826 MachineOperand &Dest = Inst.getOperand(0);
8827 MachineOperand &Src = Inst.getOperand(1);
8828
8829 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8830 const TargetRegisterClass *SrcRC = Src.isReg() ?
8831 MRI.getRegClass(Src.getReg()) :
8832 &AMDGPU::SGPR_32RegClass;
8833
8834 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8835 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8836
8837 const TargetRegisterClass *SrcSubRC =
8838 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8839
8840 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8841 AMDGPU::sub0, SrcSubRC);
8842 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8843 AMDGPU::sub1, SrcSubRC);
8844
8845 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8846
8847 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8848
8849 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8850
8851 // We don't need to legalize operands here. src0 for either instruction can be
8852 // an SGPR, and the second input is unused or determined here.
8853 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8854}
8855
8856void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8857 MachineInstr &Inst) const {
8858 MachineBasicBlock &MBB = *Inst.getParent();
8860 MachineBasicBlock::iterator MII = Inst;
8861 const DebugLoc &DL = Inst.getDebugLoc();
8862
8863 MachineOperand &Dest = Inst.getOperand(0);
8864 uint32_t Imm = Inst.getOperand(2).getImm();
8865 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8866 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
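 // For example, Imm == 0x00100000 encodes Offset == 0 and BitWidth == 16,
 // i.e. sign-extend the low 16 bits of the source (sext_inreg from i16).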
8867
8868 (void) Offset;
8869
8870 // Only sext_inreg cases handled.
8871 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8872 Offset == 0 && "Not implemented");
8873
8874 if (BitWidth < 32) {
8875 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8876 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8877 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8878
8879 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8880 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8881 .addImm(0)
8882 .addImm(BitWidth);
8883
8884 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8885 .addImm(31)
8886 .addReg(MidRegLo);
8887
8888 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8889 .addReg(MidRegLo)
8890 .addImm(AMDGPU::sub0)
8891 .addReg(MidRegHi)
8892 .addImm(AMDGPU::sub1);
8893
8894 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8895 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8896 return;
8897 }
8898
8899 MachineOperand &Src = Inst.getOperand(1);
8900 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8901 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8902
8903 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8904 .addImm(31)
8905 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8906
8907 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8908 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8909 .addImm(AMDGPU::sub0)
8910 .addReg(TmpReg)
8911 .addImm(AMDGPU::sub1);
8912
8913 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8914 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8915}
8916
8917void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8918 MachineInstr &Inst, unsigned Opcode,
8919 MachineDominatorTree *MDT) const {
8920 // (S_FLBIT_I32_B64 hi:lo) ->
8921 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8922 // (S_FF1_I32_B64 hi:lo) ->
8923// -> (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
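//
// Worked example (ctlz case): for hi:lo = 0x00000000:0x0000FFFF,
// V_FFBH_U32(hi) = 0xffffffff (no bit found), V_FFBH_U32(lo) = 16,
// uaddsat(16, 32) = 48, and umin(0xffffffff, 48) = 48, which is indeed the
// number of leading zeros of the 64-bit value.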
8924
8925 MachineBasicBlock &MBB = *Inst.getParent();
8927 MachineBasicBlock::iterator MII = Inst;
8928 const DebugLoc &DL = Inst.getDebugLoc();
8929
8930 MachineOperand &Dest = Inst.getOperand(0);
8931 MachineOperand &Src = Inst.getOperand(1);
8932
8933 const MCInstrDesc &InstDesc = get(Opcode);
8934
8935 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8936 unsigned OpcodeAdd =
8937 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8938
8939 const TargetRegisterClass *SrcRC =
8940 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
8941 const TargetRegisterClass *SrcSubRC =
8942 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8943
8944 MachineOperand SrcRegSub0 =
8945 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
8946 MachineOperand SrcRegSub1 =
8947 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
8948
8949 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8950 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8951 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8952 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8953
8954 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
8955
8956 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
8957
8958 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
8959 .addReg(IsCtlz ? MidReg1 : MidReg2)
8960 .addImm(32)
8961 .addImm(1); // enable clamp
8962
8963 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
8964 .addReg(MidReg3)
8965 .addReg(IsCtlz ? MidReg2 : MidReg1);
8966
8967 MRI.replaceRegWith(Dest.getReg(), MidReg4);
8968
8969 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
8970}
8971
8972void SIInstrInfo::addUsersToMoveToVALUWorklist(
8974 SIInstrWorklist &Worklist) const {
8975 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
8976 MachineInstr &UseMI = *MO.getParent();
8977
8978 unsigned OpNo = 0;
8979
8980 switch (UseMI.getOpcode()) {
8981 case AMDGPU::COPY:
8982 case AMDGPU::WQM:
8983 case AMDGPU::SOFT_WQM:
8984 case AMDGPU::STRICT_WWM:
8985 case AMDGPU::STRICT_WQM:
8986 case AMDGPU::REG_SEQUENCE:
8987 case AMDGPU::PHI:
8988 case AMDGPU::INSERT_SUBREG:
8989 break;
8990 default:
8991 OpNo = MO.getOperandNo();
8992 break;
8993 }
8994
8995 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo)))
8996 Worklist.insert(&UseMI);
8997 else
8998 // Legalization could change user list.
9000 }
9001}
9002
9003void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9005 MachineInstr &Inst) const {
9006 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9008 MachineOperand &Src0 = Inst.getOperand(1);
9009 MachineOperand &Src1 = Inst.getOperand(2);
9010 const DebugLoc &DL = Inst.getDebugLoc();
9011
9012 switch (Inst.getOpcode()) {
9013 case AMDGPU::S_PACK_LL_B32_B16: {
9014 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9015 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9016
9017 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9018 // 0.
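 // The lowering below computes ResultReg = (Src1 << 16) | (Src0 & 0xffff).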
9019 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9020 .addImm(0xffff);
9021
9022 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9023 .addReg(ImmReg, RegState::Kill)
9024 .add(Src0);
9025
9026 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9027 .add(Src1)
9028 .addImm(16)
9029 .addReg(TmpReg, RegState::Kill);
9030 break;
9031 }
9032 case AMDGPU::S_PACK_LH_B32_B16: {
9033 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9034 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9035 .addImm(0xffff);
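 // V_BFI_B32 with mask 0xffff computes
 // ResultReg = (Src0 & 0xffff) | (Src1 & 0xffff0000).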
9036 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9037 .addReg(ImmReg, RegState::Kill)
9038 .add(Src0)
9039 .add(Src1);
9040 break;
9041 }
9042 case AMDGPU::S_PACK_HL_B32_B16: {
9043 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9044 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9045 .addImm(16)
9046 .add(Src0);
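 // Together with the shift above this computes
 // ResultReg = (Src1 << 16) | (Src0 >> 16).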
9047 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9048 .add(Src1)
9049 .addImm(16)
9050 .addReg(TmpReg, RegState::Kill);
9051 break;
9052 }
9053 case AMDGPU::S_PACK_HH_B32_B16: {
9054 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9055 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9056 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9057 .addImm(16)
9058 .add(Src0);
9059 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9060 .addImm(0xffff0000);
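 // V_AND_OR_B32 then computes
 // ResultReg = (Src1 & 0xffff0000) | (Src0 >> 16).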
9061 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9062 .add(Src1)
9063 .addReg(ImmReg, RegState::Kill)
9064 .addReg(TmpReg, RegState::Kill);
9065 break;
9066 }
9067 default:
9068 llvm_unreachable("unhandled s_pack_* instruction");
9069 }
9070
9071 MachineOperand &Dest = Inst.getOperand(0);
9072 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9073 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9074}
9075
9076void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
9077 MachineInstr &SCCDefInst,
9078 SIInstrWorklist &Worklist,
9079 Register NewCond) const {
9080
9081 // Ensure that def inst defines SCC, which is still live.
9082 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9083 !Op.isDead() && Op.getParent() == &SCCDefInst);
9084 SmallVector<MachineInstr *, 4> CopyToDelete;
9085 // This assumes that all the users of SCC are in the same block
9086 // as the SCC def.
9087 for (MachineInstr &MI : // Skip the def inst itself.
9088 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9089 SCCDefInst.getParent()->end())) {
9090 // Check if SCC is used first.
9091 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9092 if (SCCIdx != -1) {
9093 if (MI.isCopy()) {
9094 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9095 Register DestReg = MI.getOperand(0).getReg();
9096
9097 MRI.replaceRegWith(DestReg, NewCond);
9098 CopyToDelete.push_back(&MI);
9099 } else {
9100
9101 if (NewCond.isValid())
9102 MI.getOperand(SCCIdx).setReg(NewCond);
9103
9104 Worklist.insert(&MI);
9105 }
9106 }
9107 // Exit if we find another SCC def.
9108 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9109 break;
9110 }
9111 for (auto &Copy : CopyToDelete)
9112 Copy->eraseFromParent();
9113}
9114
9115// Instructions that use SCC may be converted to VALU instructions. When that
9116// happens, the SCC register is changed to VCC_LO. The instruction that defines
9117// SCC must be changed to an instruction that defines VCC. This function makes
9118// sure that the instruction that defines SCC is added to the moveToVALU
9119// worklist.
9120void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9121 SIInstrWorklist &Worklist) const {
9122 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9123 // then there is nothing to do because the defining instruction has been
9124 // converted to a VALU already. If SCC then that instruction needs to be
9125 // converted to a VALU.
9126 for (MachineInstr &MI :
9127 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9128 SCCUseInst->getParent()->rend())) {
9129 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9130 break;
9131 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9132 Worklist.insert(&MI);
9133 break;
9134 }
9135 }
9136}
9137
9138const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9139 const MachineInstr &Inst) const {
9140 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9141
9142 switch (Inst.getOpcode()) {
9143 // For target instructions, getOpRegClass just returns the virtual register
9144 // class associated with the operand, so we need to find an equivalent VGPR
9145 // register class in order to move the instruction to the VALU.
9146 case AMDGPU::COPY:
9147 case AMDGPU::PHI:
9148 case AMDGPU::REG_SEQUENCE:
9149 case AMDGPU::INSERT_SUBREG:
9150 case AMDGPU::WQM:
9151 case AMDGPU::SOFT_WQM:
9152 case AMDGPU::STRICT_WWM:
9153 case AMDGPU::STRICT_WQM: {
9154 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9155 if (RI.isAGPRClass(SrcRC)) {
9156 if (RI.isAGPRClass(NewDstRC))
9157 return nullptr;
9158
9159 switch (Inst.getOpcode()) {
9160 case AMDGPU::PHI:
9161 case AMDGPU::REG_SEQUENCE:
9162 case AMDGPU::INSERT_SUBREG:
9163 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9164 break;
9165 default:
9166 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9167 }
9168
9169 if (!NewDstRC)
9170 return nullptr;
9171 } else {
9172 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9173 return nullptr;
9174
9175 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9176 if (!NewDstRC)
9177 return nullptr;
9178 }
9179
9180 return NewDstRC;
9181 }
9182 default:
9183 return NewDstRC;
9184 }
9185}
9186
9187// Find the one SGPR operand we are allowed to use.
9188Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9189 int OpIndices[3]) const {
9190 const MCInstrDesc &Desc = MI.getDesc();
9191
9192 // Find the one SGPR operand we are allowed to use.
9193 //
9194 // First we need to consider the instruction's operand requirements before
9195 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9196 // of VCC, but we are still bound by the constant bus requirement to only use
9197 // one.
9198 //
9199 // If the operand's class is an SGPR, we can never move it.
9200
9201 Register SGPRReg = findImplicitSGPRRead(MI);
9202 if (SGPRReg)
9203 return SGPRReg;
9204
9205 Register UsedSGPRs[3] = {Register()};
9206 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9207
9208 for (unsigned i = 0; i < 3; ++i) {
9209 int Idx = OpIndices[i];
9210 if (Idx == -1)
9211 break;
9212
9213 const MachineOperand &MO = MI.getOperand(Idx);
9214 if (!MO.isReg())
9215 continue;
9216
9217 // Is this operand statically required to be an SGPR based on the operand
9218 // constraints?
9219 const TargetRegisterClass *OpRC =
9220 RI.getRegClass(Desc.operands()[Idx].RegClass);
9221 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9222 if (IsRequiredSGPR)
9223 return MO.getReg();
9224
9225 // If this could be a VGPR or an SGPR, check the dynamic register class.
9226 Register Reg = MO.getReg();
9227 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9228 if (RI.isSGPRClass(RegRC))
9229 UsedSGPRs[i] = Reg;
9230 }
9231
9232 // We don't have a required SGPR operand, so we have a bit more freedom in
9233 // selecting operands to move.
9234
9235 // Try to select the most used SGPR. If an SGPR is equal to one of the
9236 // others, we choose that.
9237 //
9238 // e.g.
9239 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9240 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9241
9242 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9243 // prefer those.
9244
9245 if (UsedSGPRs[0]) {
9246 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9247 SGPRReg = UsedSGPRs[0];
9248 }
9249
9250 if (!SGPRReg && UsedSGPRs[1]) {
9251 if (UsedSGPRs[1] == UsedSGPRs[2])
9252 SGPRReg = UsedSGPRs[1];
9253 }
9254
9255 return SGPRReg;
9256}
9257
9259 AMDGPU::OpName OperandName) const {
9260 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9261 if (Idx == -1)
9262 return nullptr;
9263
9264 return &MI.getOperand(Idx);
9265}
9266
9272 return (Format << 44) |
9273 (1ULL << 56) | // RESOURCE_LEVEL = 1
9274 (3ULL << 60); // OOB_SELECT = 3
9275 }
9276
9277 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9278 if (ST.isAmdHsaOS()) {
9279 // Set ATC = 1. GFX9 doesn't have this bit.
9281 RsrcDataFormat |= (1ULL << 56);
9282
9283 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9284 // BTW, it disables TC L2 and therefore decreases performance.
9286 RsrcDataFormat |= (2ULL << 59);
9287 }
9288
9289 return RsrcDataFormat;
9290}
9291
9295 0xffffffff; // Size;
9296
9297 // GFX9 doesn't have ELEMENT_SIZE.
9299 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9300 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9301 }
9302
9303 // IndexStride = 64 for wave64, 32 for wave32 (field encodings 3 and 2).
9304 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9305 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9306
9307 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9308 // Clear them unless we want a huge stride.
9311 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9312
9313 return Rsrc23;
9314}
9315
9317 unsigned Opc = MI.getOpcode();
9318
9319 return isSMRD(Opc);
9320}
9321
9323 return get(Opc).mayLoad() &&
9324 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9325}
9326
9328 int &FrameIndex) const {
9329 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9330 if (!Addr || !Addr->isFI())
9331 return Register();
9332
9333 assert(!MI.memoperands_empty() &&
9334 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9335
9336 FrameIndex = Addr->getIndex();
9337 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
9338}
9339
9341 int &FrameIndex) const {
9342 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9343 assert(Addr && Addr->isFI());
9344 FrameIndex = Addr->getIndex();
9345 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
9346}
9347
9349 int &FrameIndex) const {
9350 if (!MI.mayLoad())
9351 return Register();
9352
9353 if (isMUBUF(MI) || isVGPRSpill(MI))
9354 return isStackAccess(MI, FrameIndex);
9355
9356 if (isSGPRSpill(MI))
9357 return isSGPRStackAccess(MI, FrameIndex);
9358
9359 return Register();
9360}
9361
9363 int &FrameIndex) const {
9364 if (!MI.mayStore())
9365 return Register();
9366
9367 if (isMUBUF(MI) || isVGPRSpill(MI))
9368 return isStackAccess(MI, FrameIndex);
9369
9370 if (isSGPRSpill(MI))
9371 return isSGPRStackAccess(MI, FrameIndex);
9372
9373 return Register();
9374}
9375
9377 unsigned Size = 0;
9379 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9380 while (++I != E && I->isInsideBundle()) {
9381 assert(!I->isBundle() && "No nested bundle!");
9383 }
9384
9385 return Size;
9386}
9387
9389 unsigned Opc = MI.getOpcode();
9391 unsigned DescSize = Desc.getSize();
9392
9393 // If we have a definitive size, we can use it. Otherwise we need to inspect
9394 // the operands to know the size.
9395 if (isFixedSize(MI)) {
9396 unsigned Size = DescSize;
9397
9398 // If we hit the buggy offset, an extra nop will be inserted in MC, so
9399 // estimate the worst case.
9400 if (MI.isBranch() && ST.hasOffset3fBug())
9401 Size += 4;
9402
9403 return Size;
9404 }
9405
9406 // Instructions may have a 32-bit literal encoded after them. Check
9407 // operands that could ever be literals.
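 // For example, "v_mov_b32 v0, 0x12345678" is a 4-byte VOP1 encoding followed
 // by a 4-byte literal, 8 bytes in total.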
9408 if (isVALU(MI) || isSALU(MI)) {
9409 if (isDPP(MI))
9410 return DescSize;
9411 bool HasLiteral = false;
9412 unsigned LiteralSize = 4;
9413 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9414 const MachineOperand &Op = MI.getOperand(I);
9415 const MCOperandInfo &OpInfo = Desc.operands()[I];
9416 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9417 HasLiteral = true;
9418 if (ST.has64BitLiterals()) {
9419 switch (OpInfo.OperandType) {
9420 default:
9421 break;
9423 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9424 LiteralSize = 8;
9425 break;
9427 if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
9428 LiteralSize = 8;
9429 break;
9430 }
9431 }
9432 break;
9433 }
9434 }
9435 return HasLiteral ? DescSize + LiteralSize : DescSize;
9436 }
9437
9438 // Check whether we have extra NSA words.
9439 if (isMIMG(MI)) {
9440 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9441 if (VAddr0Idx < 0)
9442 return 8;
9443
9444 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9445 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9446 }
9447
9448 switch (Opc) {
9449 case TargetOpcode::BUNDLE:
9450 return getInstBundleSize(MI);
9451 case TargetOpcode::INLINEASM:
9452 case TargetOpcode::INLINEASM_BR: {
9453 const MachineFunction *MF = MI.getParent()->getParent();
9454 const char *AsmStr = MI.getOperand(0).getSymbolName();
9455 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9456 }
9457 default:
9458 if (MI.isMetaInstruction())
9459 return 0;
9460
9461 // If this is a D16 pseudo instruction, get the correct MC code size.
9462 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9463 if (D16Info) {
9464 // Assume the d16_lo/hi variants are always the same size.
9465 unsigned LoInstOpcode = D16Info->LoOp;
9466 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9467 DescSize = Desc.getSize();
9468 }
9469
9470 return DescSize;
9471 }
9472}
9473
9475 if (!isFLAT(MI))
9476 return false;
9477
9478 if (MI.memoperands_empty())
9479 return true;
9480
9481 for (const MachineMemOperand *MMO : MI.memoperands()) {
9482 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9483 return true;
9484 }
9485 return false;
9486}
9487
9490 static const std::pair<int, const char *> TargetIndices[] = {
9491 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9492 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9493 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9494 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9495 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9496 return ArrayRef(TargetIndices);
9497}
9498
9499/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
9500/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9503 const ScheduleDAG *DAG) const {
9504 return new GCNHazardRecognizer(DAG->MF);
9505}
9506
9507/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9508/// pass.
9511 return new GCNHazardRecognizer(MF);
9512}
9513
9514// Called during:
9515// - pre-RA scheduling and post-RA scheduling
9518 const ScheduleDAGMI *DAG) const {
9519 // Borrowed from Arm Target
9520 // We would like to restrict this hazard recognizer to only
9521 // post-RA scheduling; we can tell that we're post-RA because we don't
9522 // track VRegLiveness.
9523 if (!DAG->hasVRegLiveness())
9524 return new GCNHazardRecognizer(DAG->MF);
9526}
9527
9528std::pair<unsigned, unsigned>
9530 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9531}
9532
9535 static const std::pair<unsigned, const char *> TargetFlags[] = {
9536 {MO_GOTPCREL, "amdgpu-gotprel"},
9537 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9538 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9539 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9540 {MO_REL32_LO, "amdgpu-rel32-lo"},
9541 {MO_REL32_HI, "amdgpu-rel32-hi"},
9542 {MO_REL64, "amdgpu-rel64"},
9543 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9544 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9545 {MO_ABS64, "amdgpu-abs64"},
9546 };
9547
9548 return ArrayRef(TargetFlags);
9549}
9550
9553 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9554 {
9555 {MONoClobber, "amdgpu-noclobber"},
9556 {MOLastUse, "amdgpu-last-use"},
9557 };
9558
9559 return ArrayRef(TargetFlags);
9560}
9561
9563 const MachineFunction &MF) const {
9565 assert(SrcReg.isVirtual());
9566 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9567 return AMDGPU::WWM_COPY;
9568
9569 return AMDGPU::COPY;
9570}
9571
9573 Register Reg) const {
9574 // We need to handle instructions which may be inserted during register
9575 // allocation to handle the prolog. The initial prolog instruction may have
10576 // been separated from the start of the block by spills and copies that were
10577 // inserted for the prolog. However, the insertions for scalar registers can
9578 // always be placed at the BB top as they are independent of the exec mask
9579 // value.
9580 const MachineFunction *MF = MI.getParent()->getParent();
9581 bool IsNullOrVectorRegister = true;
9582 if (Reg) {
9583 const MachineRegisterInfo &MRI = MF->getRegInfo();
9584 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9585 }
9586
9587 uint16_t Opcode = MI.getOpcode();
9589 return IsNullOrVectorRegister &&
9590 (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
9591 (Opcode == AMDGPU::IMPLICIT_DEF &&
9592 MFI->isWWMReg(MI.getOperand(0).getReg())) ||
9593 (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
9594 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9595}
9596
9600 const DebugLoc &DL,
9601 Register DestReg) const {
9602 if (ST.hasAddNoCarry())
9603 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9604
9606 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9607 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9608
9609 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9610 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9611}
9612
9615 const DebugLoc &DL,
9616 Register DestReg,
9617 RegScavenger &RS) const {
9618 if (ST.hasAddNoCarry())
9619 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9620
9621 // If available, prefer to use vcc.
9622 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9623 ? Register(RI.getVCC())
9625 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9626 0, /* AllowSpill */ false);
9627
9628 // TODO: Users need to deal with this.
9629 if (!UnusedCarry.isValid())
9630 return MachineInstrBuilder();
9631
9632 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9633 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9634}
9635
9636bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9637 switch (Opcode) {
9638 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9639 case AMDGPU::SI_KILL_I1_TERMINATOR:
9640 return true;
9641 default:
9642 return false;
9643 }
9644}
9645
9647 switch (Opcode) {
9648 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9649 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9650 case AMDGPU::SI_KILL_I1_PSEUDO:
9651 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9652 default:
9653 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9654 }
9655}
9656
9657bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9658 return Imm <= getMaxMUBUFImmOffset(ST);
9659}
9660
9662 // GFX12 has a 24-bit signed byte offset field; only its non-negative half is usable here.
9663 const unsigned OffsetBits =
9664 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9665 return (1 << OffsetBits) - 1;
9666}
9667
9669 if (!ST.isWave32())
9670 return;
9671
9672 if (MI.isInlineAsm())
9673 return;
9674
9675 for (auto &Op : MI.implicit_operands()) {
9676 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9677 Op.setReg(AMDGPU::VCC_LO);
9678 }
9679}
9680
9682 if (!isSMRD(MI))
9683 return false;
9684
9685 // Check that it is using a buffer resource.
9686 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9687 if (Idx == -1) // e.g. s_memtime
9688 return false;
9689
9690 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
9691 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9692}
9693
9694// Given Imm, split it into the values to put into the SOffset and ImmOffset
9695// fields in an MUBUF instruction. Return false if it is not possible (due to a
9696// hardware bug needing a workaround).
9697//
9698// The required alignment ensures that individual address components remain
9699// aligned if they are aligned to begin with. It also ensures that additional
9700// offsets within the given alignment can be added to the resulting ImmOffset.
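//
// Worked example (illustration, assuming a 12-bit immediate field, i.e.
// MaxOffset == 4095, and Align(4)): for Imm == 5000, MaxImm == 4092, so the
// large-offset path is taken: High == (5004 & ~4095) == 4096,
// Low == (5004 & 4095) == 908, giving ImmOffset == 908 and
// SOffset == 4096 - 4 == 4092. The two still sum to 5000, and SOffset keeps
// its low non-alignment bits set so adjacent accesses can share the register.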
9702 uint32_t &ImmOffset, Align Alignment) const {
9703 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9704 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9705 uint32_t Overflow = 0;
9706
9707 if (Imm > MaxImm) {
9708 if (Imm <= MaxImm + 64) {
9709 // Use an SOffset inline constant for 4..64
9710 Overflow = Imm - MaxImm;
9711 Imm = MaxImm;
9712 } else {
9713 // Try to keep the same value in SOffset for adjacent loads, so that
9714 // the corresponding register contents can be re-used.
9715 //
9716 // Load values with all low-bits (except for alignment bits) set into
9717 // SOffset, so that a larger range of values can be covered using
9718 // s_movk_i32.
9719 //
9720 // Atomic operations fail to work correctly when individual address
9721 // components are unaligned, even if their sum is aligned.
9722 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9723 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9724 Imm = Low;
9725 Overflow = High - Alignment.value();
9726 }
9727 }
9728
9729 if (Overflow > 0) {
9730 // There is a hardware bug in SI and CI which prevents address clamping in
9731 // MUBUF instructions from working correctly with SOffsets. The immediate
9732 // offset is unaffected.
9734 return false;
9735
9736 // It is not possible to set an immediate in the SOffset field on some targets.
9737 if (ST.hasRestrictedSOffset())
9738 return false;
9739 }
9740
9741 ImmOffset = Imm;
9742 SOffset = Overflow;
9743 return true;
9744}
9745
9746// Depending on the used address space and instructions, some immediate offsets
9747// are allowed and some are not.
9748// Pre-GFX12, flat instruction offsets can only be non-negative; global and
9749// scratch instruction offsets can also be negative. On GFX12, offsets can be
9750// negative for all variants.
9751//
9752// There are several bugs related to these offsets:
9753// On gfx10.1, flat instructions that go into the global address space cannot
9754// use an offset.
9755//
9756// For scratch instructions, the address can be either an SGPR or a VGPR.
9757// The following offsets can be used, depending on the architecture (x means
9758// cannot be used):
9759// +----------------------------+------+------+
9760// | Address-Mode | SGPR | VGPR |
9761// +----------------------------+------+------+
9762// | gfx9 | | |
9763// | negative, 4-aligned offset | x | ok |
9764// | negative, unaligned offset | x | ok |
9765// +----------------------------+------+------+
9766// | gfx10 | | |
9767// | negative, 4-aligned offset | ok | ok |
9768// | negative, unaligned offset | ok | x |
9769// +----------------------------+------+------+
9770// | gfx10.3 | | |
9771// | negative, 4-aligned offset | ok | ok |
9772// | negative, unaligned offset | ok | ok |
9773// +----------------------------+------+------+
9774//
9775// This function ignores the addressing mode, so if an offset cannot be used in
9776// one addressing mode, it is considered illegal.
9777bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9778 uint64_t FlatVariant) const {
9779 // TODO: Should 0 be special cased?
9780 if (!ST.hasFlatInstOffsets())
9781 return false;
9782
9783 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9784 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9785 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9786 return false;
9787
9789 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9790 (Offset % 4) != 0) {
9791 return false;
9792 }
9793
9794 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9795 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9796 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9797}
9798
9799// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what is not.
9800std::pair<int64_t, int64_t>
9801SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9802 uint64_t FlatVariant) const {
9803 int64_t RemainderOffset = COffsetVal;
9804 int64_t ImmField = 0;
9805
9806 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9807 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9808
9809 if (AllowNegative) {
9810 // Use signed division by a power of two to truncate towards 0.
9811 int64_t D = 1LL << NumBits;
9812 RemainderOffset = (COffsetVal / D) * D;
9813 ImmField = COffsetVal - RemainderOffset;
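    // Illustration (assuming a 13-bit signed offset field, so NumBits == 12
    // and D == 4096): for COffsetVal == -5000, RemainderOffset == -4096 and
    // ImmField == -904; the two parts still sum to the original offset.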
9814
9816 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9817 (ImmField % 4) != 0) {
9818 // Make ImmField a multiple of 4
9819 RemainderOffset += ImmField % 4;
9820 ImmField -= ImmField % 4;
9821 }
9822 } else if (COffsetVal >= 0) {
9823 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9824 RemainderOffset = COffsetVal - ImmField;
9825 }
9826
9827 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9828 assert(RemainderOffset + ImmField == COffsetVal);
9829 return {ImmField, RemainderOffset};
9830}
9831
9833 if (ST.hasNegativeScratchOffsetBug() &&
9834 FlatVariant == SIInstrFlags::FlatScratch)
9835 return false;
9836
9837 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9838}
9839
9840static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9841 switch (ST.getGeneration()) {
9842 default:
9843 break;
9846 return SIEncodingFamily::SI;
9849 return SIEncodingFamily::VI;
9855 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
9857 }
9858 llvm_unreachable("Unknown subtarget generation!");
9859}
9860
9861bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9862 switch(MCOp) {
9863 // These opcodes use indirect register addressing so
9864 // they need special handling by codegen (currently missing).
9865 // Therefore it is too risky to allow these opcodes
9866 // to be selected by dpp combiner or sdwa peepholer.
9867 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9868 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9869 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9870 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9871 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9872 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9873 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9874 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9875 return true;
9876 default:
9877 return false;
9878 }
9879}
9880
9881#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
9882 case OPCODE##_dpp: \
9883 case OPCODE##_e32: \
9884 case OPCODE##_e64: \
9885 case OPCODE##_e64_dpp: \
9886 case OPCODE##_sdwa:
9887
9888static bool isRenamedInGFX9(int Opcode) {
9889 switch (Opcode) {
9890 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
9891 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
9892 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
9893 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
9894 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
9895 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
9896 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
9897 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
9898 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
9899 //
9900 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
9901 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
9902 case AMDGPU::V_FMA_F16_gfx9_e64:
9903 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
9904 case AMDGPU::V_INTERP_P2_F16:
9905 case AMDGPU::V_MAD_F16_e64:
9906 case AMDGPU::V_MAD_U16_e64:
9907 case AMDGPU::V_MAD_I16_e64:
9908 return true;
9909 default:
9910 return false;
9911 }
9912}
9913
9914int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9915 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9916
9917 unsigned Gen = subtargetEncodingFamily(ST);
9918
9921
9922 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9923 // subtarget has UnpackedD16VMem feature.
9924 // TODO: remove this when we discard GFX80 encoding.
9925 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
9927
9928 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
9929 switch (ST.getGeneration()) {
9930 default:
9932 break;
9935 break;
9938 break;
9939 }
9940 }
9941
9942 if (isMAI(Opcode)) {
9943 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
9944 if (MFMAOp != -1)
9945 Opcode = MFMAOp;
9946 }
9947
9948 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
9949
9950 if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
9952
9953 // -1 means that Opcode is already a native instruction.
9954 if (MCOp == -1)
9955 return Opcode;
9956
9957 if (ST.hasGFX90AInsts()) {
9958 uint16_t NMCOp = (uint16_t)-1;
9959 if (ST.hasGFX940Insts())
9961 if (NMCOp == (uint16_t)-1)
9963 if (NMCOp == (uint16_t)-1)
9965 if (NMCOp != (uint16_t)-1)
9966 MCOp = NMCOp;
9967 }
9968
9969 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
9970 // no encoding in the given subtarget generation.
9971 if (MCOp == (uint16_t)-1)
9972 return -1;
9973
9974 if (isAsmOnlyOpcode(MCOp))
9975 return -1;
9976
9977 return MCOp;
9978}
9979
9980static
9982 assert(RegOpnd.isReg());
9983 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
9984 getRegSubRegPair(RegOpnd);
9985}
9986
9989 assert(MI.isRegSequence());
9990 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
9991 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
9992 auto &RegOp = MI.getOperand(1 + 2 * I);
9993 return getRegOrUndef(RegOp);
9994 }
9996}
9997
9998// Try to find the definition of reg:subreg in subreg-manipulation pseudos
9999// Following a subreg of reg:subreg isn't supported
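// Illustration: given
//   %5:vreg_64 = REG_SEQUENCE %1:vgpr_32, %subreg.sub0, %2:vgpr_32, %subreg.sub1
// a query for %5.sub1 resolves to %2 (register numbers here are hypothetical).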
10002 if (!RSR.SubReg)
10003 return false;
10004 switch (MI.getOpcode()) {
10005 default: break;
10006 case AMDGPU::REG_SEQUENCE:
10007 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10008 return true;
10009 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
10010 case AMDGPU::INSERT_SUBREG:
10011 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10012 // inserted the subreg we're looking for
10013 RSR = getRegOrUndef(MI.getOperand(2));
10014 else { // the subreg in the rest of the reg
10015 auto R1 = getRegOrUndef(MI.getOperand(1));
10016 if (R1.SubReg) // subreg of subreg isn't supported
10017 return false;
10018 RSR.Reg = R1.Reg;
10019 }
10020 return true;
10021 }
10022 return false;
10023}
10024
10027 assert(MRI.isSSA());
10028 if (!P.Reg.isVirtual())
10029 return nullptr;
10030
10031 auto RSR = P;
10032 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10033 while (auto *MI = DefInst) {
10034 DefInst = nullptr;
10035 switch (MI->getOpcode()) {
10036 case AMDGPU::COPY:
10037 case AMDGPU::V_MOV_B32_e32: {
10038 auto &Op1 = MI->getOperand(1);
10039 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10040 if (Op1.isUndef())
10041 return nullptr;
10042 RSR = getRegSubRegPair(Op1);
10043 DefInst = MRI.getVRegDef(RSR.Reg);
10044 }
10045 break;
10046 }
10047 default:
10048 if (followSubRegDef(*MI, RSR)) {
10049 if (!RSR.Reg)
10050 return nullptr;
10051 DefInst = MRI.getVRegDef(RSR.Reg);
10052 }
10053 }
10054 if (!DefInst)
10055 return MI;
10056 }
10057 return nullptr;
10058}
10059
10061 Register VReg,
10062 const MachineInstr &DefMI,
10063 const MachineInstr &UseMI) {
10064 assert(MRI.isSSA() && "Must be run on SSA");
10065
10066 auto *TRI = MRI.getTargetRegisterInfo();
10067 auto *DefBB = DefMI.getParent();
10068
10069 // Don't bother searching between blocks, although it is possible this block
10070 // doesn't modify exec.
10071 if (UseMI.getParent() != DefBB)
10072 return true;
10073
10074 const int MaxInstScan = 20;
10075 int NumInst = 0;
10076
10077 // Stop scan at the use.
10078 auto E = UseMI.getIterator();
10079 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10080 if (I->isDebugInstr())
10081 continue;
10082
10083 if (++NumInst > MaxInstScan)
10084 return true;
10085
10086 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10087 return true;
10088 }
10089
10090 return false;
10091}
10092
10094 Register VReg,
10095 const MachineInstr &DefMI) {
10096 assert(MRI.isSSA() && "Must be run on SSA");
10097
10098 auto *TRI = MRI.getTargetRegisterInfo();
10099 auto *DefBB = DefMI.getParent();
10100
10101 const int MaxUseScan = 10;
10102 int NumUse = 0;
10103
10104 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10105 auto &UseInst = *Use.getParent();
10106 // Don't bother searching between blocks, although it is possible this block
10107 // doesn't modify exec.
10108 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10109 return true;
10110
10111 if (++NumUse > MaxUseScan)
10112 return true;
10113 }
10114
10115 if (NumUse == 0)
10116 return false;
10117
10118 const int MaxInstScan = 20;
10119 int NumInst = 0;
10120
10121 // Stop scan when we have seen all the uses.
10122 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10123 assert(I != DefBB->end());
10124
10125 if (I->isDebugInstr())
10126 continue;
10127
10128 if (++NumInst > MaxInstScan)
10129 return true;
10130
10131 for (const MachineOperand &Op : I->operands()) {
10132 // We don't check reg masks here as they're used only on calls:
10133 // 1. EXEC is only considered const within one BB
10134 // 2. Call should be a terminator instruction if present in a BB
10135
10136 if (!Op.isReg())
10137 continue;
10138
10139 Register Reg = Op.getReg();
10140 if (Op.isUse()) {
10141 if (Reg == VReg && --NumUse == 0)
10142 return false;
10143 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10144 return true;
10145 }
10146 }
10147}
10148
10151 const DebugLoc &DL, Register Src, Register Dst) const {
10152 auto Cur = MBB.begin();
10153 if (Cur != MBB.end())
10154 do {
10155 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10156 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10157 ++Cur;
10158 } while (Cur != MBB.end() && Cur != LastPHIIt);
10159
10160 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10161 Dst);
10162}
10163
10166 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10167 if (InsPt != MBB.end() &&
10168 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10169 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10170 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10171 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10172 InsPt++;
10173 return BuildMI(MBB, InsPt, DL,
10174 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
10175 : AMDGPU::S_MOV_B64_term),
10176 Dst)
10177 .addReg(Src, 0, SrcSubReg)
10178 .addReg(AMDGPU::EXEC, RegState::Implicit);
10179 }
10180 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10181 Dst);
10182}
10183
10184bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10185
10188 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10189 VirtRegMap *VRM) const {
10190 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10191 //
10192 // %0:sreg_32 = COPY $m0
10193 //
10194 // We explicitly chose SReg_32 for the virtual register so such a copy might
10195 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10196 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10197 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10198 // TargetInstrInfo::foldMemoryOperand() is going to try.
10199 // A similar issue also exists with spilling and reloading $exec registers.
10200 //
10201 // To prevent that, constrain the %0 register class here.
10202 if (isFullCopyInstr(MI)) {
10203 Register DstReg = MI.getOperand(0).getReg();
10204 Register SrcReg = MI.getOperand(1).getReg();
10205 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10206 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10208 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10209 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10210 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10211 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10212 return nullptr;
10213 }
10214 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10215 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10216 return nullptr;
10217 }
10218 }
10219 }
10220
10221 return nullptr;
10222}
10223
10225 const MachineInstr &MI,
10226 unsigned *PredCost) const {
10227 if (MI.isBundle()) {
10229 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10230 unsigned Lat = 0, Count = 0;
10231 for (++I; I != E && I->isBundledWithPred(); ++I) {
10232 ++Count;
10233 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10234 }
10235 return Lat + Count - 1;
10236 }
10237
10238 return SchedModel.computeInstrLatency(&MI);
10239}
10240
10243 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10244 unsigned opcode = MI.getOpcode();
10245
10246 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10247 Register Dst = MI.getOperand(0).getReg();
10248 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10249 : MI.getOperand(1).getReg();
10250 LLT DstTy = MRI.getType(Dst);
10251 LLT SrcTy = MRI.getType(Src);
10252 unsigned DstAS = DstTy.getAddressSpace();
10253 unsigned SrcAS = SrcTy.getAddressSpace();
10254 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10255 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10256 ST.hasGloballyAddressableScratch()
10257 ? InstructionUniformity::NeverUniform
10258 : InstructionUniformity::Default;
10259 };
10260
10261 // If the target supports globally addressable scratch, the mapping from
10262 // scratch memory to the flat aperture changes, so an address space cast
10263 // is no longer uniform.
10264 if (opcode == TargetOpcode::G_ADDRSPACE_CAST)
10265 return HandleAddrSpaceCast(MI);
10266
10267 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10268 auto IID = GI->getIntrinsicID();
10269 if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
10270 return InstructionUniformity::NeverUniform;
10271 if (AMDGPU::isIntrinsicAlwaysUniform(IID))
10272 return InstructionUniformity::AlwaysUniform;
10273
10274 switch (IID) {
10275 case Intrinsic::amdgcn_addrspacecast_nonnull:
10276 return HandleAddrSpaceCast(MI);
10277 case Intrinsic::amdgcn_if:
10278 case Intrinsic::amdgcn_else:
10279 // FIXME: Uniform if second result
10280 break;
10281 }
10282
10283 return InstructionUniformity::Default;
10284 }
10285
10286 // Loads from the private and flat address spaces are divergent, because
10287 // threads can execute the load instruction with the same inputs and get
10288 // different results.
10289 //
10290 // All other loads are not divergent, because if threads issue loads with the
10291 // same arguments, they will always get the same result.
10292 if (opcode == AMDGPU::G_LOAD) {
10293 if (MI.memoperands_empty())
10294 return InstructionUniformity::NeverUniform; // conservative assumption
10295
10296 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10297 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10298 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10299 })) {
10300 // At least one MMO in a non-global address space.
10301 return InstructionUniformity::NeverUniform;
10302 }
10303 return InstructionUniformity::Default;
10304 }
10305
10306 if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
10307 opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10308 opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10309 AMDGPU::isGenericAtomic(opcode)) {
10310 return InstructionUniformity::NeverUniform;
10311 }
10312 return InstructionUniformity::Default;
10313}
10314
10315InstructionUniformity
10316SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
10317
10318 if (isNeverUniform(MI))
10319 return InstructionUniformity::NeverUniform;
10320
10321 unsigned opcode = MI.getOpcode();
10322 if (opcode == AMDGPU::V_READLANE_B32 ||
10323 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10324 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10325 return InstructionUniformity::AlwaysUniform;
10326
10327 if (isCopyInstr(MI)) {
10328 const MachineOperand &srcOp = MI.getOperand(1);
10329 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10330 const TargetRegisterClass *regClass =
10331 RI.getPhysRegBaseClass(srcOp.getReg());
10332 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10333 : InstructionUniformity::NeverUniform;
10334 }
10335 return InstructionUniformity::Default;
10336 }
10337
10338 // GMIR handling
10339 if (MI.isPreISelOpcode())
10340 return getGenericInstructionUniformity(MI);
10341
10342 // Atomics are divergent because they are executed sequentially: when an
10343 // atomic operation refers to the same address in each thread, then each
10344 // thread after the first sees the value written by the previous thread as
10345 // its original value.
10346
10347 if (isAtomic(MI))
10348 return InstructionUniformity::NeverUniform;
10349
10350 // Loads from the private and flat address spaces are divergent, because
10351 // threads can execute the load instruction with the same inputs and get
10352 // different results.
10353 if (isFLAT(MI) && MI.mayLoad()) {
10354 if (MI.memoperands_empty())
10355 return InstructionUniformity::NeverUniform; // conservative assumption
10356
10357 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10358 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10359 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10360 })) {
10361 // At least one MMO in a non-global address space.
10362 return InstructionUniformity::NeverUniform;
10363 }
10364
10365 return InstructionUniformity::Default;
10366 }
10367
10368 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
10369 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10370
10371 // FIXME: It's conceptually broken to report this for an instruction, and not
10372 // a specific def operand. For inline asm in particular, there could be mixed
10373 // uniform and divergent results.
10374 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10375 const MachineOperand &SrcOp = MI.getOperand(I);
10376 if (!SrcOp.isReg())
10377 continue;
10378
10379 Register Reg = SrcOp.getReg();
10380 if (!Reg || !SrcOp.readsReg())
10381 continue;
10382
10383 // If RegBank is null, this is unassigned or an unallocatable special
10384 // register, which are all scalars.
10385 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10386 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10387 return InstructionUniformity::NeverUniform;
10388 }
10389
10390 // TODO: Uniformity check conditions above can be rearranged for more
10391 // readability
10392
10393 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10394 // currently turned into no-op COPYs by SelectionDAG ISel and are
10395 // therefore no longer recognizable.
10396
10397 return InstructionUniformity::Default;
10398}
10399
10400unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
10401 switch (MF.getFunction().getCallingConv()) {
10402 case CallingConv::AMDGPU_PS:
10403 return 1;
10404 case CallingConv::AMDGPU_VS:
10405 return 2;
10406 case CallingConv::AMDGPU_GS:
10407 return 3;
10408 case CallingConv::AMDGPU_HS:
10409 case CallingConv::AMDGPU_LS:
10410 case CallingConv::AMDGPU_ES: {
10411 const Function &F = MF.getFunction();
10412 F.getContext().diagnose(DiagnosticInfoUnsupported(
10413 F, "ds_ordered_count unsupported for this calling conv"));
10414 [[fallthrough]];
10415 }
10416 case CallingConv::AMDGPU_CS:
10417 case CallingConv::AMDGPU_KERNEL:
10418 case CallingConv::C:
10419 case CallingConv::Fast:
10420 default:
10421 // Assume other calling conventions are various compute callable functions
10422 return 0;
10423 }
10424}
10425
10427 Register &SrcReg2, int64_t &CmpMask,
10428 int64_t &CmpValue) const {
10429 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10430 return false;
10431
10432 switch (MI.getOpcode()) {
10433 default:
10434 break;
10435 case AMDGPU::S_CMP_EQ_U32:
10436 case AMDGPU::S_CMP_EQ_I32:
10437 case AMDGPU::S_CMP_LG_U32:
10438 case AMDGPU::S_CMP_LG_I32:
10439 case AMDGPU::S_CMP_LT_U32:
10440 case AMDGPU::S_CMP_LT_I32:
10441 case AMDGPU::S_CMP_GT_U32:
10442 case AMDGPU::S_CMP_GT_I32:
10443 case AMDGPU::S_CMP_LE_U32:
10444 case AMDGPU::S_CMP_LE_I32:
10445 case AMDGPU::S_CMP_GE_U32:
10446 case AMDGPU::S_CMP_GE_I32:
10447 case AMDGPU::S_CMP_EQ_U64:
10448 case AMDGPU::S_CMP_LG_U64:
10449 SrcReg = MI.getOperand(0).getReg();
10450 if (MI.getOperand(1).isReg()) {
10451 if (MI.getOperand(1).getSubReg())
10452 return false;
10453 SrcReg2 = MI.getOperand(1).getReg();
10454 CmpValue = 0;
10455 } else if (MI.getOperand(1).isImm()) {
10456 SrcReg2 = Register();
10457 CmpValue = MI.getOperand(1).getImm();
10458 } else {
10459 return false;
10460 }
10461 CmpMask = ~0;
10462 return true;
10463 case AMDGPU::S_CMPK_EQ_U32:
10464 case AMDGPU::S_CMPK_EQ_I32:
10465 case AMDGPU::S_CMPK_LG_U32:
10466 case AMDGPU::S_CMPK_LG_I32:
10467 case AMDGPU::S_CMPK_LT_U32:
10468 case AMDGPU::S_CMPK_LT_I32:
10469 case AMDGPU::S_CMPK_GT_U32:
10470 case AMDGPU::S_CMPK_GT_I32:
10471 case AMDGPU::S_CMPK_LE_U32:
10472 case AMDGPU::S_CMPK_LE_I32:
10473 case AMDGPU::S_CMPK_GE_U32:
10474 case AMDGPU::S_CMPK_GE_I32:
10475 SrcReg = MI.getOperand(0).getReg();
10476 SrcReg2 = Register();
10477 CmpValue = MI.getOperand(1).getImm();
10478 CmpMask = ~0;
10479 return true;
10480 }
10481
10482 return false;
10483}
10484
10486 Register SrcReg2, int64_t CmpMask,
10487 int64_t CmpValue,
10488 const MachineRegisterInfo *MRI) const {
10489 if (!SrcReg || SrcReg.isPhysical())
10490 return false;
10491
10492 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
10493 return false;
10494
10495 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
10496 this](int64_t ExpectedValue, unsigned SrcSize,
10497 bool IsReversible, bool IsSigned) -> bool {
10498 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10499 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10500 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10501 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10502 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
10503 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10504 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10505 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10506 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10507 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
10508 //
10509 // Signed ge/gt are not used for the sign bit.
10510 //
10511 // If result of the AND is unused except in the compare:
10512 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
10513 //
10514 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10515 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10516 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
10517 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10518 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10519 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
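 //
 // Editor's worked example (illustrative, not part of the upstream file;
 // register names are hypothetical): with n = 2 the pair
 //   %and:sreg_32 = S_AND_B32 %src, 4, implicit-def $scc
 //   S_CMP_LG_U32 %and, 0, implicit-def $scc
 // only needs SCC = ((%src & 4) != 0). The code below erases the compare and
 // revives the S_AND_B32's SCC def when %and has other uses; if %and is
 // otherwise unused, both instructions collapse into
 //   S_BITCMP1_B32 %src, 2
 // which sets SCC to bit 2 of %src.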
10520
10521 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
10522 if (!Def || Def->getParent() != CmpInstr.getParent())
10523 return false;
10524
10525 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
10526 Def->getOpcode() != AMDGPU::S_AND_B64)
10527 return false;
10528
10529 int64_t Mask;
10530 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
10531 if (MO->isImm())
10532 Mask = MO->getImm();
10533 else if (!getFoldableImm(MO, Mask))
10534 return false;
10535 Mask &= maxUIntN(SrcSize);
10536 return isPowerOf2_64(Mask);
10537 };
10538
10539 MachineOperand *SrcOp = &Def->getOperand(1);
10540 if (isMask(SrcOp))
10541 SrcOp = &Def->getOperand(2);
10542 else if (isMask(&Def->getOperand(2)))
10543 SrcOp = &Def->getOperand(1);
10544 else
10545 return false;
10546
10547 // A valid Mask is required to have a single bit set, hence a non-zero and
10548 // power-of-two value. This verifies that we will not do a 64-bit shift below.
10549 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
10550 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
10551 if (IsSigned && BitNo == SrcSize - 1)
10552 return false;
10553
10554 ExpectedValue <<= BitNo;
10555
10556 bool IsReversedCC = false;
10557 if (CmpValue != ExpectedValue) {
10558 if (!IsReversible)
10559 return false;
10560 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
10561 if (!IsReversedCC)
10562 return false;
10563 }
10564
10565 Register DefReg = Def->getOperand(0).getReg();
10566 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
10567 return false;
10568
10569 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
10570 I != E; ++I) {
10571 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
10572 I->killsRegister(AMDGPU::SCC, &RI))
10573 return false;
10574 }
10575
10576 MachineOperand *SccDef =
10577 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
10578 SccDef->setIsDead(false);
10579 CmpInstr.eraseFromParent();
10580
10581 if (!MRI->use_nodbg_empty(DefReg)) {
10582 assert(!IsReversedCC);
10583 return true;
10584 }
10585
10586 // Replace AND with unused result with a S_BITCMP.
10587 MachineBasicBlock *MBB = Def->getParent();
10588
10589 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
10590 : AMDGPU::S_BITCMP1_B32
10591 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
10592 : AMDGPU::S_BITCMP1_B64;
10593
10594 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
10595 .add(*SrcOp)
10596 .addImm(BitNo);
10597 Def->eraseFromParent();
10598
10599 return true;
10600 };
10601
10602 switch (CmpInstr.getOpcode()) {
10603 default:
10604 break;
10605 case AMDGPU::S_CMP_EQ_U32:
10606 case AMDGPU::S_CMP_EQ_I32:
10607 case AMDGPU::S_CMPK_EQ_U32:
10608 case AMDGPU::S_CMPK_EQ_I32:
10609 return optimizeCmpAnd(1, 32, true, false);
10610 case AMDGPU::S_CMP_GE_U32:
10611 case AMDGPU::S_CMPK_GE_U32:
10612 return optimizeCmpAnd(1, 32, false, false);
10613 case AMDGPU::S_CMP_GE_I32:
10614 case AMDGPU::S_CMPK_GE_I32:
10615 return optimizeCmpAnd(1, 32, false, true);
10616 case AMDGPU::S_CMP_EQ_U64:
10617 return optimizeCmpAnd(1, 64, true, false);
10618 case AMDGPU::S_CMP_LG_U32:
10619 case AMDGPU::S_CMP_LG_I32:
10620 case AMDGPU::S_CMPK_LG_U32:
10621 case AMDGPU::S_CMPK_LG_I32:
10622 return optimizeCmpAnd(0, 32, true, false);
10623 case AMDGPU::S_CMP_GT_U32:
10624 case AMDGPU::S_CMPK_GT_U32:
10625 return optimizeCmpAnd(0, 32, false, false);
10626 case AMDGPU::S_CMP_GT_I32:
10627 case AMDGPU::S_CMPK_GT_I32:
10628 return optimizeCmpAnd(0, 32, false, true);
10629 case AMDGPU::S_CMP_LG_U64:
10630 return optimizeCmpAnd(0, 64, true, false);
10631 }
10632
10633 return false;
10634}
10635
10637 AMDGPU::OpName OpName) const {
10638 if (!ST.needsAlignedVGPRs())
10639 return;
10640
10641 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
10642 if (OpNo < 0)
10643 return;
10644 MachineOperand &Op = MI.getOperand(OpNo);
10645 if (getOpSize(MI, OpNo) > 4)
10646 return;
10647
10648 // Add implicit aligned super-reg to force alignment on the data operand.
10649 const DebugLoc &DL = MI.getDebugLoc();
10650 MachineBasicBlock *BB = MI.getParent();
10652 Register DataReg = Op.getReg();
10653 bool IsAGPR = RI.isAGPR(MRI, DataReg);
10654 Register Undef = MRI.createVirtualRegister(
10655 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
10656 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
10657 Register NewVR =
10658 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
10659 : &AMDGPU::VReg_64_Align2RegClass);
10660 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
10661 .addReg(DataReg, 0, Op.getSubReg())
10662 .addImm(AMDGPU::sub0)
10663 .addReg(Undef)
10664 .addImm(AMDGPU::sub1);
10665 Op.setReg(NewVR);
10666 Op.setSubReg(AMDGPU::sub0);
10667 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
10668}
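// Editor's note (illustrative sketch, not part of the upstream file; register
// names are hypothetical): for a 32-bit VGPR data operand %v on a subtarget
// that requires even-aligned VGPR tuples, the rewrite above produces roughly
//   %undef:vgpr_32 = IMPLICIT_DEF
//   %pair:vreg_64_align2 = REG_SEQUENCE %v, %subreg.sub0, %undef, %subreg.sub1
// and the original operand becomes %pair.sub0 with an extra implicit use of
// %pair, which forces the register allocator to place %v at an even VGPR.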
10669
10671 if (isIGLP(*MI))
10672 return false;
10673
10675}
10676
10678 if (!isWMMA(MI) && !isSWMMAC(MI))
10679 return false;
10680
10681 if (AMDGPU::isGFX1250(ST))
10682 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
10683
10684 return true;
10685}
10686
10688 unsigned Opcode = MI.getOpcode();
10689
10690 if (AMDGPU::isGFX12Plus(ST))
10691 return isDOT(MI) || isXDLWMMA(MI);
10692
10693 if (!isMAI(MI) || isDGEMM(Opcode) ||
10694 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
10695 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
10696 return false;
10697
10698 if (!ST.hasGFX940Insts())
10699 return true;
10700
10701 return AMDGPU::getMAIIsGFX940XDL(Opcode);
10702}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register const TargetRegisterInfo * TRI
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MCInstrDesc &TID, unsigned RCID, bool IsAllocatable)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
Definition: SIInstrInfo.cpp:83
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
Definition: SIInstrInfo.cpp:74
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:480
#define LLVM_DEBUG(...)
Definition: Debug.h:119
bool hasBF16PackedInsts() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool has16BitInsts() const
bool hasInv2PiInlineImm() const
Class for arbitrary precision integers.
Definition: APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1562
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:150
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:142
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:359
bool useVGPRIndexMode() const
bool hasSDWAOmod() const
Definition: GCNSubtarget.h:800
bool hasFlatGVSMode() const
bool hasA16() const
bool hasSDWAScalar() const
Definition: GCNSubtarget.h:804
bool hasScalarCompareEq64() const
bool hasOnlyRevVALUShifts() const
Definition: GCNSubtarget.h:434
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:678
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:820
bool hasMAIInsts() const
Definition: GCNSubtarget.h:878
bool hasFmaakFmamkF64Insts() const
bool hasScaleOffset() const
bool hasMFMAInlineLiteralBug() const
bool hasNegativeScratchOffsetBug() const
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasVALUMaskWriteHazard() const
bool hasGFX1250Insts() const
bool hasPkMovB32() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasR128A16() const
bool hasOffset3fBug() const
bool hasGetPCZeroExtension() const
bool hasAddPC64Inst() const
bool hasGloballyAddressableScratch() const
const AMDGPURegisterBankInfo * getRegBankInfo() const override
Definition: GCNSubtarget.h:342
bool has64BitLiterals() const
bool hasSDWAOutModsVOPC() const
Definition: GCNSubtarget.h:816
bool hasRestrictedSOffset() const
bool hasFlatSegmentOffsetBug() const
Definition: GCNSubtarget.h:735
bool hasGFX940Insts() const
bool hasSDWASdst() const
Definition: GCNSubtarget.h:808
bool hasVALUReadSGPRHazard() const
bool hasMovB64() const
bool isWave32() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:383
bool hasNegativeUnalignedScratchOffsetBug() const
bool hasG16() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasNoF16PseudoScalarTransInlineConstants() const
bool hasVectorMulU64() const
Generation getGeneration() const
Definition: GCNSubtarget.h:356
bool hasVOP3Literal() const
Definition: GCNSubtarget.h:995
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:787
bool hasAddr64() const
Definition: GCNSubtarget.h:424
bool isWave64() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:779
bool hasGDS() const
bool hasPartialNSAEncoding() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
A possibly irreducible generalization of a Loop.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
Definition: LowLevelType.h:271
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
Definition: LiveInterval.h:690
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
Definition: LiveInterval.h:158
LLVM_ABI iterator find(SlotIndex Pos)
find - Return an iterator pointing to the first segment that ends after Pos, or end().
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:199
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition: MCInstrDesc.h:238
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:240
bool mayStore() const
Return true if this instruction could possibly modify memory.
Definition: MCInstrDesc.h:446
bool mayLoad() const
Return true if this instruction could possibly read memory.
Definition: MCInstrDesc.h:440
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
Definition: MCInstrDesc.h:249
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
Definition: MCInstrDesc.h:607
unsigned short Opcode
Definition: MCInstrDesc.h:206
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
Definition: MCInstrDesc.h:567
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition: MCInstrDesc.h:98
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition: MCInstrDesc.h:92
bool isGenericType() const
Definition: MCInstrDesc.h:119
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition: MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition: MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
unsigned pred_size() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
reverse_iterator rend()
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< iterator > terminators()
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
Definition: MachineInstr.h:72
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:587
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:359
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
unsigned getNumOperands() const
Retuns the total number of operands.
Definition: MachineInstr.h:590
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
Definition: MachineInstr.h:702
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
Definition: MachineInstr.h:813
mop_range explicit_operands()
Definition: MachineInstr.h:696
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:798
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:780
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:511
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:595
uint32_t getFlags() const
Return the MI flags bitvector.
Definition: MachineInstr.h:404
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isImplicit() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
void enterBasicBlockEnd(MachineBasicBlock &MBB)
Start tracking liveness from the end of basic block MBB.
bool isRegUsed(Register Reg, bool includeReserved=true) const
Return if a specific register is currently used.
void setRegUsed(Register Reg, LaneBitmask LaneMask=LaneBitmask::getAll())
Tell the scavenger a register is used.
void backward()
Update internal register state and move MBB iterator backwards.
void enterBasicBlock(MachineBasicBlock &MBB)
Start tracking liveness from the begin of basic block MBB.
Register scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill=true)
Make a register of the specific register class available from the current position backwards to the p...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
Definition: RegisterBank.h:29
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:46
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition: Register.h:102
constexpr bool isValid() const
Definition: Register.h:107
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:74
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:78
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
static bool isMAI(const MachineInstr &MI)
Definition: SIInstrInfo.h:844
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:586
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
static bool isVOP3(const MachineInstr &MI)
Definition: SIInstrInfo.h:536
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
Definition: SIInstrInfo.h:982
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given.
Definition: SIInstrInfo.h:1237
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:504
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
Definition: SIInstrInfo.h:1017
static bool isFLATScratch(const MachineInstr &MI)
Definition: SIInstrInfo.h:668
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instructions opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:576
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool mayAccessScratchThroughFlat(const MachineInstr &MI) const
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:568
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
Definition: SIInstrInfo.h:889
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:701
static bool isSALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:440
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:528
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:544
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
Definition: SIInstrInfo.h:636
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
Definition: SIInstrInfo.h:857
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
Definition: SIInstrInfo.h:873
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:552
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
static bool isVIMAGE(const MachineInstr &MI)
Definition: SIInstrInfo.h:620
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:480
static bool isGWS(const MachineInstr &MI)
Definition: SIInstrInfo.h:602
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override
static bool isFLATGlobal(const MachineInstr &MI)
Definition: SIInstrInfo.h:660
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
Definition: SIInstrInfo.h:628
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isImage(const MachineInstr &MI)
Definition: SIInstrInfo.h:456
static bool isSOPK(const MachineInstr &MI)
Definition: SIInstrInfo.h:496
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of a s_trap 2 instructions for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
Definition: SIInstrInfo.h:1030
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:560
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:650
static bool isF16PseudoScalarTrans(unsigned Opcode)
Definition: SIInstrInfo.h:1001
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool isDPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:812
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is a instruction that moves/copies value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst to fix 16bit SALU to VALU lowering.
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
static bool isVGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:768
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
static bool isWWMRegSpillOpcode(uint16_t Opcode)
Definition: SIInstrInfo.h:800
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
Definition: SIInstrInfo.h:1389
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:612
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) const override
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
Definition: SIInstrInfo.h:733
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
static bool sopkIsZext(unsigned Opcode)
Definition: SIInstrInfo.h:921
static bool isSGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:780
static bool isWMMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:861
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:488
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:644
static bool isVALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:448
bool isBarrier(unsigned Opcode) const
Definition: SIInstrInfo.h:986
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified pseudo or native opcode.
Definition: SIInstrInfo.h:1403
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
Definition: SIInstrInfo.h:938
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
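For illustration, a minimal sketch of reading a named operand; the helper name and the choice of the offset operand are hypothetical, and it assumes the AMDGPU backend headers.
// Hedged sketch: getNamedOperand returns nullptr when the opcode has no such
// operand, so the lookup is safe across opcodes.
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;
static int64_t getOffsetOrZero(const SIInstrInfo &TII, MachineInstr &MI) {
  if (const MachineOperand *Off =
          TII.getNamedOperand(MI, AMDGPU::OpName::offset))
    return Off->isImm() ? Off->getImm() : 0;
  return 0;
}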
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were the operand at index OpIdx of MI.
static bool isLDSDMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:594
static bool isVOP1(const MachineInstr &MI)
Definition: SIInstrInfo.h:520
SIInstrInfo(const GCNSubtarget &ST)
Definition: SIInstrInfo.cpp:64
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
const TargetRegisterClass * getRegClass(unsigned RCID) const
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, unsigned SubIdx) const
Returns a register class which is compatible with SuperRC, such that a subregister exists with class SubRC with subregister index SubIdx.
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCPhysReg get32BitRegister(MCPhysReg Reg) const
const TargetRegisterClass * getProperlyAlignedRC(const TargetRegisterClass *RC) const
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const
static bool hasVectorRegisters(const TargetRegisterClass *RC)
const TargetRegisterClass * getEquivalentVGPRClass(const TargetRegisterClass *SRC) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool opCanUseInlineConstant(unsigned OpType) const
bool isVectorRegister(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const
bool opCanUseLiteralConstant(unsigned OpType) const
static bool hasVGPRs(const TargetRegisterClass *RC)
static bool isVGPRClass(const TargetRegisterClass *RC)
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
const TargetRegisterClass * getBoolRC() const
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getChannelFromSubReg(unsigned SubReg) const
MCRegister getVCC() const
bool isVectorSuperClass(const TargetRegisterClass *RC) const
static bool hasAGPRs(const TargetRegisterClass *RC)
const TargetRegisterClass * getWaveMaskRegClass() const
bool spillSGPRToVGPR() const
const TargetRegisterClass * getVGPR64Class() const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:586
HazardRecognizer - This determines whether or not an instruction can be issued this cycle, and whether or not a noop needs to be inserted to handle the hazard.
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
Definition: SlotIndexes.h:238
SlotIndexes pass.
Definition: SlotIndexes.h:298
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:532
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:283
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
Definition: SmallVector.h:574
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructions before register allocation.
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destination register in a target-specific manner.
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination register.
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destination register in a target-specific manner.
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target specify whether the instruction is actually trivially rematerializable.
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unmodeled side effects).
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
const TargetRegisterClass * getAllocatableClass(const TargetRegisterClass *RC) const
Return the maximal subclass of the given register class that is allocatable or NULL.
unsigned getSubRegIdxSize(unsigned Idx) const
Get the size of the bit range covered by a sub-register index.
unsigned getSubRegIdxOffset(unsigned Idx) const
Get the offset of the bit range covered by a sub-register index.
LLVM_ABI void init(const TargetSubtargetInfo *TSInfo, bool EnableSModel=true, bool EnableSItins=true)
Initialize the machine model for instruction scheduling.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:346
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
LLVM Value Representation.
Definition: Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:194
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:174
self_iterator getIterator()
Definition: ilist_node.h:134
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
Definition: SIInstrInfo.h:1677
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
Definition: SIInstrInfo.h:1678
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
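A minimal usage sketch, assuming the AMDGPU backend's Utils/AMDGPUBaseInfo.h; the wrapper function is hypothetical.
// Hedged sketch: checks whether a 32-bit value can be encoded as an AMDGPU
// inline constant (small integers in [-16, 64] and a few FP bit patterns).
#include "Utils/AMDGPUBaseInfo.h"
static bool fitsInlineConstant(int32_t Imm, bool HasInv2Pi) {
  return llvm::AMDGPU::isInlinableLiteral32(Imm, HasInv2Pi);
}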
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo)
Is this an AMDGPU specific source operand? These include registers, inline constants, literals and mandatory literals (KImm).
const uint64_t RSRC_TID_ENABLE
Definition: SIInstrInfo.h:1680
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating-point use.
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST)
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition: SIDefines.h:231
@ OPERAND_REG_IMM_INT64
Definition: SIDefines.h:202
@ OPERAND_REG_IMM_V2FP16
Definition: SIDefines.h:209
@ OPERAND_REG_INLINE_C_FP64
Definition: SIDefines.h:222
@ OPERAND_REG_INLINE_C_BF16
Definition: SIDefines.h:219
@ OPERAND_REG_INLINE_C_V2BF16
Definition: SIDefines.h:224
@ OPERAND_REG_IMM_V2INT16
Definition: SIDefines.h:210
@ OPERAND_REG_IMM_BF16
Definition: SIDefines.h:206
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition: SIDefines.h:201
@ OPERAND_REG_IMM_V2BF16
Definition: SIDefines.h:208
@ OPERAND_REG_IMM_FP16
Definition: SIDefines.h:207
@ OPERAND_REG_INLINE_C_INT64
Definition: SIDefines.h:218
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition: SIDefines.h:216
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition: SIDefines.h:211
@ OPERAND_REG_IMM_FP64
Definition: SIDefines.h:205
@ OPERAND_REG_INLINE_C_V2FP16
Definition: SIDefines.h:225
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition: SIDefines.h:236
@ OPERAND_REG_INLINE_AC_FP32
Definition: SIDefines.h:237
@ OPERAND_REG_IMM_V2INT32
Definition: SIDefines.h:212
@ OPERAND_SDWA_VOPC_DST
Definition: SIDefines.h:244
@ OPERAND_REG_IMM_FP32
Definition: SIDefines.h:204
@ OPERAND_INPUT_MODS
Definition: SIDefines.h:241
@ OPERAND_REG_INLINE_C_FP32
Definition: SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT32
Definition: SIDefines.h:217
@ OPERAND_REG_INLINE_C_V2INT16
Definition: SIDefines.h:223
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition: SIDefines.h:248
@ OPERAND_REG_IMM_V2FP32
Definition: SIDefines.h:213
@ OPERAND_REG_INLINE_AC_FP64
Definition: SIDefines.h:238
@ OPERAND_REG_INLINE_C_FP16
Definition: SIDefines.h:220
@ OPERAND_REG_IMM_INT16
Definition: SIDefines.h:203
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition: SIDefines.h:228
@ TI_SCRATCH_RSRC_DWORD1
Definition: AMDGPU.h:566
@ TI_SCRATCH_RSRC_DWORD3
Definition: AMDGPU.h:568
@ TI_SCRATCH_RSRC_DWORD0
Definition: AMDGPU.h:565
@ TI_SCRATCH_RSRC_DWORD2
Definition: AMDGPU.h:567
@ TI_CONSTDATA_START
Definition: AMDGPU.h:564
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool isGFX1250(const MCSubtargetInfo &STI)
int getMCOpcode(uint16_t Opcode, unsigned Gen)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
Definition: SIInstrInfo.h:1679
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tessellation and geometry are not in use, or otherwise copy shader if one is needed).
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ OPERAND_GENERIC_4
Definition: MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition: MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition: MCInstrDesc.h:68
@ OPERAND_REGISTER
Definition: MCInstrDesc.h:62
@ OPERAND_GENERIC_3
Definition: MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition: MCInstrDesc.h:61
@ OPERAND_MEMORY
Definition: MCInstrDesc.h:63
@ OPERAND_UNKNOWN
Definition: MCInstrDesc.h:60
@ OPERAND_GENERIC_0
Definition: MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition: MCInstrDesc.h:72
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:477
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions starting from FirstMI to LastMI exclusive.
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
Definition: SIInstrInfo.h:1563
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
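For illustration, a minimal sketch of the llvm/ADT/STLExtras.h range helpers referenced in this listing (all_of, any_of, is_contained, enumerate); the example function and values are hypothetical, and it uses only standard containers plus that header.
#include "llvm/ADT/STLExtras.h"
#include <vector>
static bool rangeHelperExamples() {
  std::vector<int> V = {2, 4, 6};
  // Range-based wrappers avoid spelling out begin()/end().
  bool AllEven = llvm::all_of(V, [](int X) { return X % 2 == 0; });
  bool AnyBig = llvm::any_of(V, [](int X) { return X > 5; });
  bool HasSix = llvm::is_contained(V, 6);
  for (const auto &E : llvm::enumerate(V)) {
    // E.index() is the zero-based position, E.value() the element.
    (void)E.index();
    (void)E.value();
  }
  return AllEven && AnyBig && HasSix;
}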
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:307
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition: MathExtras.h:216
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
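As a usage illustration only, a sketch of building a machine instruction with BuildMI and forwarding the source's kill state; it assumes the AMDGPU backend headers, and the opcode choice and wrapper function are illustrative.
// Hedged sketch: emits a 32-bit VGPR move at the given insertion point.
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;
static void emitVGPRMove(const SIInstrInfo &TII, MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator I, const DebugLoc &DL,
                         Register Dst, Register Src, bool KillSrc) {
  BuildMI(MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst)
      .addReg(Src, getKillRegState(KillSrc));
}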
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2491
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition: STLExtras.h:663
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew modulo Align.
Definition: MathExtras.h:551
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:293
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:157
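For illustration, a minimal sketch of the bit-manipulation helpers referenced in this listing (popcount, countr_zero, isPowerOf2_64, Log2_32, Hi_32, Lo_32); the function and values are hypothetical.
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
static void bitHelperExamples() {
  assert(llvm::popcount(0xF0u) == 4);      // four bits set
  assert(llvm::countr_zero(0x10u) == 4);   // trailing zero bits
  assert(llvm::isPowerOf2_64(64));         // power-of-two test
  assert(llvm::Log2_32(32) == 5);          // floor log base 2
  assert(llvm::Hi_32(0x1234567800000000ULL) == 0x12345678u);
  assert(llvm::Lo_32(0x00000000DEADBEEFULL) == 0xDEADBEEFu);
}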
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the start of the kernel to the load instruction.
Definition: SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:336
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
Definition: SPIRVUtils.cpp:976
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair, skipping copy-like instructions and subreg-to-subreg copies.
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:164
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:399
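For illustration, a minimal sketch of the remaining llvm/Support/MathExtras.h helpers listed here (maxUIntN, isIntN, alignDown, divideCeil); the function and values are hypothetical.
#include "llvm/Support/MathExtras.h"
#include <cassert>
static void mathHelperExamples() {
  assert(llvm::maxUIntN(13) == 8191);       // largest 13-bit unsigned value
  assert(llvm::isIntN(16, 32767));          // fits in a signed 16-bit field
  assert(!llvm::isIntN(16, 40000));         // does not fit
  assert(llvm::alignDown(37u, 8u) == 32u);  // round down to a multiple of 8
  assert(llvm::divideCeil(10u, 4u) == 3u);  // integer ceiling division
}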
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
Definition: TargetOpcodes.h:36
DWARFExpression::Operation Op
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition: SIInstrInfo.h:40
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:223
@ DS_Error
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:257
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1916
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition: Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
@ NeverUniform
The result values can never be assumed to be uniform.
@ Default
The result values are uniform if and only if all operands are uniform.
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:858
#define N
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition: APFloat.cpp:219
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Description of the encoding of one expression Op.
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
Definition: LiveVariables.h:84
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition: SIInstrInfo.h:52
MachineInstr * top() const
Definition: SIInstrInfo.h:57
bool empty() const
Definition: SIInstrInfo.h:67
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition: SIInstrInfo.h:76
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.