1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "GCNHazardRecognizer.h"
18#include "GCNSubtarget.h"
21#include "llvm/ADT/STLExtras.h"
32#include "llvm/IR/IntrinsicsAMDGPU.h"
33#include "llvm/MC/MCContext.h"
36
37using namespace llvm;
38
39#define DEBUG_TYPE "si-instr-info"
40
41#define GET_INSTRINFO_CTOR_DTOR
42#include "AMDGPUGenInstrInfo.inc"
43
44namespace llvm::AMDGPU {
45#define GET_D16ImageDimIntrinsics_IMPL
46#define GET_ImageDimIntrinsicTable_IMPL
47#define GET_RsrcIntrinsics_IMPL
48#include "AMDGPUGenSearchableTables.inc"
49} // namespace llvm::AMDGPU
50
51// Must be at least 4 to be able to branch over minimum unconditional branch
52// code. This is only for making it possible to write reasonably small tests for
53// long branches.
54static cl::opt<unsigned>
55BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
56 cl::desc("Restrict range of branch instructions (DEBUG)"));
57
59 "amdgpu-fix-16-bit-physreg-copies",
60 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
61 cl::init(true),
63
64SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
65 : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
66 RI(ST), ST(ST) {
67 SchedModel.init(&ST);
68}
69
70//===----------------------------------------------------------------------===//
71// TargetInstrInfo callbacks
72//===----------------------------------------------------------------------===//
73
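// Count the operands of \p Node, ignoring any trailing glue operands.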
74static unsigned getNumOperandsNoGlue(SDNode *Node) {
75 unsigned N = Node->getNumOperands();
76 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
77 --N;
78 return N;
79}
80
81/// Returns true if both nodes have the same value for the given
82/// operand \p OpName, or if both nodes do not have this operand.
83static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
84 AMDGPU::OpName OpName) {
85 unsigned Opc0 = N0->getMachineOpcode();
86 unsigned Opc1 = N1->getMachineOpcode();
87
88 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
89 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
90
91 if (Op0Idx == -1 && Op1Idx == -1)
92 return true;
93
94
95 if ((Op0Idx == -1 && Op1Idx != -1) ||
96 (Op1Idx == -1 && Op0Idx != -1))
97 return false;
98
99 // getNamedOperandIdx returns the index for the MachineInstr's operands,
100 // which includes the result as the first operand. We are indexing into the
101 // MachineSDNode's operands, so we need to skip the result operand to get
102 // the real index.
103 --Op0Idx;
104 --Op1Idx;
105
106 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
107}
108
109static bool canRemat(const MachineInstr &MI) {
110
111 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
112 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
113 SIInstrInfo::isSALU(MI))
114 return true;
115
116 if (SIInstrInfo::isSMRD(MI)) {
117 return !MI.memoperands_empty() &&
118 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
119 return MMO->isLoad() && MMO->isInvariant();
120 });
121 }
122
123 return false;
124}
125
126bool SIInstrInfo::isReallyTriviallyReMaterializable(
127 const MachineInstr &MI) const {
128
129 if (canRemat(MI)) {
130 // Normally a VALU use of exec would block rematerialization, but an
131 // implicit exec read is fine here since every VALU instruction has one.
132 // We want all of the generic logic for this apart from that restriction.
133
134 // Another potential implicit use is mode register. The core logic of
135 // the RA will not attempt rematerialization if mode is set anywhere
136 // in the function, otherwise it is safe since mode is not changed.
137
138 // Unlike the generic method, which does not allow rematerialization if
139 // there are virtual register uses, we do allow it; therefore this method
140 // covers SOP instructions as well.
141 if (!MI.hasImplicitDef() &&
142 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
143 !MI.mayRaiseFPException())
144 return true;
145 }
146
147 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
148}
149
150// Returns true if the scalar result of a VALU instruction depends on exec.
151bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
152 // Ignore comparisons which are only used masked with exec.
153 // This allows some hoisting/sinking of VALU comparisons.
154 if (MI.isCompare()) {
155 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
156 if (!Dst)
157 return true;
158
159 Register DstReg = Dst->getReg();
160 if (!DstReg.isVirtual())
161 return true;
162
163 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
164 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
165 switch (Use.getOpcode()) {
166 case AMDGPU::S_AND_SAVEEXEC_B32:
167 case AMDGPU::S_AND_SAVEEXEC_B64:
168 break;
169 case AMDGPU::S_AND_B32:
170 case AMDGPU::S_AND_B64:
171 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
172 return true;
173 break;
174 default:
175 return true;
176 }
177 }
178 return false;
179 }
180
181 switch (MI.getOpcode()) {
182 default:
183 break;
184 case AMDGPU::V_READFIRSTLANE_B32:
185 return true;
186 }
187
188 return false;
189}
190
191bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
192 // Any implicit use of exec by VALU is not a real register read.
193 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
194 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
195}
196
197bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
198 MachineBasicBlock *SuccToSinkTo,
199 MachineCycleInfo *CI) const {
200 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
201 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
202 return true;
203
204 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
205 // Check if sinking of MI would create temporal divergent use.
206 for (auto Op : MI.uses()) {
207 if (Op.isReg() && Op.getReg().isVirtual() &&
208 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
209 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
210
211 // SgprDef defined inside cycle
212 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
213 if (FromCycle == nullptr)
214 continue;
215
216 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
217 // Check if there is a FromCycle that contains SgprDef's basic block but
218 // does not contain SuccToSinkTo and also has divergent exit condition.
219 while (FromCycle && !FromCycle->contains(ToCycle)) {
220 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
221 FromCycle->getExitingBlocks(ExitingBlocks);
222
223 // FromCycle has divergent exit condition.
224 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
225 if (hasDivergentBranch(ExitingBlock))
226 return false;
227 }
228
229 FromCycle = FromCycle->getParentCycle();
230 }
231 }
232 }
233
234 return true;
235}
236
237bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
238 int64_t &Offset0,
239 int64_t &Offset1) const {
240 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
241 return false;
242
243 unsigned Opc0 = Load0->getMachineOpcode();
244 unsigned Opc1 = Load1->getMachineOpcode();
245
246 // Make sure both are actually loads.
247 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
248 return false;
249
250 // A mayLoad instruction without a def is not a load. Likely a prefetch.
251 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
252 return false;
253
254 if (isDS(Opc0) && isDS(Opc1)) {
255
256 // FIXME: Handle this case:
257 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
258 return false;
259
260 // Check base reg.
261 if (Load0->getOperand(0) != Load1->getOperand(0))
262 return false;
263
264 // Skip read2 / write2 variants for simplicity.
265 // TODO: We should report true if the used offsets are adjacent (excluding
266 // st64 versions).
267 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
268 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
269 if (Offset0Idx == -1 || Offset1Idx == -1)
270 return false;
271
272 // XXX - be careful of dataless loads
273 // getNamedOperandIdx returns the index for MachineInstrs. Since they
274 // include the output in the operand list, but SDNodes don't, we need to
275 // subtract the index by one.
276 Offset0Idx -= get(Opc0).NumDefs;
277 Offset1Idx -= get(Opc1).NumDefs;
278 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
279 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
280 return true;
281 }
282
283 if (isSMRD(Opc0) && isSMRD(Opc1)) {
284 // Skip time and cache invalidation instructions.
285 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
286 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
287 return false;
288
289 unsigned NumOps = getNumOperandsNoGlue(Load0);
290 if (NumOps != getNumOperandsNoGlue(Load1))
291 return false;
292
293 // Check base reg.
294 if (Load0->getOperand(0) != Load1->getOperand(0))
295 return false;
296
297 // Match register offsets, if both register and immediate offsets present.
298 assert(NumOps == 4 || NumOps == 5);
299 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
300 return false;
301
302 const ConstantSDNode *Load0Offset =
303 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
304 const ConstantSDNode *Load1Offset =
305 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
306
307 if (!Load0Offset || !Load1Offset)
308 return false;
309
310 Offset0 = Load0Offset->getZExtValue();
311 Offset1 = Load1Offset->getZExtValue();
312 return true;
313 }
314
315 // MUBUF and MTBUF can access the same addresses.
316 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
317
318 // MUBUF and MTBUF have vaddr at different indices.
319 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
320 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
321 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
322 return false;
323
324 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
325 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
326
327 if (OffIdx0 == -1 || OffIdx1 == -1)
328 return false;
329
330 // getNamedOperandIdx returns the index for MachineInstrs. Since they
331 // include the output in the operand list, but SDNodes don't, we need to
332 // subtract the index by one.
333 OffIdx0 -= get(Opc0).NumDefs;
334 OffIdx1 -= get(Opc1).NumDefs;
335
336 SDValue Off0 = Load0->getOperand(OffIdx0);
337 SDValue Off1 = Load1->getOperand(OffIdx1);
338
339 // The offset might be a FrameIndexSDNode.
340 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
341 return false;
342
343 Offset0 = Off0->getAsZExtVal();
344 Offset1 = Off1->getAsZExtVal();
345 return true;
346 }
347
348 return false;
349}
350
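// DS read2st64/write2st64 opcodes scale their two offsets by a stride of 64
// elements instead of 1.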
351static bool isStride64(unsigned Opc) {
352 switch (Opc) {
353 case AMDGPU::DS_READ2ST64_B32:
354 case AMDGPU::DS_READ2ST64_B64:
355 case AMDGPU::DS_WRITE2ST64_B32:
356 case AMDGPU::DS_WRITE2ST64_B64:
357 return true;
358 default:
359 return false;
360 }
361}
362
363bool SIInstrInfo::getMemOperandsWithOffsetWidth(
364 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
365 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
366 const TargetRegisterInfo *TRI) const {
367 if (!LdSt.mayLoadOrStore())
368 return false;
369
370 unsigned Opc = LdSt.getOpcode();
371 OffsetIsScalable = false;
372 const MachineOperand *BaseOp, *OffsetOp;
373 int DataOpIdx;
374
375 if (isDS(LdSt)) {
376 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
377 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
378 if (OffsetOp) {
379 // Normal, single offset LDS instruction.
380 if (!BaseOp) {
381 // DS_CONSUME/DS_APPEND use M0 for the base address.
382 // TODO: find the implicit use operand for M0 and use that as BaseOp?
383 return false;
384 }
385 BaseOps.push_back(BaseOp);
386 Offset = OffsetOp->getImm();
387 // Get appropriate operand, and compute width accordingly.
388 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
389 if (DataOpIdx == -1)
390 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
391 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
392 Width = LocationSize::precise(64);
393 else
394 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
395 } else {
396 // The 2 offset instructions use offset0 and offset1 instead. We can treat
397 // these as a load with a single offset if the 2 offsets are consecutive.
398 // We will use this for some partially aligned loads.
399 const MachineOperand *Offset0Op =
400 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
401 const MachineOperand *Offset1Op =
402 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
403
404 unsigned Offset0 = Offset0Op->getImm() & 0xff;
405 unsigned Offset1 = Offset1Op->getImm() & 0xff;
406 if (Offset0 + 1 != Offset1)
407 return false;
408
409 // Each of these offsets is in element sized units, so we need to convert
410 // to bytes of the individual reads.
411
412 unsigned EltSize;
413 if (LdSt.mayLoad())
414 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
415 else {
416 assert(LdSt.mayStore());
417 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
418 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
419 }
420
421 if (isStride64(Opc))
422 EltSize *= 64;
423
424 BaseOps.push_back(BaseOp);
425 Offset = EltSize * Offset0;
426 // Get appropriate operand(s), and compute width accordingly.
427 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
428 if (DataOpIdx == -1) {
429 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
430 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
431 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
432 Width = LocationSize::precise(
433 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
434 } else {
435 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
436 }
437 }
438 return true;
439 }
440
441 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
442 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
443 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
444 return false;
445 BaseOps.push_back(RSrc);
446 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
447 if (BaseOp && !BaseOp->isFI())
448 BaseOps.push_back(BaseOp);
449 const MachineOperand *OffsetImm =
450 getNamedOperand(LdSt, AMDGPU::OpName::offset);
451 Offset = OffsetImm->getImm();
452 const MachineOperand *SOffset =
453 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
454 if (SOffset) {
455 if (SOffset->isReg())
456 BaseOps.push_back(SOffset);
457 else
458 Offset += SOffset->getImm();
459 }
460 // Get appropriate operand, and compute width accordingly.
461 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
462 if (DataOpIdx == -1)
463 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
464 if (DataOpIdx == -1) // LDS DMA
465 return false;
466 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
467 return true;
468 }
469
470 if (isImage(LdSt)) {
471 auto RsrcOpName =
472 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
473 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
474 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
475 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
476 if (VAddr0Idx >= 0) {
477 // GFX10 possible NSA encoding.
478 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
479 BaseOps.push_back(&LdSt.getOperand(I));
480 } else {
481 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
482 }
483 Offset = 0;
484 // Get appropriate operand, and compute width accordingly.
485 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
486 if (DataOpIdx == -1)
487 return false; // no return sampler
488 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
489 return true;
490 }
491
492 if (isSMRD(LdSt)) {
493 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
494 if (!BaseOp) // e.g. S_MEMTIME
495 return false;
496 BaseOps.push_back(BaseOp);
497 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
498 Offset = OffsetOp ? OffsetOp->getImm() : 0;
499 // Get appropriate operand, and compute width accordingly.
500 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
501 if (DataOpIdx == -1)
502 return false;
503 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
504 return true;
505 }
506
507 if (isFLAT(LdSt)) {
508 // Instructions have either vaddr or saddr or both or none.
509 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
510 if (BaseOp)
511 BaseOps.push_back(BaseOp);
512 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
513 if (BaseOp)
514 BaseOps.push_back(BaseOp);
515 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
516 // Get appropriate operand, and compute width accordingly.
517 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
518 if (DataOpIdx == -1)
519 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
520 if (DataOpIdx == -1) // LDS DMA
521 return false;
522 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
523 return true;
524 }
525
526 return false;
527}
528
529static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
530 ArrayRef<const MachineOperand *> BaseOps1,
531 const MachineInstr &MI2,
532 ArrayRef<const MachineOperand *> BaseOps2) {
533 // Only examine the first "base" operand of each instruction, on the
534 // assumption that it represents the real base address of the memory access.
535 // Other operands are typically offsets or indices from this base address.
536 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
537 return true;
538
539 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
540 return false;
541
542 auto *MO1 = *MI1.memoperands_begin();
543 auto *MO2 = *MI2.memoperands_begin();
544 if (MO1->getAddrSpace() != MO2->getAddrSpace())
545 return false;
546
547 const auto *Base1 = MO1->getValue();
548 const auto *Base2 = MO2->getValue();
549 if (!Base1 || !Base2)
550 return false;
551 Base1 = getUnderlyingObject(Base1);
552 Base2 = getUnderlyingObject(Base2);
553
554 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
555 return false;
556
557 return Base1 == Base2;
558}
559
560bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
561 int64_t Offset1, bool OffsetIsScalable1,
562 ArrayRef<const MachineOperand *> BaseOps2,
563 int64_t Offset2, bool OffsetIsScalable2,
564 unsigned ClusterSize,
565 unsigned NumBytes) const {
566 // If the mem ops (to be clustered) do not have the same base ptr, then they
567 // should not be clustered
568 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
569 if (!BaseOps1.empty() && !BaseOps2.empty()) {
570 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
571 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
572 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
573 return false;
574
575 const SIMachineFunctionInfo *MFI =
576 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
577 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
578 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
579 // If only one base op is empty, they do not have the same base ptr
580 return false;
581 }
582
583 // To avoid register pressure, the average number of DWORDS loaded
584 // together by all clustered mem ops should not exceed
585 // MaxMemoryClusterDWords. This is an empirical value based on certain
586 // observations and performance related experiments.
587 // The good thing about this heuristic is that it avoids clustering too many
588 // sub-word loads as well as clustering of wide loads. Below is a brief
589 // summary of how the heuristic behaves for various `LoadSize` values when
590 // MaxMemoryClusterDWords is 8.
591 //
592 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
593 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
594 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
595 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
596 // (5) LoadSize >= 17: do not cluster
597 const unsigned LoadSize = NumBytes / ClusterSize;
598 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
599 return NumDWords <= MaxMemoryClusterDWords;
600}
601
602// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
603// the first 16 loads will be interleaved with the stores, and the next 16 will
604 // be clustered as expected. It should really split into two batches of 16 stores.
605//
606// Loads are clustered until this returns false, rather than trying to schedule
607// groups of stores. This also means we have to deal with saying different
608// address space loads should be clustered, and ones which might cause bank
609// conflicts.
610//
611// This might be deprecated so it might not be worth that much effort to fix.
612bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
613 int64_t Offset0, int64_t Offset1,
614 unsigned NumLoads) const {
615 assert(Offset1 > Offset0 &&
616 "Second offset should be larger than first offset!");
617 // If we have less than 16 loads in a row, and the offsets are within 64
618 // bytes, then schedule together.
619
620 // A cacheline is 64 bytes (for global memory).
621 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
622}
623
624static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
625 MachineBasicBlock::iterator MI,
626 const DebugLoc &DL, MCRegister DestReg,
627 MCRegister SrcReg, bool KillSrc,
628 const char *Msg = "illegal VGPR to SGPR copy") {
629 MachineFunction *MF = MBB.getParent();
630
631 LLVMContext &C = MF->getFunction().getContext();
632 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
633
634 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
635 .addReg(SrcReg, getKillRegState(KillSrc));
636}
637
638/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
639/// possible to have a direct copy in these cases on GFX908, so an intermediate
640/// VGPR copy is required.
641static void indirectCopyToAGPR(const SIInstrInfo &TII,
642 MachineBasicBlock &MBB,
643 MachineBasicBlock::iterator MI,
644 const DebugLoc &DL, MCRegister DestReg,
645 MCRegister SrcReg, bool KillSrc,
646 RegScavenger &RS, bool RegsOverlap,
647 Register ImpDefSuperReg = Register(),
648 Register ImpUseSuperReg = Register()) {
649 assert((TII.getSubtarget().hasMAIInsts() &&
650 !TII.getSubtarget().hasGFX90AInsts()) &&
651 "Expected GFX908 subtarget.");
652
653 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
654 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
655 "Source register of the copy should be either an SGPR or an AGPR.");
656
657 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
658 "Destination register of the copy should be an AGPR.");
659
660 const SIRegisterInfo &RI = TII.getRegisterInfo();
661
662 // First try to find defining accvgpr_write to avoid temporary registers.
663 // In the case of copies of overlapping AGPRs, we conservatively do not
664 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
665 // an accvgpr_write used for this same copy due to implicit-defs
666 if (!RegsOverlap) {
667 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
668 --Def;
669
670 if (!Def->modifiesRegister(SrcReg, &RI))
671 continue;
672
673 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
674 Def->getOperand(0).getReg() != SrcReg)
675 break;
676
677 MachineOperand &DefOp = Def->getOperand(1);
678 assert(DefOp.isReg() || DefOp.isImm());
679
680 if (DefOp.isReg()) {
681 bool SafeToPropagate = true;
682 // Check that register source operand is not clobbered before MI.
683 // Immediate operands are always safe to propagate.
684 for (auto I = Def; I != MI && SafeToPropagate; ++I)
685 if (I->modifiesRegister(DefOp.getReg(), &RI))
686 SafeToPropagate = false;
687
688 if (!SafeToPropagate)
689 break;
690
691 for (auto I = Def; I != MI; ++I)
692 I->clearRegisterKills(DefOp.getReg(), &RI);
693 }
694
695 MachineInstrBuilder Builder =
696 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
697 .add(DefOp);
698 if (ImpDefSuperReg)
699 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
700
701 if (ImpUseSuperReg) {
702 Builder.addReg(ImpUseSuperReg,
703 getKillRegState(KillSrc) | RegState::Implicit);
704 }
705
706 return;
707 }
708 }
709
710 RS.enterBasicBlockEnd(MBB);
711 RS.backward(std::next(MI));
712
713 // Ideally we want to have three registers for a long reg_sequence copy
714 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
715 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
716 *MBB.getParent());
717
718 // Registers in the sequence are allocated contiguously so we can just
719 // use register number to pick one of three round-robin temps.
720 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
721 Register Tmp =
722 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
723 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
724 "VGPR used for an intermediate copy should have been reserved.");
725
726 // Only loop through if there are any free registers left. We don't want to
727 // spill.
728 while (RegNo--) {
729 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
730 /* RestoreAfter */ false, 0,
731 /* AllowSpill */ false);
732 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
733 break;
734 Tmp = Tmp2;
735 RS.setRegUsed(Tmp);
736 }
737
738 // Insert copy to temporary VGPR.
739 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
740 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
741 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
742 } else {
743 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
744 }
745
746 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
747 .addReg(SrcReg, getKillRegState(KillSrc));
748 if (ImpUseSuperReg) {
749 UseBuilder.addReg(ImpUseSuperReg,
750 getKillRegState(KillSrc) | RegState::Implicit);
751 }
752
753 MachineInstrBuilder DefBuilder
754 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
755 .addReg(Tmp, RegState::Kill);
756
757 if (ImpDefSuperReg)
758 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
759}
760
761static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
762 MachineBasicBlock::iterator MI, const DebugLoc &DL,
763 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
764 const TargetRegisterClass *RC, bool Forward) {
765 const SIRegisterInfo &RI = TII.getRegisterInfo();
766 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
767 MachineBasicBlock::iterator I = MI;
768 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
769
770 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
771 int16_t SubIdx = BaseIndices[Idx];
772 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
773 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
774 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
775 unsigned Opcode = AMDGPU::S_MOV_B32;
776
777 // Is SGPR aligned? If so try to combine with next.
778 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
779 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
780 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
781 // Can use SGPR64 copy
782 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
783 SubIdx = RI.getSubRegFromChannel(Channel, 2);
784 DestSubReg = RI.getSubReg(DestReg, SubIdx);
785 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
786 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
787 Opcode = AMDGPU::S_MOV_B64;
788 Idx++;
789 }
790
791 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
792 .addReg(SrcSubReg)
793 .addReg(SrcReg, RegState::Implicit);
794
795 if (!FirstMI)
796 FirstMI = LastMI;
797
798 if (!Forward)
799 I--;
800 }
801
802 assert(FirstMI && LastMI);
803 if (!Forward)
804 std::swap(FirstMI, LastMI);
805
806 FirstMI->addOperand(
807 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
808
809 if (KillSrc)
810 LastMI->addRegisterKilled(SrcReg, &RI);
811}
812
813void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
814 MachineBasicBlock::iterator MI,
815 const DebugLoc &DL, Register DestReg,
816 Register SrcReg, bool KillSrc, bool RenamableDest,
817 bool RenamableSrc) const {
818 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
819 unsigned Size = RI.getRegSizeInBits(*RC);
820 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
821 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
822
823 // The rest of copyPhysReg assumes Src and Dst size are the same size.
824 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
825 // we remove Fix16BitCopies and this code block?
826 if (Fix16BitCopies) {
827 if (((Size == 16) != (SrcSize == 16))) {
828 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
829 assert(ST.useRealTrue16Insts());
830 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
831 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
832 RegToFix = SubReg;
833
834 if (DestReg == SrcReg) {
835 // Identity copy. Insert empty bundle since ExpandPostRA expects an
836 // instruction here.
837 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
838 return;
839 }
840 RC = RI.getPhysRegBaseClass(DestReg);
841 Size = RI.getRegSizeInBits(*RC);
842 SrcRC = RI.getPhysRegBaseClass(SrcReg);
843 SrcSize = RI.getRegSizeInBits(*SrcRC);
844 }
845 }
846
847 if (RC == &AMDGPU::VGPR_32RegClass) {
848 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
849 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
850 AMDGPU::AGPR_32RegClass.contains(SrcReg));
851 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
852 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
853 BuildMI(MBB, MI, DL, get(Opc), DestReg)
854 .addReg(SrcReg, getKillRegState(KillSrc));
855 return;
856 }
857
858 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
859 RC == &AMDGPU::SReg_32RegClass) {
860 if (SrcReg == AMDGPU::SCC) {
861 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
862 .addImm(1)
863 .addImm(0);
864 return;
865 }
866
867 if (DestReg == AMDGPU::VCC_LO) {
868 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
869 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
870 .addReg(SrcReg, getKillRegState(KillSrc));
871 } else {
872 // FIXME: Hack until VReg_1 removed.
873 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
874 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
875 .addImm(0)
876 .addReg(SrcReg, getKillRegState(KillSrc));
877 }
878
879 return;
880 }
881
882 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
883 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
884 return;
885 }
886
887 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
888 .addReg(SrcReg, getKillRegState(KillSrc));
889 return;
890 }
891
892 if (RC == &AMDGPU::SReg_64RegClass) {
893 if (SrcReg == AMDGPU::SCC) {
894 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
895 .addImm(1)
896 .addImm(0);
897 return;
898 }
899
900 if (DestReg == AMDGPU::VCC) {
901 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
902 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
903 .addReg(SrcReg, getKillRegState(KillSrc));
904 } else {
905 // FIXME: Hack until VReg_1 removed.
906 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
907 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
908 .addImm(0)
909 .addReg(SrcReg, getKillRegState(KillSrc));
910 }
911
912 return;
913 }
914
915 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
916 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
917 return;
918 }
919
920 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
921 .addReg(SrcReg, getKillRegState(KillSrc));
922 return;
923 }
924
925 if (DestReg == AMDGPU::SCC) {
926 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
927 // but SelectionDAG emits such copies for i1 sources.
928 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
929 // This copy can only be produced by patterns
930 // with explicit SCC, which are known to be enabled
931 // only for subtargets with S_CMP_LG_U64 present.
932 assert(ST.hasScalarCompareEq64());
933 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
934 .addReg(SrcReg, getKillRegState(KillSrc))
935 .addImm(0);
936 } else {
937 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
938 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
939 .addReg(SrcReg, getKillRegState(KillSrc))
940 .addImm(0);
941 }
942
943 return;
944 }
945
946 if (RC == &AMDGPU::AGPR_32RegClass) {
947 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
948 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
949 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
950 .addReg(SrcReg, getKillRegState(KillSrc));
951 return;
952 }
953
954 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
955 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
956 .addReg(SrcReg, getKillRegState(KillSrc));
957 return;
958 }
959
960 // FIXME: Pass should maintain scavenger to avoid scan through the block on
961 // every AGPR spill.
962 RegScavenger RS;
963 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
964 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
965 return;
966 }
967
968 if (Size == 16) {
969 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
970 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
971 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
972
973 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
974 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
975 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
976 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
977 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
978 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
979 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
980 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
981
982 if (IsSGPRDst) {
983 if (!IsSGPRSrc) {
984 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
985 return;
986 }
987
988 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
989 .addReg(NewSrcReg, getKillRegState(KillSrc));
990 return;
991 }
992
993 if (IsAGPRDst || IsAGPRSrc) {
994 if (!DstLow || !SrcLow) {
995 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
996 "Cannot use hi16 subreg with an AGPR!");
997 }
998
999 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
1000 return;
1001 }
1002
1003 if (ST.useRealTrue16Insts()) {
1004 if (IsSGPRSrc) {
1005 assert(SrcLow);
1006 SrcReg = NewSrcReg;
1007 }
1008 // Use the smaller instruction encoding if possible.
1009 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
1010 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1011 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1012 .addReg(SrcReg);
1013 } else {
1014 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1015 .addImm(0) // src0_modifiers
1016 .addReg(SrcReg)
1017 .addImm(0); // op_sel
1018 }
1019 return;
1020 }
1021
1022 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1023 if (!DstLow || !SrcLow) {
1024 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1025 "Cannot use hi16 subreg on VI!");
1026 }
1027
1028 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1029 .addReg(NewSrcReg, getKillRegState(KillSrc));
1030 return;
1031 }
1032
1033 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1034 .addImm(0) // src0_modifiers
1035 .addReg(NewSrcReg)
1036 .addImm(0) // clamp
1037 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1038 : AMDGPU::SDWA::SdwaSel::WORD_1)
1039 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1040 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1041 : AMDGPU::SDWA::SdwaSel::WORD_1)
1042 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1043 // First implicit operand is $exec.
1044 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1045 return;
1046 }
1047
1048 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1049 if (ST.hasMovB64()) {
1050 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1051 .addReg(SrcReg, getKillRegState(KillSrc));
1052 return;
1053 }
1054 if (ST.hasPkMovB32()) {
1055 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1056 .addImm(SISrcMods::OP_SEL_1)
1057 .addReg(SrcReg)
1058 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1059 .addReg(SrcReg)
1060 .addImm(0) // op_sel_lo
1061 .addImm(0) // op_sel_hi
1062 .addImm(0) // neg_lo
1063 .addImm(0) // neg_hi
1064 .addImm(0) // clamp
1065 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1066 return;
1067 }
1068 }
1069
1070 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1071 if (RI.isSGPRClass(RC)) {
1072 if (!RI.isSGPRClass(SrcRC)) {
1073 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1074 return;
1075 }
1076 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1077 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1078 Forward);
1079 return;
1080 }
1081
1082 unsigned EltSize = 4;
1083 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1084 if (RI.isAGPRClass(RC)) {
1085 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1086 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1087 else if (RI.hasVGPRs(SrcRC) ||
1088 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1089 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1090 else
1091 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1092 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1093 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1094 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1095 (RI.isProperlyAlignedRC(*RC) &&
1096 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1097 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1098 if (ST.hasMovB64()) {
1099 Opcode = AMDGPU::V_MOV_B64_e32;
1100 EltSize = 8;
1101 } else if (ST.hasPkMovB32()) {
1102 Opcode = AMDGPU::V_PK_MOV_B32;
1103 EltSize = 8;
1104 }
1105 }
1106
1107 // For the cases where we need an intermediate instruction/temporary register
1108 // (destination is an AGPR), we need a scavenger.
1109 //
1110 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1111 // whole block for every handled copy.
1112 std::unique_ptr<RegScavenger> RS;
1113 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1114 RS = std::make_unique<RegScavenger>();
1115
1116 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1117
1118 // If there is an overlap, we can't kill the super-register on the last
1119 // instruction, since it will also kill the components made live by this def.
1120 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1121 const bool CanKillSuperReg = KillSrc && !Overlap;
1122
1123 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1124 unsigned SubIdx;
1125 if (Forward)
1126 SubIdx = SubIndices[Idx];
1127 else
1128 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1129 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1130 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1131 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1132
1133 bool IsFirstSubreg = Idx == 0;
1134 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1135
1136 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1137 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1138 Register ImpUseSuper = SrcReg;
1139 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1140 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1141 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1142 MachineInstrBuilder MIB =
1143 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1144 .addImm(SISrcMods::OP_SEL_1)
1145 .addReg(SrcSubReg)
1146 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1147 .addReg(SrcSubReg)
1148 .addImm(0) // op_sel_lo
1149 .addImm(0) // op_sel_hi
1150 .addImm(0) // neg_lo
1151 .addImm(0) // neg_hi
1152 .addImm(0) // clamp
1153 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1154 if (IsFirstSubreg)
1155 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1156 } else {
1157 MachineInstrBuilder Builder =
1158 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1159 if (IsFirstSubreg)
1160 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1161
1162 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1163 }
1164 }
1165}
1166
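// Map an opcode to its commuted counterpart (original <-> REV form). Returns
// -1 if the counterpart is not selectable on this subtarget, and the input
// opcode unchanged if no commuted form exists.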
1167int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1168 int NewOpc;
1169
1170 // Try to map original to commuted opcode
1171 NewOpc = AMDGPU::getCommuteRev(Opcode);
1172 if (NewOpc != -1)
1173 // Check if the commuted (REV) opcode exists on the target.
1174 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1175
1176 // Try to map commuted to original opcode
1177 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1178 if (NewOpc != -1)
1179 // Check if the original (non-REV) opcode exists on the target.
1180 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1181
1182 return Opcode;
1183}
1184
1185const TargetRegisterClass *
1187 return &AMDGPU::VGPR_32RegClass;
1188}
1189
1190void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1191 MachineBasicBlock::iterator I,
1192 const DebugLoc &DL, Register DstReg,
1193 ArrayRef<MachineOperand> Cond,
1194 Register TrueReg,
1195 Register FalseReg) const {
1196 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1197 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1198 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1199 "Not a VGPR32 reg");
1200
1201 if (Cond.size() == 1) {
1202 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1203 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1204 .add(Cond[0]);
1205 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1206 .addImm(0)
1207 .addReg(FalseReg)
1208 .addImm(0)
1209 .addReg(TrueReg)
1210 .addReg(SReg);
1211 } else if (Cond.size() == 2) {
1212 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1213 switch (Cond[0].getImm()) {
1214 case SIInstrInfo::SCC_TRUE: {
1215 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1216 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1217 : AMDGPU::S_CSELECT_B64), SReg)
1218 .addImm(1)
1219 .addImm(0);
1220 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1221 .addImm(0)
1222 .addReg(FalseReg)
1223 .addImm(0)
1224 .addReg(TrueReg)
1225 .addReg(SReg);
1226 break;
1227 }
1228 case SIInstrInfo::SCC_FALSE: {
1229 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1230 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1231 : AMDGPU::S_CSELECT_B64), SReg)
1232 .addImm(0)
1233 .addImm(1);
1234 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1235 .addImm(0)
1236 .addReg(FalseReg)
1237 .addImm(0)
1238 .addReg(TrueReg)
1239 .addReg(SReg);
1240 break;
1241 }
1242 case SIInstrInfo::VCCNZ: {
1243 MachineOperand RegOp = Cond[1];
1244 RegOp.setImplicit(false);
1245 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1246 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1247 .add(RegOp);
1248 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1249 .addImm(0)
1250 .addReg(FalseReg)
1251 .addImm(0)
1252 .addReg(TrueReg)
1253 .addReg(SReg);
1254 break;
1255 }
1256 case SIInstrInfo::VCCZ: {
1257 MachineOperand RegOp = Cond[1];
1258 RegOp.setImplicit(false);
1259 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1260 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1261 .add(RegOp);
1262 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1263 .addImm(0)
1264 .addReg(TrueReg)
1265 .addImm(0)
1266 .addReg(FalseReg)
1267 .addReg(SReg);
1268 break;
1269 }
1270 case SIInstrInfo::EXECNZ: {
1271 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1272 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1273 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1274 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1275 .addImm(0);
1276 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1277 : AMDGPU::S_CSELECT_B64), SReg)
1278 .addImm(1)
1279 .addImm(0);
1280 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1281 .addImm(0)
1282 .addReg(FalseReg)
1283 .addImm(0)
1284 .addReg(TrueReg)
1285 .addReg(SReg);
1286 break;
1287 }
1288 case SIInstrInfo::EXECZ: {
1289 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1290 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1291 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1292 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1293 .addImm(0);
1294 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1295 : AMDGPU::S_CSELECT_B64), SReg)
1296 .addImm(0)
1297 .addImm(1);
1298 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1299 .addImm(0)
1300 .addReg(FalseReg)
1301 .addImm(0)
1302 .addReg(TrueReg)
1303 .addReg(SReg);
1304 llvm_unreachable("Unhandled branch predicate EXECZ");
1305 break;
1306 }
1307 default:
1308 llvm_unreachable("invalid branch predicate");
1309 }
1310 } else {
1311 llvm_unreachable("Can only handle Cond size 1 or 2");
1312 }
1313}
1314
1315Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1316 MachineBasicBlock::iterator I,
1317 const DebugLoc &DL,
1318 Register SrcReg, int Value) const {
1319 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1320 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1321 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1322 .addImm(Value)
1323 .addReg(SrcReg);
1324
1325 return Reg;
1326}
1327
1328Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1329 MachineBasicBlock::iterator I,
1330 const DebugLoc &DL,
1331 Register SrcReg, int Value) const {
1332 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1333 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1334 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1335 .addImm(Value)
1336 .addReg(SrcReg);
1337
1338 return Reg;
1339}
1340
1341bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1342 const Register Reg,
1343 int64_t &ImmVal) const {
1344 switch (MI.getOpcode()) {
1345 case AMDGPU::V_MOV_B32_e32:
1346 case AMDGPU::S_MOV_B32:
1347 case AMDGPU::S_MOVK_I32:
1348 case AMDGPU::S_MOV_B64:
1349 case AMDGPU::V_MOV_B64_e32:
1350 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1351 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1352 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1353 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1354 case AMDGPU::V_MOV_B64_PSEUDO: {
1355 const MachineOperand &Src0 = MI.getOperand(1);
1356 if (Src0.isImm()) {
1357 ImmVal = Src0.getImm();
1358 return MI.getOperand(0).getReg() == Reg;
1359 }
1360
1361 return false;
1362 }
1363 case AMDGPU::S_BREV_B32:
1364 case AMDGPU::V_BFREV_B32_e32:
1365 case AMDGPU::V_BFREV_B32_e64: {
1366 const MachineOperand &Src0 = MI.getOperand(1);
1367 if (Src0.isImm()) {
1368 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1369 return MI.getOperand(0).getReg() == Reg;
1370 }
1371
1372 return false;
1373 }
1374 case AMDGPU::S_NOT_B32:
1375 case AMDGPU::V_NOT_B32_e32:
1376 case AMDGPU::V_NOT_B32_e64: {
1377 const MachineOperand &Src0 = MI.getOperand(1);
1378 if (Src0.isImm()) {
1379 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1380 return MI.getOperand(0).getReg() == Reg;
1381 }
1382
1383 return false;
1384 }
1385 default:
1386 return false;
1387 }
1388}
1389
1390unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1391
1392 if (RI.isAGPRClass(DstRC))
1393 return AMDGPU::COPY;
1394 if (RI.getRegSizeInBits(*DstRC) == 16) {
1395 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1396 // before RA.
1397 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1398 }
1399 if (RI.getRegSizeInBits(*DstRC) == 32)
1400 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1401 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1402 return AMDGPU::S_MOV_B64;
1403 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1404 return AMDGPU::V_MOV_B64_PSEUDO;
1405 return AMDGPU::COPY;
1406}
1407
1408const MCInstrDesc &
1409SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1410 bool IsIndirectSrc) const {
1411 if (IsIndirectSrc) {
1412 if (VecSize <= 32) // 4 bytes
1413 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1414 if (VecSize <= 64) // 8 bytes
1415 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1416 if (VecSize <= 96) // 12 bytes
1417 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1418 if (VecSize <= 128) // 16 bytes
1419 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1420 if (VecSize <= 160) // 20 bytes
1421 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1422 if (VecSize <= 256) // 32 bytes
1423 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1424 if (VecSize <= 288) // 36 bytes
1425 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1426 if (VecSize <= 320) // 40 bytes
1427 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1428 if (VecSize <= 352) // 44 bytes
1429 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1430 if (VecSize <= 384) // 48 bytes
1431 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1432 if (VecSize <= 512) // 64 bytes
1433 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1434 if (VecSize <= 1024) // 128 bytes
1435 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1436
1437 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1438 }
1439
1440 if (VecSize <= 32) // 4 bytes
1441 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1442 if (VecSize <= 64) // 8 bytes
1443 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1444 if (VecSize <= 96) // 12 bytes
1445 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1446 if (VecSize <= 128) // 16 bytes
1447 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1448 if (VecSize <= 160) // 20 bytes
1449 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1450 if (VecSize <= 256) // 32 bytes
1451 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1452 if (VecSize <= 288) // 36 bytes
1453 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1454 if (VecSize <= 320) // 40 bytes
1455 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1456 if (VecSize <= 352) // 44 bytes
1457 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1458 if (VecSize <= 384) // 48 bytes
1459 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1460 if (VecSize <= 512) // 64 bytes
1461 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1462 if (VecSize <= 1024) // 128 bytes
1463 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1464
1465 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1466}
1467
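// Pick the V_INDIRECT_REG_WRITE_MOVREL_B32 pseudo wide enough to cover a
// vector of VecSize bits.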
1468static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1469 if (VecSize <= 32) // 4 bytes
1470 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1471 if (VecSize <= 64) // 8 bytes
1472 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1473 if (VecSize <= 96) // 12 bytes
1474 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1475 if (VecSize <= 128) // 16 bytes
1476 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1477 if (VecSize <= 160) // 20 bytes
1478 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1479 if (VecSize <= 256) // 32 bytes
1480 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1481 if (VecSize <= 288) // 36 bytes
1482 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1483 if (VecSize <= 320) // 40 bytes
1484 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1485 if (VecSize <= 352) // 44 bytes
1486 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1487 if (VecSize <= 384) // 48 bytes
1488 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1489 if (VecSize <= 512) // 64 bytes
1490 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1491 if (VecSize <= 1024) // 128 bytes
1492 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1493
1494 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1495}
1496
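// Pick the 32-bit-element S_INDIRECT_REG_WRITE_MOVREL pseudo wide enough to
// cover a vector of VecSize bits.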
1497static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1498 if (VecSize <= 32) // 4 bytes
1499 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1500 if (VecSize <= 64) // 8 bytes
1501 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1502 if (VecSize <= 96) // 12 bytes
1503 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1504 if (VecSize <= 128) // 16 bytes
1505 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1506 if (VecSize <= 160) // 20 bytes
1507 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1508 if (VecSize <= 256) // 32 bytes
1509 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1510 if (VecSize <= 288) // 36 bytes
1511 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1512 if (VecSize <= 320) // 40 bytes
1513 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1514 if (VecSize <= 352) // 44 bytes
1515 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1516 if (VecSize <= 384) // 48 bytes
1517 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1518 if (VecSize <= 512) // 64 bytes
1519 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1520 if (VecSize <= 1024) // 128 bytes
1521 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1522
1523 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1524}
1525
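// Pick the 64-bit-element S_INDIRECT_REG_WRITE_MOVREL pseudo wide enough to
// cover a vector of VecSize bits.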
1526static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1527 if (VecSize <= 64) // 8 bytes
1528 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1529 if (VecSize <= 128) // 16 bytes
1530 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1531 if (VecSize <= 256) // 32 bytes
1532 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1533 if (VecSize <= 512) // 64 bytes
1534 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1535 if (VecSize <= 1024) // 128 bytes
1536 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1537
1538 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1539}
1540
1541const MCInstrDesc &
1542SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1543 bool IsSGPR) const {
1544 if (IsSGPR) {
1545 switch (EltSize) {
1546 case 32:
1547 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1548 case 64:
1549 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1550 default:
1551 llvm_unreachable("invalid reg indexing elt size");
1552 }
1553 }
1554
1555 assert(EltSize == 32 && "invalid reg indexing elt size");
1556 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1557}
1558
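// Map a spill size in bytes to the corresponding SGPR spill-save pseudo.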
1559static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1560 switch (Size) {
1561 case 4:
1562 return AMDGPU::SI_SPILL_S32_SAVE;
1563 case 8:
1564 return AMDGPU::SI_SPILL_S64_SAVE;
1565 case 12:
1566 return AMDGPU::SI_SPILL_S96_SAVE;
1567 case 16:
1568 return AMDGPU::SI_SPILL_S128_SAVE;
1569 case 20:
1570 return AMDGPU::SI_SPILL_S160_SAVE;
1571 case 24:
1572 return AMDGPU::SI_SPILL_S192_SAVE;
1573 case 28:
1574 return AMDGPU::SI_SPILL_S224_SAVE;
1575 case 32:
1576 return AMDGPU::SI_SPILL_S256_SAVE;
1577 case 36:
1578 return AMDGPU::SI_SPILL_S288_SAVE;
1579 case 40:
1580 return AMDGPU::SI_SPILL_S320_SAVE;
1581 case 44:
1582 return AMDGPU::SI_SPILL_S352_SAVE;
1583 case 48:
1584 return AMDGPU::SI_SPILL_S384_SAVE;
1585 case 64:
1586 return AMDGPU::SI_SPILL_S512_SAVE;
1587 case 128:
1588 return AMDGPU::SI_SPILL_S1024_SAVE;
1589 default:
1590 llvm_unreachable("unknown register size");
1591 }
1592}
1593
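// Map a spill size in bytes to the corresponding VGPR spill-save pseudo.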
1594static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1595 switch (Size) {
1596 case 2:
1597 return AMDGPU::SI_SPILL_V16_SAVE;
1598 case 4:
1599 return AMDGPU::SI_SPILL_V32_SAVE;
1600 case 8:
1601 return AMDGPU::SI_SPILL_V64_SAVE;
1602 case 12:
1603 return AMDGPU::SI_SPILL_V96_SAVE;
1604 case 16:
1605 return AMDGPU::SI_SPILL_V128_SAVE;
1606 case 20:
1607 return AMDGPU::SI_SPILL_V160_SAVE;
1608 case 24:
1609 return AMDGPU::SI_SPILL_V192_SAVE;
1610 case 28:
1611 return AMDGPU::SI_SPILL_V224_SAVE;
1612 case 32:
1613 return AMDGPU::SI_SPILL_V256_SAVE;
1614 case 36:
1615 return AMDGPU::SI_SPILL_V288_SAVE;
1616 case 40:
1617 return AMDGPU::SI_SPILL_V320_SAVE;
1618 case 44:
1619 return AMDGPU::SI_SPILL_V352_SAVE;
1620 case 48:
1621 return AMDGPU::SI_SPILL_V384_SAVE;
1622 case 64:
1623 return AMDGPU::SI_SPILL_V512_SAVE;
1624 case 128:
1625 return AMDGPU::SI_SPILL_V1024_SAVE;
1626 default:
1627 llvm_unreachable("unknown register size");
1628 }
1629}
1630
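// Map a spill size in bytes to the corresponding AV (AGPR/VGPR superclass)
// spill-save pseudo.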
1631static unsigned getAVSpillSaveOpcode(unsigned Size) {
1632 switch (Size) {
1633 case 4:
1634 return AMDGPU::SI_SPILL_AV32_SAVE;
1635 case 8:
1636 return AMDGPU::SI_SPILL_AV64_SAVE;
1637 case 12:
1638 return AMDGPU::SI_SPILL_AV96_SAVE;
1639 case 16:
1640 return AMDGPU::SI_SPILL_AV128_SAVE;
1641 case 20:
1642 return AMDGPU::SI_SPILL_AV160_SAVE;
1643 case 24:
1644 return AMDGPU::SI_SPILL_AV192_SAVE;
1645 case 28:
1646 return AMDGPU::SI_SPILL_AV224_SAVE;
1647 case 32:
1648 return AMDGPU::SI_SPILL_AV256_SAVE;
1649 case 36:
1650 return AMDGPU::SI_SPILL_AV288_SAVE;
1651 case 40:
1652 return AMDGPU::SI_SPILL_AV320_SAVE;
1653 case 44:
1654 return AMDGPU::SI_SPILL_AV352_SAVE;
1655 case 48:
1656 return AMDGPU::SI_SPILL_AV384_SAVE;
1657 case 64:
1658 return AMDGPU::SI_SPILL_AV512_SAVE;
1659 case 128:
1660 return AMDGPU::SI_SPILL_AV1024_SAVE;
1661 default:
1662 llvm_unreachable("unknown register size");
1663 }
1664}
1665
1666static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1667 bool IsVectorSuperClass) {
1668 // Currently, only 32-bit WWM register spills are needed.
1669 if (Size != 4)
1670 llvm_unreachable("unknown wwm register spill size");
1671
1672 if (IsVectorSuperClass)
1673 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1674
1675 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1676}
1677
1678unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1679 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1680 const SIMachineFunctionInfo &MFI) const {
1681 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1682
1683 // Choose the right opcode if spilling a WWM register.
1684 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1685 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1686
1687 // TODO: Check if AGPRs are available
1688 if (ST.hasMAIInsts())
1689 return getAVSpillSaveOpcode(Size);
1690
1691 return getVGPRSpillSaveOpcode(Size);
1692}
1693
1694void SIInstrInfo::storeRegToStackSlot(
1695 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1696 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1697 const TargetRegisterInfo *TRI, Register VReg,
1698 MachineInstr::MIFlag Flags) const {
1699 MachineFunction *MF = MBB.getParent();
1700 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1701 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1702 const DebugLoc &DL = MBB.findDebugLoc(MI);
1703
1704 MachinePointerInfo PtrInfo
1705 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1706 MachineMemOperand *MMO = MF->getMachineMemOperand(
1707 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1708 FrameInfo.getObjectAlign(FrameIndex));
1709 unsigned SpillSize = TRI->getSpillSize(*RC);
1710
1711 MachineRegisterInfo &MRI = MF->getRegInfo();
1712 if (RI.isSGPRClass(RC)) {
1713 MFI->setHasSpilledSGPRs();
1714 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1715 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1716 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1717
1718 // We are only allowed to create one new instruction when spilling
1719 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1720 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1721
1722 // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
1723 // to make sure we are using the correct register class.
1724 if (SrcReg.isVirtual() && SpillSize == 4) {
1725 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1726 }
1727
1728 BuildMI(MBB, MI, DL, OpDesc)
1729 .addReg(SrcReg, getKillRegState(isKill)) // data
1730 .addFrameIndex(FrameIndex) // addr
1731 .addMemOperand(MMO)
1732 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1733
1734 if (RI.spillSGPRToVGPR())
1735 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1736 return;
1737 }
1738
1739 unsigned Opcode =
1740 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1741 MFI->setHasSpilledVGPRs();
1742
1743 BuildMI(MBB, MI, DL, get(Opcode))
1744 .addReg(SrcReg, getKillRegState(isKill)) // data
1745 .addFrameIndex(FrameIndex) // addr
1746 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1747 .addImm(0) // offset
1748 .addMemOperand(MMO);
1749}
1750
1751static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1752 switch (Size) {
1753 case 4:
1754 return AMDGPU::SI_SPILL_S32_RESTORE;
1755 case 8:
1756 return AMDGPU::SI_SPILL_S64_RESTORE;
1757 case 12:
1758 return AMDGPU::SI_SPILL_S96_RESTORE;
1759 case 16:
1760 return AMDGPU::SI_SPILL_S128_RESTORE;
1761 case 20:
1762 return AMDGPU::SI_SPILL_S160_RESTORE;
1763 case 24:
1764 return AMDGPU::SI_SPILL_S192_RESTORE;
1765 case 28:
1766 return AMDGPU::SI_SPILL_S224_RESTORE;
1767 case 32:
1768 return AMDGPU::SI_SPILL_S256_RESTORE;
1769 case 36:
1770 return AMDGPU::SI_SPILL_S288_RESTORE;
1771 case 40:
1772 return AMDGPU::SI_SPILL_S320_RESTORE;
1773 case 44:
1774 return AMDGPU::SI_SPILL_S352_RESTORE;
1775 case 48:
1776 return AMDGPU::SI_SPILL_S384_RESTORE;
1777 case 64:
1778 return AMDGPU::SI_SPILL_S512_RESTORE;
1779 case 128:
1780 return AMDGPU::SI_SPILL_S1024_RESTORE;
1781 default:
1782 llvm_unreachable("unknown register size");
1783 }
1784}
1785
1786static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1787 switch (Size) {
1788 case 2:
1789 return AMDGPU::SI_SPILL_V16_RESTORE;
1790 case 4:
1791 return AMDGPU::SI_SPILL_V32_RESTORE;
1792 case 8:
1793 return AMDGPU::SI_SPILL_V64_RESTORE;
1794 case 12:
1795 return AMDGPU::SI_SPILL_V96_RESTORE;
1796 case 16:
1797 return AMDGPU::SI_SPILL_V128_RESTORE;
1798 case 20:
1799 return AMDGPU::SI_SPILL_V160_RESTORE;
1800 case 24:
1801 return AMDGPU::SI_SPILL_V192_RESTORE;
1802 case 28:
1803 return AMDGPU::SI_SPILL_V224_RESTORE;
1804 case 32:
1805 return AMDGPU::SI_SPILL_V256_RESTORE;
1806 case 36:
1807 return AMDGPU::SI_SPILL_V288_RESTORE;
1808 case 40:
1809 return AMDGPU::SI_SPILL_V320_RESTORE;
1810 case 44:
1811 return AMDGPU::SI_SPILL_V352_RESTORE;
1812 case 48:
1813 return AMDGPU::SI_SPILL_V384_RESTORE;
1814 case 64:
1815 return AMDGPU::SI_SPILL_V512_RESTORE;
1816 case 128:
1817 return AMDGPU::SI_SPILL_V1024_RESTORE;
1818 default:
1819 llvm_unreachable("unknown register size");
1820 }
1821}
1822
1823static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1824 switch (Size) {
1825 case 4:
1826 return AMDGPU::SI_SPILL_AV32_RESTORE;
1827 case 8:
1828 return AMDGPU::SI_SPILL_AV64_RESTORE;
1829 case 12:
1830 return AMDGPU::SI_SPILL_AV96_RESTORE;
1831 case 16:
1832 return AMDGPU::SI_SPILL_AV128_RESTORE;
1833 case 20:
1834 return AMDGPU::SI_SPILL_AV160_RESTORE;
1835 case 24:
1836 return AMDGPU::SI_SPILL_AV192_RESTORE;
1837 case 28:
1838 return AMDGPU::SI_SPILL_AV224_RESTORE;
1839 case 32:
1840 return AMDGPU::SI_SPILL_AV256_RESTORE;
1841 case 36:
1842 return AMDGPU::SI_SPILL_AV288_RESTORE;
1843 case 40:
1844 return AMDGPU::SI_SPILL_AV320_RESTORE;
1845 case 44:
1846 return AMDGPU::SI_SPILL_AV352_RESTORE;
1847 case 48:
1848 return AMDGPU::SI_SPILL_AV384_RESTORE;
1849 case 64:
1850 return AMDGPU::SI_SPILL_AV512_RESTORE;
1851 case 128:
1852 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1853 default:
1854 llvm_unreachable("unknown register size");
1855 }
1856}
1857
1858static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1859 bool IsVectorSuperClass) {
1860 // Currently, only 32-bit WWM register spills are needed.
1861 if (Size != 4)
1862 llvm_unreachable("unknown wwm register spill size");
1863
1864 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1865 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1866
1867 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1868}
1869
1871 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1872 const SIMachineFunctionInfo &MFI) const {
1873 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1874
1875 // Choose the right opcode if restoring a WWM register.
1876 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1877 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1878
1879 // TODO: Check if AGPRs are available
1880 if (ST.hasMAIInsts())
1881 return getAVSpillRestoreOpcode(Size);
1882
1883 assert(!RI.isAGPRClass(RC));
1884 return getVGPRSpillRestoreOpcode(Size);
1885}
1886
1889 Register DestReg, int FrameIndex,
1890 const TargetRegisterClass *RC,
1891 const TargetRegisterInfo *TRI,
1892 Register VReg,
1893 MachineInstr::MIFlag Flags) const {
1894 MachineFunction *MF = MBB.getParent();
1895 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1896 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1897 const DebugLoc &DL = MBB.findDebugLoc(MI);
1898 unsigned SpillSize = TRI->getSpillSize(*RC);
1899
1900 MachinePointerInfo PtrInfo
1901 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1902
1903 MachineMemOperand *MMO = MF->getMachineMemOperand(
1904 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1905 FrameInfo.getObjectAlign(FrameIndex));
1906
1907 if (RI.isSGPRClass(RC)) {
1908 MFI->setHasSpilledSGPRs();
1909 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1910 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1911 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1912
1913 // FIXME: Maybe this should not include a memoperand because it will be
1914 // lowered to non-memory instructions.
1915 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1916 if (DestReg.isVirtual() && SpillSize == 4) {
1917 MachineRegisterInfo &MRI = MF->getRegInfo();
1918 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1919 }
1920
1921 if (RI.spillSGPRToVGPR())
1922 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1923 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1924 .addFrameIndex(FrameIndex) // addr
1925 .addMemOperand(MMO)
1926 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1927
1928 return;
1929 }
1930
1931 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1932 SpillSize, *MFI);
1933 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1934 .addFrameIndex(FrameIndex) // vaddr
1935 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1936 .addImm(0) // offset
1937 .addMemOperand(MMO);
1938}
1939
1944
1947 unsigned Quantity) const {
1948 DebugLoc DL = MBB.findDebugLoc(MI);
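 // Each S_NOP immediate N yields N + 1 wait states and the field holds at
 // most 7, so e.g. a request for 10 no-op wait states becomes s_nop 7
 // followed by s_nop 1.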
1949 while (Quantity > 0) {
1950 unsigned Arg = std::min(Quantity, 8u);
1951 Quantity -= Arg;
1952 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1953 }
1954}
1955
1957 auto *MF = MBB.getParent();
1958 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1959
1960 assert(Info->isEntryFunction());
1961
1962 if (MBB.succ_empty()) {
1963 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1964 if (HasNoTerminator) {
1965 if (Info->returnsVoid()) {
1966 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1967 } else {
1968 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1969 }
1970 }
1971 }
1972}
1973
1977 const DebugLoc &DL) const {
1978 MachineFunction *MF = MBB.getParent();
1979 constexpr unsigned DoorbellIDMask = 0x3ff;
1980 constexpr unsigned ECQueueWaveAbort = 0x400;
1981
1982 MachineBasicBlock *TrapBB = &MBB;
1983 MachineBasicBlock *ContBB = &MBB;
1984 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1985
1986 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1987 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1988 TrapBB = MF->CreateMachineBasicBlock();
1989 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1990 MF->push_back(TrapBB);
1991 MBB.addSuccessor(TrapBB);
1992 }
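 // The expansion below produces up to three blocks: TrapBB notifies the trap
 // handler (doorbell ID | queue-wave-abort bit, sent via s_sendmsg), HaltLoopBB
 // halts the wave and branches back to itself, and ContBB holds whatever
 // followed the original instruction.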
1993
1994 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround, this
1995 // will be a nop.
1996 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
1997 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
1998 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1999 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2000 DoorbellReg)
2002 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2003 .addUse(AMDGPU::M0);
2004 Register DoorbellRegMasked =
2005 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2006 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2007 .addUse(DoorbellReg)
2008 .addImm(DoorbellIDMask);
2009 Register SetWaveAbortBit =
2010 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2011 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2012 .addUse(DoorbellRegMasked)
2013 .addImm(ECQueueWaveAbort);
2014 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2015 .addUse(SetWaveAbortBit);
2016 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2018 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2019 .addUse(AMDGPU::TTMP2);
2020 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2021 TrapBB->addSuccessor(HaltLoopBB);
2022
2023 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2024 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2025 .addMBB(HaltLoopBB);
2026 MF->push_back(HaltLoopBB);
2027 HaltLoopBB->addSuccessor(HaltLoopBB);
2028
2029 return ContBB;
2030}
2031
2033 switch (MI.getOpcode()) {
2034 default:
2035 if (MI.isMetaInstruction())
2036 return 0;
2037 return 1; // FIXME: Do wait states equal cycles?
2038
2039 case AMDGPU::S_NOP:
2040 return MI.getOperand(0).getImm() + 1;
2041 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2042 // hazard, even if one exists, won't really be visible. Should we handle it?
2043 }
2044}
2045
2047 MachineBasicBlock &MBB = *MI.getParent();
2048 DebugLoc DL = MBB.findDebugLoc(MI);
2049 switch (MI.getOpcode()) {
2050 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2051 case AMDGPU::S_MOV_B64_term:
2052 // This is only a terminator to get the correct spill code placement during
2053 // register allocation.
2054 MI.setDesc(get(AMDGPU::S_MOV_B64));
2055 break;
2056
2057 case AMDGPU::S_MOV_B32_term:
2058 // This is only a terminator to get the correct spill code placement during
2059 // register allocation.
2060 MI.setDesc(get(AMDGPU::S_MOV_B32));
2061 break;
2062
2063 case AMDGPU::S_XOR_B64_term:
2064 // This is only a terminator to get the correct spill code placement during
2065 // register allocation.
2066 MI.setDesc(get(AMDGPU::S_XOR_B64));
2067 break;
2068
2069 case AMDGPU::S_XOR_B32_term:
2070 // This is only a terminator to get the correct spill code placement during
2071 // register allocation.
2072 MI.setDesc(get(AMDGPU::S_XOR_B32));
2073 break;
2074 case AMDGPU::S_OR_B64_term:
2075 // This is only a terminator to get the correct spill code placement during
2076 // register allocation.
2077 MI.setDesc(get(AMDGPU::S_OR_B64));
2078 break;
2079 case AMDGPU::S_OR_B32_term:
2080 // This is only a terminator to get the correct spill code placement during
2081 // register allocation.
2082 MI.setDesc(get(AMDGPU::S_OR_B32));
2083 break;
2084
2085 case AMDGPU::S_ANDN2_B64_term:
2086 // This is only a terminator to get the correct spill code placement during
2087 // register allocation.
2088 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2089 break;
2090
2091 case AMDGPU::S_ANDN2_B32_term:
2092 // This is only a terminator to get the correct spill code placement during
2093 // register allocation.
2094 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2095 break;
2096
2097 case AMDGPU::S_AND_B64_term:
2098 // This is only a terminator to get the correct spill code placement during
2099 // register allocation.
2100 MI.setDesc(get(AMDGPU::S_AND_B64));
2101 break;
2102
2103 case AMDGPU::S_AND_B32_term:
2104 // This is only a terminator to get the correct spill code placement during
2105 // register allocation.
2106 MI.setDesc(get(AMDGPU::S_AND_B32));
2107 break;
2108
2109 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2110 // This is only a terminator to get the correct spill code placement during
2111 // register allocation.
2112 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2113 break;
2114
2115 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2116 // This is only a terminator to get the correct spill code placement during
2117 // register allocation.
2118 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2119 break;
2120
2121 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2122 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2123 break;
2124
2125 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2126 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2127 MI.getMF()->getRegInfo().constrainRegClass(MI.getOperand(0).getReg(),
2128 &AMDGPU::SReg_32_XM0RegClass);
2129 break;
2130 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2131 Register Dst = MI.getOperand(0).getReg();
2132 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2133 MI.setDesc(
2134 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2135 break;
2136 }
2137 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2138 Register Dst = MI.getOperand(0).getReg();
2139 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2140 int64_t Imm = MI.getOperand(1).getImm();
2141
2142 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2143 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2144 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2147 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2148 .addImm(SignExtend64<32>(Imm >> 32))
2150 MI.eraseFromParent();
2151 break;
2152 }
2153
2154 [[fallthrough]];
2155 }
2156 case AMDGPU::V_MOV_B64_PSEUDO: {
2157 Register Dst = MI.getOperand(0).getReg();
2158 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2159 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2160
2161 const MachineOperand &SrcOp = MI.getOperand(1);
2162 // FIXME: Will this work for 64-bit floating point immediates?
2163 assert(!SrcOp.isFPImm());
2164 if (ST.hasMovB64()) {
2165 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2166 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2167 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2168 break;
2169 }
2170 if (SrcOp.isImm()) {
2171 APInt Imm(64, SrcOp.getImm());
2172 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2173 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
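 // If both 32-bit halves are the same inline constant, a single packed
 // v_pk_mov_b32 can materialize the whole 64-bit value; otherwise fall back
 // to two v_mov_b32 writes of the sub0/sub1 halves.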
2174 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2175 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2177 .addImm(Lo.getSExtValue())
2179 .addImm(Lo.getSExtValue())
2180 .addImm(0) // op_sel_lo
2181 .addImm(0) // op_sel_hi
2182 .addImm(0) // neg_lo
2183 .addImm(0) // neg_hi
2184 .addImm(0); // clamp
2185 } else {
2186 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2187 .addImm(Lo.getSExtValue())
2189 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2190 .addImm(Hi.getSExtValue())
2192 }
2193 } else {
2194 assert(SrcOp.isReg());
2195 if (ST.hasPkMovB32() &&
2196 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2197 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2198 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2199 .addReg(SrcOp.getReg())
2201 .addReg(SrcOp.getReg())
2202 .addImm(0) // op_sel_lo
2203 .addImm(0) // op_sel_hi
2204 .addImm(0) // neg_lo
2205 .addImm(0) // neg_hi
2206 .addImm(0); // clamp
2207 } else {
2208 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2209 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2211 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2212 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2214 }
2215 }
2216 MI.eraseFromParent();
2217 break;
2218 }
2219 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2221 break;
2222 }
2223 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2224 const MachineOperand &SrcOp = MI.getOperand(1);
2225 assert(!SrcOp.isFPImm());
2226
2227 if (ST.has64BitLiterals()) {
2228 MI.setDesc(get(AMDGPU::S_MOV_B64));
2229 break;
2230 }
2231
2232 APInt Imm(64, SrcOp.getImm());
2233 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2234 MI.setDesc(get(AMDGPU::S_MOV_B64));
2235 break;
2236 }
2237
2238 Register Dst = MI.getOperand(0).getReg();
2239 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2240 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2241
2242 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2243 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2244 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2245 .addImm(Lo.getSExtValue())
2247 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2248 .addImm(Hi.getSExtValue())
2250 MI.eraseFromParent();
2251 break;
2252 }
2253 case AMDGPU::V_SET_INACTIVE_B32: {
2254 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2255 Register DstReg = MI.getOperand(0).getReg();
2256 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2257 .add(MI.getOperand(3))
2258 .add(MI.getOperand(4))
2259 .add(MI.getOperand(1))
2260 .add(MI.getOperand(2))
2261 .add(MI.getOperand(5));
2262 MI.eraseFromParent();
2263 break;
2264 }
2265 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2266 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2267 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2268 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2269 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2270 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2271 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2272 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2273 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2274 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2275 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2276 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2277 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2278 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2279 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2280 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2281 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2282 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2283 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2284 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2285 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2286 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2287 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2288 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2289 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2290 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2291 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2292 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2293 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2294 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2295
2296 unsigned Opc;
2297 if (RI.hasVGPRs(EltRC)) {
2298 Opc = AMDGPU::V_MOVRELD_B32_e32;
2299 } else {
2300 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2301 : AMDGPU::S_MOVRELD_B32;
2302 }
2303
2304 const MCInstrDesc &OpDesc = get(Opc);
2305 Register VecReg = MI.getOperand(0).getReg();
2306 bool IsUndef = MI.getOperand(1).isUndef();
2307 unsigned SubReg = MI.getOperand(3).getImm();
2308 assert(VecReg == MI.getOperand(1).getReg());
2309
2311 BuildMI(MBB, MI, DL, OpDesc)
2312 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2313 .add(MI.getOperand(2))
2315 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2316
2317 const int ImpDefIdx =
2318 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2319 const int ImpUseIdx = ImpDefIdx + 1;
2320 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2321 MI.eraseFromParent();
2322 break;
2323 }
2324 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2325 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2326 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2327 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2328 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2329 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2330 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2331 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2332 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2333 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2334 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2335 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2336 assert(ST.useVGPRIndexMode());
2337 Register VecReg = MI.getOperand(0).getReg();
2338 bool IsUndef = MI.getOperand(1).isUndef();
2339 MachineOperand &Idx = MI.getOperand(3);
2340 Register SubReg = MI.getOperand(4).getImm();
2341
2342 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2343 .add(Idx)
2345 SetOn->getOperand(3).setIsUndef();
2346
2347 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2349 BuildMI(MBB, MI, DL, OpDesc)
2350 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2351 .add(MI.getOperand(2))
2353 .addReg(VecReg,
2354 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2355
2356 const int ImpDefIdx =
2357 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2358 const int ImpUseIdx = ImpDefIdx + 1;
2359 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2360
2361 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2362
2363 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2364
2365 MI.eraseFromParent();
2366 break;
2367 }
2368 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2369 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2370 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2371 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2372 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2373 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2374 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2375 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2376 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2377 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2378 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2379 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2380 assert(ST.useVGPRIndexMode());
2381 Register Dst = MI.getOperand(0).getReg();
2382 Register VecReg = MI.getOperand(1).getReg();
2383 bool IsUndef = MI.getOperand(1).isUndef();
2384 Register Idx = MI.getOperand(2).getReg();
2385 Register SubReg = MI.getOperand(3).getImm();
2386
2387 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2388 .addReg(Idx)
2390 SetOn->getOperand(3).setIsUndef();
2391
2392 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2393 .addDef(Dst)
2394 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2395 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2396
2397 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2398
2399 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2400
2401 MI.eraseFromParent();
2402 break;
2403 }
2404 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2405 MachineFunction &MF = *MBB.getParent();
2406 Register Reg = MI.getOperand(0).getReg();
2407 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2408 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2409 MachineOperand OpLo = MI.getOperand(1);
2410 MachineOperand OpHi = MI.getOperand(2);
2411
2412 // Create a bundle so these instructions won't be re-ordered by the
2413 // post-RA scheduler.
2414 MIBundleBuilder Bundler(MBB, MI);
2415 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2416
2417 // What we want here is an offset from the value returned by s_getpc (which
2418 // is the address of the s_add_u32 instruction) to the global variable, but
2419 // since the encoding of $symbol starts 4 bytes after the start of the
2420 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2421 // small. This requires us to add 4 to the global variable offset in order
2422 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2423 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2424 // instruction.
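 // For example, ignoring the zero-extension fixup below, the emitted bundle is:
 //   s_getpc_b64 reg                    ; reg = address of the following s_add_u32
 //   s_add_u32   reglo, reglo, $symbol  ; $symbol encoded 4 bytes after s_add_u32
 //   s_addc_u32  reghi, reghi, $symbol  ; $symbol encoded 12 bytes after s_add_u32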
2425
2426 int64_t Adjust = 0;
2427 if (ST.hasGetPCZeroExtension()) {
2428 // Fix up hardware that does not sign-extend the 48-bit PC value by
2429 // inserting: s_sext_i32_i16 reghi, reghi
2430 Bundler.append(
2431 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2432 Adjust += 4;
2433 }
2434
2435 if (OpLo.isGlobal())
2436 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2437 Bundler.append(
2438 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2439
2440 if (OpHi.isGlobal())
2441 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2442 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2443 .addReg(RegHi)
2444 .add(OpHi));
2445
2446 finalizeBundle(MBB, Bundler.begin());
2447
2448 MI.eraseFromParent();
2449 break;
2450 }
2451 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2452 MachineFunction &MF = *MBB.getParent();
2453 Register Reg = MI.getOperand(0).getReg();
2454 MachineOperand Op = MI.getOperand(1);
2455
2456 // Create a bundle so these instructions won't be re-ordered by the
2457 // post-RA scheduler.
2458 MIBundleBuilder Bundler(MBB, MI);
2459 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2460 if (Op.isGlobal())
2461 Op.setOffset(Op.getOffset() + 4);
2462 Bundler.append(
2463 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2464
2465 finalizeBundle(MBB, Bundler.begin());
2466
2467 MI.eraseFromParent();
2468 break;
2469 }
2470 case AMDGPU::ENTER_STRICT_WWM: {
2471 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2472 // Whole Wave Mode is entered.
2473 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2474 : AMDGPU::S_OR_SAVEEXEC_B64));
2475 break;
2476 }
2477 case AMDGPU::ENTER_STRICT_WQM: {
2478 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2479 // STRICT_WQM is entered.
2480 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2481 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2482 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2483 BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2484 BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2485
2486 MI.eraseFromParent();
2487 break;
2488 }
2489 case AMDGPU::EXIT_STRICT_WWM:
2490 case AMDGPU::EXIT_STRICT_WQM: {
2491 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2492 // WWM/STRICT_WQM is exited.
2493 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2494 break;
2495 }
2496 case AMDGPU::SI_RETURN: {
2497 const MachineFunction *MF = MBB.getParent();
2498 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2499 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2500 // Hiding the return address use with SI_RETURN may lead to extra kills in
2501 // the function and missing live-ins. We are fine in practice because callee
2502 // saved register handling ensures the register value is restored before
2503 // RET, but we need the undef flag here to appease the MachineVerifier
2504 // liveness checks.
2506 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2507 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2508
2509 MIB.copyImplicitOps(MI);
2510 MI.eraseFromParent();
2511 break;
2512 }
2513
2514 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2515 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2516 MI.setDesc(get(AMDGPU::S_MUL_U64));
2517 break;
2518
2519 case AMDGPU::S_GETPC_B64_pseudo:
2520 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2521 if (ST.hasGetPCZeroExtension()) {
2522 Register Dst = MI.getOperand(0).getReg();
2523 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2524 // Fix up hardware that does not sign-extend the 48-bit PC value by
2525 // inserting: s_sext_i32_i16 dsthi, dsthi
2526 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2527 DstHi)
2528 .addReg(DstHi);
2529 }
2530 break;
2531
2532 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2533 assert(ST.hasBF16PackedInsts());
2534 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2535 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2536 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2537 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2538 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2539 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2540 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2541 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2542 break;
2543 }
2544
2545 return true;
2546}
2547
2550 unsigned SubIdx, const MachineInstr &Orig,
2551 const TargetRegisterInfo &RI) const {
2552
2553 // Try shrinking the instruction to remat only the part needed for the current
2554 // context.
2555 // TODO: Handle more cases.
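 // For example, if the only use of a rematerialized S_LOAD_DWORDX8 result
 // reads sub4_sub5_sub6_sub7, the clone becomes an S_LOAD_DWORDX4 with its
 // immediate offset advanced by 16 bytes.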
2556 unsigned Opcode = Orig.getOpcode();
2557 switch (Opcode) {
2558 case AMDGPU::S_LOAD_DWORDX16_IMM:
2559 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2560 if (SubIdx != 0)
2561 break;
2562
2563 if (I == MBB.end())
2564 break;
2565
2566 if (I->isBundled())
2567 break;
2568
2569 // Look for a single use of the register that is also a subreg.
2570 Register RegToFind = Orig.getOperand(0).getReg();
2571 MachineOperand *UseMO = nullptr;
2572 for (auto &CandMO : I->operands()) {
2573 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2574 continue;
2575 if (UseMO) {
2576 UseMO = nullptr;
2577 break;
2578 }
2579 UseMO = &CandMO;
2580 }
2581 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2582 break;
2583
2584 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2585 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2586
2587 MachineFunction *MF = MBB.getParent();
2589 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2590
2591 unsigned NewOpcode = -1;
2592 if (SubregSize == 256)
2593 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2594 else if (SubregSize == 128)
2595 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2596 else
2597 break;
2598
2599 const MCInstrDesc &TID = get(NewOpcode);
2600 const TargetRegisterClass *NewRC =
2601 RI.getAllocatableClass(getRegClass(TID, 0, &RI));
2602 MRI.setRegClass(DestReg, NewRC);
2603
2604 UseMO->setReg(DestReg);
2605 UseMO->setSubReg(AMDGPU::NoSubRegister);
2606
2607 // Use a smaller load with the desired size, possibly with updated offset.
2608 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2609 MI->setDesc(TID);
2610 MI->getOperand(0).setReg(DestReg);
2611 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2612 if (Offset) {
2613 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2614 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2615 OffsetMO->setImm(FinalOffset);
2616 }
2618 for (const MachineMemOperand *MemOp : Orig.memoperands())
2619 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2620 SubregSize / 8));
2621 MI->setMemRefs(*MF, NewMMOs);
2622
2623 MBB.insert(I, MI);
2624 return;
2625 }
2626
2627 default:
2628 break;
2629 }
2630
2631 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2632}
2633
2634std::pair<MachineInstr*, MachineInstr*>
2636 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2637
2638 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2640 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2641 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2642 return std::pair(&MI, nullptr);
2643 }
2644
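 // Otherwise split the 64-bit DPP move into two 32-bit DPP moves, one per
 // half, and (for virtual destinations) recombine the results with a
 // REG_SEQUENCE.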
2645 MachineBasicBlock &MBB = *MI.getParent();
2646 DebugLoc DL = MBB.findDebugLoc(MI);
2647 MachineFunction *MF = MBB.getParent();
2649 Register Dst = MI.getOperand(0).getReg();
2650 unsigned Part = 0;
2651 MachineInstr *Split[2];
2652
2653 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2654 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2655 if (Dst.isPhysical()) {
2656 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2657 } else {
2658 assert(MRI.isSSA());
2659 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2660 MovDPP.addDef(Tmp);
2661 }
2662
2663 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2664 const MachineOperand &SrcOp = MI.getOperand(I);
2665 assert(!SrcOp.isFPImm());
2666 if (SrcOp.isImm()) {
2667 APInt Imm(64, SrcOp.getImm());
2668 Imm.ashrInPlace(Part * 32);
2669 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2670 } else {
2671 assert(SrcOp.isReg());
2672 Register Src = SrcOp.getReg();
2673 if (Src.isPhysical())
2674 MovDPP.addReg(RI.getSubReg(Src, Sub));
2675 else
2676 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2677 }
2678 }
2679
2680 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2681 MovDPP.addImm(MO.getImm());
2682
2683 Split[Part] = MovDPP;
2684 ++Part;
2685 }
2686
2687 if (Dst.isVirtual())
2688 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2689 .addReg(Split[0]->getOperand(0).getReg())
2690 .addImm(AMDGPU::sub0)
2691 .addReg(Split[1]->getOperand(0).getReg())
2692 .addImm(AMDGPU::sub1);
2693
2694 MI.eraseFromParent();
2695 return std::pair(Split[0], Split[1]);
2696}
2697
2698std::optional<DestSourcePair>
2700 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2701 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2702
2703 return std::nullopt;
2704}
2705
2707 AMDGPU::OpName Src0OpName,
2708 MachineOperand &Src1,
2709 AMDGPU::OpName Src1OpName) const {
2710 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2711 if (!Src0Mods)
2712 return false;
2713
2714 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2715 assert(Src1Mods &&
2716 "All commutable instructions have both src0 and src1 modifiers");
2717
2718 int Src0ModsVal = Src0Mods->getImm();
2719 int Src1ModsVal = Src1Mods->getImm();
2720
2721 Src1Mods->setImm(Src0ModsVal);
2722 Src0Mods->setImm(Src1ModsVal);
2723 return true;
2724}
2725
2727 MachineOperand &RegOp,
2728 MachineOperand &NonRegOp) {
2729 Register Reg = RegOp.getReg();
2730 unsigned SubReg = RegOp.getSubReg();
2731 bool IsKill = RegOp.isKill();
2732 bool IsDead = RegOp.isDead();
2733 bool IsUndef = RegOp.isUndef();
2734 bool IsDebug = RegOp.isDebug();
2735
2736 if (NonRegOp.isImm())
2737 RegOp.ChangeToImmediate(NonRegOp.getImm());
2738 else if (NonRegOp.isFI())
2739 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2740 else if (NonRegOp.isGlobal()) {
2741 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2742 NonRegOp.getTargetFlags());
2743 } else
2744 return nullptr;
2745
2746 // Make sure we don't reinterpret a subreg index in the target flags.
2747 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2748
2749 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2750 NonRegOp.setSubReg(SubReg);
2751
2752 return &MI;
2753}
2754
2756 MachineOperand &NonRegOp1,
2757 MachineOperand &NonRegOp2) {
2758 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2759 int64_t NonRegVal = NonRegOp1.getImm();
2760
2761 NonRegOp1.setImm(NonRegOp2.getImm());
2762 NonRegOp2.setImm(NonRegVal);
2763 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2764 NonRegOp2.setTargetFlags(TargetFlags);
2765 return &MI;
2766}
2767
2768bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2769 unsigned OpIdx1) const {
2770 const MCInstrDesc &InstDesc = MI.getDesc();
2771 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2772 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2773
2774 unsigned Opc = MI.getOpcode();
2775 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2776
2777 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2778 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2779
2780 // Swapping must not breach the constant bus or literal limits.
2781 // It may move a literal to a position other than src0, which is not allowed
2782 // pre-gfx10. However, most test cases need literals in Src0 for VOP.
2783 // FIXME: After gfx9, a literal can be placed somewhere other than Src0.
2784 if (isVALU(MI)) {
2785 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2786 !isInlineConstant(MO0, OpInfo1))
2787 return false;
2788 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2789 !isInlineConstant(MO1, OpInfo0))
2790 return false;
2791 }
2792
2793 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2794 if (OpInfo1.RegClass == -1)
2795 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2796 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2797 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2798 }
2799 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2800 if (OpInfo0.RegClass == -1)
2801 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2802 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2803 isLegalRegOperand(MI, OpIdx0, MO1);
2804 }
2805
2806 // No need to check 64-bit literals, since swapping does not bring new
2807 // 64-bit literals into the current instruction to fold to 32 bits.
2808
2809 return isImmOperandLegal(MI, OpIdx1, MO0);
2810}
2811
2813 unsigned Src0Idx,
2814 unsigned Src1Idx) const {
2815 assert(!NewMI && "this should never be used");
2816
2817 unsigned Opc = MI.getOpcode();
2818 int CommutedOpcode = commuteOpcode(Opc);
2819 if (CommutedOpcode == -1)
2820 return nullptr;
2821
2822 if (Src0Idx > Src1Idx)
2823 std::swap(Src0Idx, Src1Idx);
2824
2825 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2826 static_cast<int>(Src0Idx) &&
2827 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2828 static_cast<int>(Src1Idx) &&
2829 "inconsistency with findCommutedOpIndices");
2830
2831 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2832 return nullptr;
2833
2834 MachineInstr *CommutedMI = nullptr;
2835 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2836 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2837 if (Src0.isReg() && Src1.isReg()) {
2838 // Be sure to copy the source modifiers to the right place.
2839 CommutedMI =
2840 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2841 } else if (Src0.isReg() && !Src1.isReg()) {
2842 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2843 } else if (!Src0.isReg() && Src1.isReg()) {
2844 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2845 } else if (Src0.isImm() && Src1.isImm()) {
2846 CommutedMI = swapImmOperands(MI, Src0, Src1);
2847 } else {
2848 // FIXME: Found two non-register operands to commute. This does happen.
2849 return nullptr;
2850 }
2851
2852 if (CommutedMI) {
2853 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2854 Src1, AMDGPU::OpName::src1_modifiers);
2855
2856 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2857 AMDGPU::OpName::src1_sel);
2858
2859 CommutedMI->setDesc(get(CommutedOpcode));
2860 }
2861
2862 return CommutedMI;
2863}
2864
2865// This needs to be implemented because the source modifiers may be inserted
2866// between the true commutable operands, and the base
2867// TargetInstrInfo::commuteInstruction uses it.
2869 unsigned &SrcOpIdx0,
2870 unsigned &SrcOpIdx1) const {
2871 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2872}
2873
2875 unsigned &SrcOpIdx0,
2876 unsigned &SrcOpIdx1) const {
2877 if (!Desc.isCommutable())
2878 return false;
2879
2880 unsigned Opc = Desc.getOpcode();
2881 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2882 if (Src0Idx == -1)
2883 return false;
2884
2885 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2886 if (Src1Idx == -1)
2887 return false;
2888
2889 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2890}
2891
2893 int64_t BrOffset) const {
2894 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2895 // because its dest block is unanalyzable.
2896 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2897
2898 // Convert to dwords.
2899 BrOffset /= 4;
2900
2901 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2902 // from the next instruction.
2903 BrOffset -= 1;
2904
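 // For example, with the default 16-bit branch offset field this accepts
 // destinations within [-0x8000, +0x7fff] dwords of the instruction that
 // follows the branch.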
2905 return isIntN(BranchOffsetBits, BrOffset);
2906}
2907
2910 return MI.getOperand(0).getMBB();
2911}
2912
2914 for (const MachineInstr &MI : MBB->terminators()) {
2915 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2916 MI.getOpcode() == AMDGPU::SI_LOOP)
2917 return true;
2918 }
2919 return false;
2920}
2921
2923 MachineBasicBlock &DestBB,
2924 MachineBasicBlock &RestoreBB,
2925 const DebugLoc &DL, int64_t BrOffset,
2926 RegScavenger *RS) const {
2927 assert(MBB.empty() &&
2928 "new block should be inserted for expanding unconditional branch");
2929 assert(MBB.pred_size() == 1);
2930 assert(RestoreBB.empty() &&
2931 "restore block should be inserted for restoring clobbered registers");
2932
2933 MachineFunction *MF = MBB.getParent();
2936 auto I = MBB.end();
2937 auto &MCCtx = MF->getContext();
2938
2939 if (ST.hasAddPC64Inst()) {
2940 MCSymbol *Offset =
2941 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
2942 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
2944 MCSymbol *PostAddPCLabel =
2945 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
2946 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
2947 auto *OffsetExpr = MCBinaryExpr::createSub(
2948 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2949 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
2950 Offset->setVariableValue(OffsetExpr);
2951 return;
2952 }
2953
2954 assert(RS && "RegScavenger required for long branching");
2955
2956 // FIXME: Virtual register workaround for RegScavenger not working with empty
2957 // blocks.
2958 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2959
2960 // Note: as this is used after hazard recognizer we need to apply some hazard
2961 // workarounds directly.
2962 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2963 ST.hasVALUReadSGPRHazard();
2964 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2965 if (FlushSGPRWrites)
2966 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2968 };
2969
2970 // We need to compute the offset relative to the instruction immediately after
2971 // s_getpc_b64. Insert the pc arithmetic code before the last terminator.
2972 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2973 ApplyHazardWorkarounds();
2974
2975 MCSymbol *PostGetPCLabel =
2976 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2977 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2978
2979 MCSymbol *OffsetLo =
2980 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2981 MCSymbol *OffsetHi =
2982 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2983 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2984 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2985 .addReg(PCReg, 0, AMDGPU::sub0)
2986 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2987 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2988 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2989 .addReg(PCReg, 0, AMDGPU::sub1)
2990 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2991 ApplyHazardWorkarounds();
2992
2993 // Insert the indirect branch after the other terminator.
2994 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2995 .addReg(PCReg);
2996
2997 // If a spill is needed for the pc register pair, we need to insert a spill
2998 // restore block right before the destination block, and insert a short branch
2999 // into the old destination block's fallthrough predecessor.
3000 // e.g.:
3001 //
3002 // s_cbranch_scc0 skip_long_branch:
3003 //
3004 // long_branch_bb:
3005 // spill s[8:9]
3006 // s_getpc_b64 s[8:9]
3007 // s_add_u32 s8, s8, restore_bb
3008 // s_addc_u32 s9, s9, 0
3009 // s_setpc_b64 s[8:9]
3010 //
3011 // skip_long_branch:
3012 // foo;
3013 //
3014 // .....
3015 //
3016 // dest_bb_fallthrough_predecessor:
3017 // bar;
3018 // s_branch dest_bb
3019 //
3020 // restore_bb:
3021 // restore s[8:9]
3022 // fallthrough dest_bb
3023 //
3024 // dest_bb:
3025 // buzz;
3026
3027 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3028 Register Scav;
3029
3030 // If we've previously reserved a register for long branches,
3031 // avoid running the scavenger and just use that register.
3032 if (LongBranchReservedReg) {
3033 RS->enterBasicBlock(MBB);
3034 Scav = LongBranchReservedReg;
3035 } else {
3037 Scav = RS->scavengeRegisterBackwards(
3038 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3039 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3040 }
3041 if (Scav) {
3042 RS->setRegUsed(Scav);
3043 MRI.replaceRegWith(PCReg, Scav);
3044 MRI.clearVirtRegs();
3045 } else {
3046 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
3047 // SGPR spill.
3048 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3049 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3050 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3051 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3052 MRI.clearVirtRegs();
3053 }
3054
3055 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3056 // Now the distance can be defined.
3058 MCSymbolRefExpr::create(DestLabel, MCCtx),
3059 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3060 // Add offset assignments.
3061 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3062 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3063 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3064 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3065}
3066
3067unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3068 switch (Cond) {
3069 case SIInstrInfo::SCC_TRUE:
3070 return AMDGPU::S_CBRANCH_SCC1;
3071 case SIInstrInfo::SCC_FALSE:
3072 return AMDGPU::S_CBRANCH_SCC0;
3073 case SIInstrInfo::VCCNZ:
3074 return AMDGPU::S_CBRANCH_VCCNZ;
3075 case SIInstrInfo::VCCZ:
3076 return AMDGPU::S_CBRANCH_VCCZ;
3077 case SIInstrInfo::EXECNZ:
3078 return AMDGPU::S_CBRANCH_EXECNZ;
3079 case SIInstrInfo::EXECZ:
3080 return AMDGPU::S_CBRANCH_EXECZ;
3081 default:
3082 llvm_unreachable("invalid branch predicate");
3083 }
3084}
3085
3086SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3087 switch (Opcode) {
3088 case AMDGPU::S_CBRANCH_SCC0:
3089 return SCC_FALSE;
3090 case AMDGPU::S_CBRANCH_SCC1:
3091 return SCC_TRUE;
3092 case AMDGPU::S_CBRANCH_VCCNZ:
3093 return VCCNZ;
3094 case AMDGPU::S_CBRANCH_VCCZ:
3095 return VCCZ;
3096 case AMDGPU::S_CBRANCH_EXECNZ:
3097 return EXECNZ;
3098 case AMDGPU::S_CBRANCH_EXECZ:
3099 return EXECZ;
3100 default:
3101 return INVALID_BR;
3102 }
3103}
3104
3108 MachineBasicBlock *&FBB,
3110 bool AllowModify) const {
3111 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3112 // Unconditional Branch
3113 TBB = I->getOperand(0).getMBB();
3114 return false;
3115 }
3116
3117 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3118 if (Pred == INVALID_BR)
3119 return true;
3120
3121 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3122 Cond.push_back(MachineOperand::CreateImm(Pred));
3123 Cond.push_back(I->getOperand(1)); // Save the branch register.
3124
3125 ++I;
3126
3127 if (I == MBB.end()) {
3128 // Conditional branch followed by fall-through.
3129 TBB = CondBB;
3130 return false;
3131 }
3132
3133 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3134 TBB = CondBB;
3135 FBB = I->getOperand(0).getMBB();
3136 return false;
3137 }
3138
3139 return true;
3140}
3141
3143 MachineBasicBlock *&FBB,
3145 bool AllowModify) const {
3146 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3147 auto E = MBB.end();
3148 if (I == E)
3149 return false;
3150
3151 // Skip over the instructions that are artificial terminators for special
3152 // exec management.
3153 while (I != E && !I->isBranch() && !I->isReturn()) {
3154 switch (I->getOpcode()) {
3155 case AMDGPU::S_MOV_B64_term:
3156 case AMDGPU::S_XOR_B64_term:
3157 case AMDGPU::S_OR_B64_term:
3158 case AMDGPU::S_ANDN2_B64_term:
3159 case AMDGPU::S_AND_B64_term:
3160 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3161 case AMDGPU::S_MOV_B32_term:
3162 case AMDGPU::S_XOR_B32_term:
3163 case AMDGPU::S_OR_B32_term:
3164 case AMDGPU::S_ANDN2_B32_term:
3165 case AMDGPU::S_AND_B32_term:
3166 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3167 break;
3168 case AMDGPU::SI_IF:
3169 case AMDGPU::SI_ELSE:
3170 case AMDGPU::SI_KILL_I1_TERMINATOR:
3171 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3172 // FIXME: It's messy that these need to be considered here at all.
3173 return true;
3174 default:
3175 llvm_unreachable("unexpected non-branch terminator inst");
3176 }
3177
3178 ++I;
3179 }
3180
3181 if (I == E)
3182 return false;
3183
3184 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3185}
3186
3188 int *BytesRemoved) const {
3189 unsigned Count = 0;
3190 unsigned RemovedSize = 0;
3191 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3192 // Skip over artificial terminators when removing instructions.
3193 if (MI.isBranch() || MI.isReturn()) {
3194 RemovedSize += getInstSizeInBytes(MI);
3195 MI.eraseFromParent();
3196 ++Count;
3197 }
3198 }
3199
3200 if (BytesRemoved)
3201 *BytesRemoved = RemovedSize;
3202
3203 return Count;
3204}
3205
3206// Copy the flags onto the implicit condition register operand.
3208 const MachineOperand &OrigCond) {
3209 CondReg.setIsUndef(OrigCond.isUndef());
3210 CondReg.setIsKill(OrigCond.isKill());
3211}
3212
3215 MachineBasicBlock *FBB,
3217 const DebugLoc &DL,
3218 int *BytesAdded) const {
3219 if (!FBB && Cond.empty()) {
3220 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3221 .addMBB(TBB);
3222 if (BytesAdded)
3223 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3224 return 1;
3225 }
3226
3227 assert(TBB && Cond[0].isImm());
3228
3229 unsigned Opcode
3230 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3231
3232 if (!FBB) {
3233 MachineInstr *CondBr =
3234 BuildMI(&MBB, DL, get(Opcode))
3235 .addMBB(TBB);
3236
3237 // Copy the flags onto the implicit condition register operand.
3238 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3239 fixImplicitOperands(*CondBr);
3240
3241 if (BytesAdded)
3242 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3243 return 1;
3244 }
3245
3246 assert(TBB && FBB);
3247
3248 MachineInstr *CondBr =
3249 BuildMI(&MBB, DL, get(Opcode))
3250 .addMBB(TBB);
3251 fixImplicitOperands(*CondBr);
3252 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3253 .addMBB(FBB);
3254
3255 MachineOperand &CondReg = CondBr->getOperand(1);
3256 CondReg.setIsUndef(Cond[1].isUndef());
3257 CondReg.setIsKill(Cond[1].isKill());
3258
3259 if (BytesAdded)
3260 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3261
3262 return 2;
3263}
3264
3267 if (Cond.size() != 2) {
3268 return true;
3269 }
3270
3271 if (Cond[0].isImm()) {
3272 Cond[0].setImm(-Cond[0].getImm());
3273 return false;
3274 }
3275
3276 return true;
3277}
3278
3281 Register DstReg, Register TrueReg,
3282 Register FalseReg, int &CondCycles,
3283 int &TrueCycles, int &FalseCycles) const {
3284 switch (Cond[0].getImm()) {
3285 case VCCNZ:
3286 case VCCZ: {
3287 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3288 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3289 if (MRI.getRegClass(FalseReg) != RC)
3290 return false;
3291
3292 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3293 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3294
3295 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3296 return RI.hasVGPRs(RC) && NumInsts <= 6;
3297 }
3298 case SCC_TRUE:
3299 case SCC_FALSE: {
3300 // FIXME: We could insert for VGPRs if we could replace the original compare
3301 // with a vector one.
3302 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3303 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3304 if (MRI.getRegClass(FalseReg) != RC)
3305 return false;
3306
3307 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3308
3309 // Sizes that are a multiple of 8 bytes (64 bits) can use s_cselect_b64.
3310 if (NumInsts % 2 == 0)
3311 NumInsts /= 2;
3312
3313 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3314 return RI.isSGPRClass(RC);
3315 }
3316 default:
3317 return false;
3318 }
3319}
3320
3324 Register TrueReg, Register FalseReg) const {
3325 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3326 if (Pred == VCCZ || Pred == SCC_FALSE) {
3327 Pred = static_cast<BranchPredicate>(-Pred);
3328 std::swap(TrueReg, FalseReg);
3329 }
3330
3331 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3332 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3333 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3334
3335 if (DstSize == 32) {
3337 if (Pred == SCC_TRUE) {
3338 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3339 .addReg(TrueReg)
3340 .addReg(FalseReg);
3341 } else {
3342 // The operands are in the reverse order from what one might expect:
3343 // v_cndmask_b32 takes the false value as src0 and the true value as src1.
3343 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3344 .addReg(FalseReg)
3345 .addReg(TrueReg);
3346 }
3347
3348 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3349 return;
3350 }
3351
3352 if (DstSize == 64 && Pred == SCC_TRUE) {
3354 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3355 .addReg(TrueReg)
3356 .addReg(FalseReg);
3357
3358 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3359 return;
3360 }
3361
3362 static const int16_t Sub0_15[] = {
3363 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3364 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3365 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3366 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3367 };
3368
3369 static const int16_t Sub0_15_64[] = {
3370 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3371 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3372 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3373 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3374 };
3375
3376 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3377 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3378 const int16_t *SubIndices = Sub0_15;
3379 int NElts = DstSize / 32;
3380
3381 // 64-bit select is only available for SALU.
3382 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
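 // For example, a 128-bit VGPR select becomes four v_cndmask_b32 writes (one
 // per 32-bit subregister) combined by a REG_SEQUENCE, while a 128-bit SGPR
 // select on SCC becomes two s_cselect_b64.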
3383 if (Pred == SCC_TRUE) {
3384 if (NElts % 2) {
3385 SelOp = AMDGPU::S_CSELECT_B32;
3386 EltRC = &AMDGPU::SGPR_32RegClass;
3387 } else {
3388 SelOp = AMDGPU::S_CSELECT_B64;
3389 EltRC = &AMDGPU::SGPR_64RegClass;
3390 SubIndices = Sub0_15_64;
3391 NElts /= 2;
3392 }
3393 }
3394
3396 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3397
3398 I = MIB->getIterator();
3399
3401 for (int Idx = 0; Idx != NElts; ++Idx) {
3402 Register DstElt = MRI.createVirtualRegister(EltRC);
3403 Regs.push_back(DstElt);
3404
3405 unsigned SubIdx = SubIndices[Idx];
3406
3408 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3409 Select =
3410 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3411 .addReg(FalseReg, 0, SubIdx)
3412 .addReg(TrueReg, 0, SubIdx);
3413 } else {
3414 Select =
3415 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3416 .addReg(TrueReg, 0, SubIdx)
3417 .addReg(FalseReg, 0, SubIdx);
3418 }
3419
3420 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3422
3423 MIB.addReg(DstElt)
3424 .addImm(SubIdx);
3425 }
3426}
3427
3429 switch (MI.getOpcode()) {
3430 case AMDGPU::V_MOV_B16_t16_e32:
3431 case AMDGPU::V_MOV_B16_t16_e64:
3432 case AMDGPU::V_MOV_B32_e32:
3433 case AMDGPU::V_MOV_B32_e64:
3434 case AMDGPU::V_MOV_B64_PSEUDO:
3435 case AMDGPU::V_MOV_B64_e32:
3436 case AMDGPU::V_MOV_B64_e64:
3437 case AMDGPU::S_MOV_B32:
3438 case AMDGPU::S_MOV_B64:
3439 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3440 case AMDGPU::COPY:
3441 case AMDGPU::WWM_COPY:
3442 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3443 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3444 case AMDGPU::V_ACCVGPR_MOV_B32:
3445 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3446 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3447 return true;
3448 default:
3449 return false;
3450 }
3451}
3452
3453static constexpr AMDGPU::OpName ModifierOpNames[] = {
3454 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3455 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3456 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3457
3459 unsigned Opc = MI.getOpcode();
3460 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3461 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3462 if (Idx >= 0)
3463 MI.removeOperand(Idx);
3464 }
3465}
3466
3467std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3468 unsigned SubRegIndex) {
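 // E.g. for Imm == 0x0000000180000000, sub0 yields 0xffffffff80000000, sub1
 // yields 1, and lo16 yields 0 (each piece is sign-extended to 64 bits).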
3469 switch (SubRegIndex) {
3470 case AMDGPU::NoSubRegister:
3471 return Imm;
3472 case AMDGPU::sub0:
3473 return SignExtend64<32>(Imm);
3474 case AMDGPU::sub1:
3475 return SignExtend64<32>(Imm >> 32);
3476 case AMDGPU::lo16:
3477 return SignExtend64<16>(Imm);
3478 case AMDGPU::hi16:
3479 return SignExtend64<16>(Imm >> 16);
3480 case AMDGPU::sub1_lo16:
3481 return SignExtend64<16>(Imm >> 32);
3482 case AMDGPU::sub1_hi16:
3483 return SignExtend64<16>(Imm >> 48);
3484 default:
3485 return std::nullopt;
3486 }
3487
3488 llvm_unreachable("covered subregister switch");
3489}
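// Worked example for the extraction above: with Imm = 0x0001000200030004,
//   extractSubregFromImm(Imm, AMDGPU::sub0) == SignExtend64<32>(0x00030004)
//   extractSubregFromImm(Imm, AMDGPU::sub1) == SignExtend64<32>(0x00010002)
//   extractSubregFromImm(Imm, AMDGPU::lo16) == SignExtend64<16>(0x0004)
//   extractSubregFromImm(Imm, AMDGPU::hi16) == SignExtend64<16>(0x0003)
// and an unhandled index yields std::nullopt so callers can bail out.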
3490
3491static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3492 switch (Opc) {
3493 case AMDGPU::V_MAC_F16_e32:
3494 case AMDGPU::V_MAC_F16_e64:
3495 case AMDGPU::V_MAD_F16_e64:
3496 return AMDGPU::V_MADAK_F16;
3497 case AMDGPU::V_MAC_F32_e32:
3498 case AMDGPU::V_MAC_F32_e64:
3499 case AMDGPU::V_MAD_F32_e64:
3500 return AMDGPU::V_MADAK_F32;
3501 case AMDGPU::V_FMAC_F32_e32:
3502 case AMDGPU::V_FMAC_F32_e64:
3503 case AMDGPU::V_FMA_F32_e64:
3504 return AMDGPU::V_FMAAK_F32;
3505 case AMDGPU::V_FMAC_F16_e32:
3506 case AMDGPU::V_FMAC_F16_e64:
3507 case AMDGPU::V_FMAC_F16_t16_e64:
3508 case AMDGPU::V_FMAC_F16_fake16_e64:
3509 case AMDGPU::V_FMA_F16_e64:
3510 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3511 ? AMDGPU::V_FMAAK_F16_t16
3512 : AMDGPU::V_FMAAK_F16_fake16
3513 : AMDGPU::V_FMAAK_F16;
3514 case AMDGPU::V_FMAC_F64_e32:
3515 case AMDGPU::V_FMAC_F64_e64:
3516 case AMDGPU::V_FMA_F64_e64:
3517 return AMDGPU::V_FMAAK_F64;
3518 default:
3519 llvm_unreachable("invalid instruction");
3520 }
3521}
3522
3523static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3524 switch (Opc) {
3525 case AMDGPU::V_MAC_F16_e32:
3526 case AMDGPU::V_MAC_F16_e64:
3527 case AMDGPU::V_MAD_F16_e64:
3528 return AMDGPU::V_MADMK_F16;
3529 case AMDGPU::V_MAC_F32_e32:
3530 case AMDGPU::V_MAC_F32_e64:
3531 case AMDGPU::V_MAD_F32_e64:
3532 return AMDGPU::V_MADMK_F32;
3533 case AMDGPU::V_FMAC_F32_e32:
3534 case AMDGPU::V_FMAC_F32_e64:
3535 case AMDGPU::V_FMA_F32_e64:
3536 return AMDGPU::V_FMAMK_F32;
3537 case AMDGPU::V_FMAC_F16_e32:
3538 case AMDGPU::V_FMAC_F16_e64:
3539 case AMDGPU::V_FMAC_F16_t16_e64:
3540 case AMDGPU::V_FMAC_F16_fake16_e64:
3541 case AMDGPU::V_FMA_F16_e64:
3542 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3543 ? AMDGPU::V_FMAMK_F16_t16
3544 : AMDGPU::V_FMAMK_F16_fake16
3545 : AMDGPU::V_FMAMK_F16;
3546 case AMDGPU::V_FMAC_F64_e32:
3547 case AMDGPU::V_FMAC_F64_e64:
3548 case AMDGPU::V_FMA_F64_e64:
3549 return AMDGPU::V_FMAMK_F64;
3550 default:
3551 llvm_unreachable("invalid instruction");
3552 }
3553}
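// A brief note on the two mappings above: the *AK forms take the literal as
// the final addend (v_fmaak_f32 dst, src0, src1, K computes
// dst = src0 * src1 + K), while the *MK forms take it as the multiplier
// (v_fmamk_f32 dst, src0, K, src1 computes dst = src0 * K + src1).
// foldImmediate() below picks the AK variant when the constant feeds src2 and
// the MK variant when it feeds src0 or src1.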
3554
3555bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3556 Register Reg, MachineRegisterInfo *MRI) const {
3557 int64_t Imm;
3558 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3559 return false;
3560
3561 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3562
3563 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3564
3565 unsigned Opc = UseMI.getOpcode();
3566 if (Opc == AMDGPU::COPY) {
3567 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3568
3569 Register DstReg = UseMI.getOperand(0).getReg();
3570 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3571
3572 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3573
3574 if (HasMultipleUses) {
3575 // TODO: This should fold in more cases with multiple uses, but we need to
3576 // consider more carefully what those uses are.
3577 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3578
3579 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3580 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3581 return false;
3582
3583 // Most of the time folding a 32-bit inline constant is free (though this
3584 // might not be true if we can't later fold it into a real user).
3585 //
3586 // FIXME: This isInlineConstant check is imprecise if
3587 // getConstValDefinedInReg handled the tricky non-mov cases.
3588 if (ImmDefSize == 32 &&
3589 isInlineConstant(Imm, AMDGPU::OPERAND_REG_IMM_INT32))
3590 return false;
3591 }
3592
3593 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3594 RI.getSubRegIdxSize(UseSubReg) == 16;
3595
3596 if (Is16Bit) {
3597 if (RI.hasVGPRs(DstRC))
3598 return false; // Do not clobber vgpr_hi16
3599
3600 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3601 return false;
3602 }
3603
3604 MachineFunction *MF = UseMI.getMF();
3605
3606 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3607 MCRegister MovDstPhysReg =
3608 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3609
3610 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3611
3612 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3613 for (unsigned MovOp :
3614 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3615 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3616 const MCInstrDesc &MovDesc = get(MovOp);
3617
3618 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI);
3619 if (Is16Bit) {
3620 // We just need to find a correctly sized register class, so the
3621 // subregister index compatibility doesn't matter since we're statically
3622 // extracting the immediate value.
3623 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3624 if (!MovDstRC)
3625 continue;
3626
3627 if (MovDstPhysReg) {
3628 // FIXME: We probably should not do this. If there is a live value in
3629 // the high half of the register, it will be corrupted.
3630 MovDstPhysReg =
3631 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3632 if (!MovDstPhysReg)
3633 continue;
3634 }
3635 }
3636
3637 // Result class isn't the right size, try the next instruction.
3638 if (MovDstPhysReg) {
3639 if (!MovDstRC->contains(MovDstPhysReg))
3640 return false;
3641 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3642 // TODO: This will be overly conservative in the case of 16-bit virtual
3643 // SGPRs. We could hack up the virtual register uses to use a compatible
3644 // 32-bit class.
3645 continue;
3646 }
3647
3648 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3649
3650 // Ensure the interpreted immediate value is a valid operand in the new
3651 // mov.
3652 //
3653 // FIXME: isImmOperandLegal should have a form that doesn't require an
3654 // existing MachineInstr or MachineOperand.
3655 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3656 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3657 break;
3658
3659 NewOpc = MovOp;
3660 break;
3661 }
3662
3663 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3664 return false;
3665
3666 if (Is16Bit) {
3667 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3668 if (MovDstPhysReg)
3669 UseMI.getOperand(0).setReg(MovDstPhysReg);
3670 assert(UseMI.getOperand(1).getReg().isVirtual());
3671 }
3672
3673 const MCInstrDesc &NewMCID = get(NewOpc);
3674 UseMI.setDesc(NewMCID);
3675 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3676 UseMI.addImplicitDefUseOperands(*MF);
3677 return true;
3678 }
3679
3680 if (HasMultipleUses)
3681 return false;
3682
3683 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3684 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3685 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3686 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3687 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3688 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3689 Opc == AMDGPU::V_FMAC_F64_e64) {
3690 // Don't fold if we are using source or output modifiers. The new VOP2
3691 // instructions don't have them.
3692 if (hasAnyModifiersSet(UseMI))
3693 return false;
3694
3695 // If this is a free constant, there's no reason to do this.
3696 // TODO: We could fold this here instead of letting SIFoldOperands do it
3697 // later.
3698 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3699
3700 // Any src operand can be used for the legality check.
3701 if (isInlineConstant(UseMI, Src0Idx, Imm))
3702 return false;
3703
3704 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3705
3706 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3707 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3708
3709 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3710 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3711 (Src1->isReg() && Src1->getReg() == Reg)) {
3712 MachineOperand *RegSrc =
3713 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3714 if (!RegSrc->isReg())
3715 return false;
3716 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3717 ST.getConstantBusLimit(Opc) < 2)
3718 return false;
3719
3720 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3721 return false;
3722
3723 // If src2 is also a literal constant then we have to choose which one to
3724 // fold. In general it is better to choose madak so that the other literal
3725 // can be materialized in an sgpr instead of a vgpr:
3726 // s_mov_b32 s0, literal
3727 // v_madak_f32 v0, s0, v0, literal
3728 // Instead of:
3729 // v_mov_b32 v1, literal
3730 // v_madmk_f32 v0, v0, literal, v1
3731 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3732 if (Def && Def->isMoveImmediate() &&
3733 !isInlineConstant(Def->getOperand(1)))
3734 return false;
3735
3736 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3737 if (pseudoToMCOpcode(NewOpc) == -1)
3738 return false;
3739
3740 // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3741 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3742 // restricting their register classes. For now just bail out.
3743 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3744 NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3745 return false;
3746
3747 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3748 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3749
3750 // FIXME: This would be a lot easier if we could return a new instruction
3751 // instead of having to modify in place.
3752
3753 Register SrcReg = RegSrc->getReg();
3754 unsigned SrcSubReg = RegSrc->getSubReg();
3755 Src0->setReg(SrcReg);
3756 Src0->setSubReg(SrcSubReg);
3757 Src0->setIsKill(RegSrc->isKill());
3758
3759 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3760 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3761 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3762 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3763 UseMI.untieRegOperand(
3764 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3765
3766 Src1->ChangeToImmediate(*SubRegImm);
3767
3768 removeModOperands(UseMI);
3769 UseMI.setDesc(get(NewOpc));
3770
3771 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3772 if (DeleteDef)
3773 DefMI.eraseFromParent();
3774
3775 return true;
3776 }
3777
3778 // Added part is the constant: Use v_madak_{f16, f32}.
3779 if (Src2->isReg() && Src2->getReg() == Reg) {
3780 if (ST.getConstantBusLimit(Opc) < 2) {
3781 // Not allowed to use constant bus for another operand.
3782 // We can however allow an inline immediate as src0.
3783 bool Src0Inlined = false;
3784 if (Src0->isReg()) {
3785 // Try to inline constant if possible.
3786 // If the def is a move-immediate and the use is the only one,
3787 // we save a VGPR here.
3788 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3789 if (Def && Def->isMoveImmediate() &&
3790 isInlineConstant(Def->getOperand(1)) &&
3791 MRI->hasOneUse(Src0->getReg())) {
3792 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3793 Src0Inlined = true;
3794 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3795 RI.isSGPRReg(*MRI, Src0->getReg())) {
3796 return false;
3797 }
3798 // VGPR is okay as Src0 - fallthrough
3799 }
3800
3801 if (Src1->isReg() && !Src0Inlined) {
3802 // We have one slot for inlinable constant so far - try to fill it
3803 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3804 if (Def && Def->isMoveImmediate() &&
3805 isInlineConstant(Def->getOperand(1)) &&
3806 MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3807 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3808 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3809 return false;
3810 // VGPR is okay as Src1 - fallthrough
3811 }
3812 }
3813
3814 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3815 if (pseudoToMCOpcode(NewOpc) == -1)
3816 return false;
3817
3818 // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3819 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3820 // restricting their register classes. For now just bail out.
3821 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3822 NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3823 return false;
3824
3825 // FIXME: This would be a lot easier if we could return a new instruction
3826 // instead of having to modify in place.
3827
3828 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3829 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3830 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3831 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3832 UseMI.untieRegOperand(
3833 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3834
3835 const std::optional<int64_t> SubRegImm =
3836 extractSubregFromImm(Imm, Src2->getSubReg());
3837
3838 // ChangingToImmediate adds Src2 back to the instruction.
3839 Src2->ChangeToImmediate(*SubRegImm);
3840
3841 // These come before src2.
3842 removeModOperands(UseMI);
3843 UseMI.setDesc(get(NewOpc));
3844 // It might happen that UseMI was commuted and we now have an SGPR as
3845 // SRC1. If so, the combination of two inline constants and an SGPR is
3846 // illegal, so relegalize the operands.
3847 legalizeOperands(UseMI);
3848
3849 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3850 if (DeleteDef)
3851 DefMI.eraseFromParent();
3852
3853 return true;
3854 }
3855 }
3856
3857 return false;
3858}
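// Sketch of the madak-style rewrite performed above (virtual register names
// invented for illustration):
//   %k:vgpr_32 = V_MOV_B32_e32 0x41200000, implicit $exec
//   %d:vgpr_32 = V_FMA_F32_e64 0, %a, 0, %b, 0, %k, 0, 0, implicit $mode, implicit $exec
// becomes
//   %d:vgpr_32 = V_FMAAK_F32 %a, %b, 0x41200000, implicit $mode, implicit $exec
// and the defining V_MOV is erased once use_nodbg_empty(%k) holds. The COPY
// path at the top instead rewrites the copy itself into the cheapest legal
// s_mov/v_mov of the (possibly subregister-extracted) immediate.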
3859
3860static bool
3861memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3862 ArrayRef<const MachineOperand *> BaseOps2) {
3863 if (BaseOps1.size() != BaseOps2.size())
3864 return false;
3865 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3866 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3867 return false;
3868 }
3869 return true;
3870}
3871
3872static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3873 LocationSize WidthB, int OffsetB) {
3874 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3875 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3876 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3877 return LowWidth.hasValue() &&
3878 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3879}
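// Example for the helper above: an access of width 4 at offset 0 and an
// access of width 8 at offset 4 do not overlap, since 0 + 4 <= 4; the result
// is symmetric because the lower-offset access is always the one tested
// against the higher offset.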
3880
3881bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3882 const MachineInstr &MIb) const {
3883 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3884 int64_t Offset0, Offset1;
3885 LocationSize Dummy0 = LocationSize::precise(0);
3886 LocationSize Dummy1 = LocationSize::precise(0);
3887 bool Offset0IsScalable, Offset1IsScalable;
3888 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3889 Dummy0, &RI) ||
3890 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3891 Dummy1, &RI))
3892 return false;
3893
3894 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3895 return false;
3896
3897 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3898 // FIXME: Handle ds_read2 / ds_write2.
3899 return false;
3900 }
3901 LocationSize Width0 = MIa.memoperands().front()->getSize();
3902 LocationSize Width1 = MIb.memoperands().front()->getSize();
3903 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3904}
3905
3906bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3907 const MachineInstr &MIb) const {
3908 assert(MIa.mayLoadOrStore() &&
3909 "MIa must load from or modify a memory location");
3910 assert(MIb.mayLoadOrStore() &&
3911 "MIb must load from or modify a memory location");
3912
3913 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3914 return false;
3915
3916 // XXX - Can we relax this between address spaces?
3917 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3918 return false;
3919
3920 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3921 return false;
3922
3923 // TODO: Should we check the address space from the MachineMemOperand? That
3924 // would allow us to distinguish objects we know don't alias based on the
3925 // underlying address space, even if it was lowered to a different one,
3926 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3927 // buffer.
3928 if (isDS(MIa)) {
3929 if (isDS(MIb))
3930 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3931
3932 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3933 }
3934
3935 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3936 if (isMUBUF(MIb) || isMTBUF(MIb))
3937 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3938
3939 if (isFLAT(MIb))
3940 return isFLATScratch(MIb);
3941
3942 return !isSMRD(MIb);
3943 }
3944
3945 if (isSMRD(MIa)) {
3946 if (isSMRD(MIb))
3947 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3948
3949 if (isFLAT(MIb))
3950 return isFLATScratch(MIb);
3951
3952 return !isMUBUF(MIb) && !isMTBUF(MIb);
3953 }
3954
3955 if (isFLAT(MIa)) {
3956 if (isFLAT(MIb)) {
3957 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3958 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3959 return true;
3960
3961 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3962 }
3963
3964 return false;
3965 }
3966
3967 return false;
3968}
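// The logic above only proves disjointness structurally: same-kind pairs
// (DS/DS, buffer/buffer, SMRD/SMRD, FLAT/FLAT) defer to the offset-overlap
// check, segment-specific FLAT (global/scratch) can never alias LDS, and a
// global FLAT access can never alias a scratch FLAT access. Any pairing not
// explicitly proven disjoint conservatively falls through to "may alias".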
3969
3970static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3971 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3972 if (Reg.isPhysical())
3973 return false;
3974 auto *Def = MRI.getUniqueVRegDef(Reg);
3975 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3976 Imm = Def->getOperand(1).getImm();
3977 if (DefMI)
3978 *DefMI = Def;
3979 return true;
3980 }
3981 return false;
3982}
3983
3984static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3985 MachineInstr **DefMI = nullptr) {
3986 if (!MO->isReg())
3987 return false;
3988 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3989 const MachineRegisterInfo &MRI = MF->getRegInfo();
3990 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3991}
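// Example use of the two getFoldableImm() helpers (sketch): if the operand's
// register is defined by "%k:sreg_32 = S_MOV_B32 64", they return true, set
// Imm to 64 and, when a DefMI out-pointer is passed, point it at the S_MOV so
// convertToThreeAddress() below can retire the def via killDef() once it
// becomes dead.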
3992
3993static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3994 MachineInstr &NewMI) {
3995 if (LV) {
3996 unsigned NumOps = MI.getNumOperands();
3997 for (unsigned I = 1; I < NumOps; ++I) {
3998 MachineOperand &Op = MI.getOperand(I);
3999 if (Op.isReg() && Op.isKill())
4000 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4001 }
4002 }
4003}
4004
4005static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4006 switch (Opc) {
4007 case AMDGPU::V_MAC_F16_e32:
4008 case AMDGPU::V_MAC_F16_e64:
4009 return AMDGPU::V_MAD_F16_e64;
4010 case AMDGPU::V_MAC_F32_e32:
4011 case AMDGPU::V_MAC_F32_e64:
4012 return AMDGPU::V_MAD_F32_e64;
4013 case AMDGPU::V_MAC_LEGACY_F32_e32:
4014 case AMDGPU::V_MAC_LEGACY_F32_e64:
4015 return AMDGPU::V_MAD_LEGACY_F32_e64;
4016 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4017 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4018 return AMDGPU::V_FMA_LEGACY_F32_e64;
4019 case AMDGPU::V_FMAC_F16_e32:
4020 case AMDGPU::V_FMAC_F16_e64:
4021 case AMDGPU::V_FMAC_F16_t16_e64:
4022 case AMDGPU::V_FMAC_F16_fake16_e64:
4023 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4024 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4025 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4026 : AMDGPU::V_FMA_F16_gfx9_e64;
4027 case AMDGPU::V_FMAC_F32_e32:
4028 case AMDGPU::V_FMAC_F32_e64:
4029 return AMDGPU::V_FMA_F32_e64;
4030 case AMDGPU::V_FMAC_F64_e32:
4031 case AMDGPU::V_FMAC_F64_e64:
4032 return AMDGPU::V_FMA_F64_e64;
4033 default:
4034 llvm_unreachable("invalid instruction");
4035 }
4036}
4037
4038MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4039 LiveVariables *LV,
4040 LiveIntervals *LIS) const {
4041 MachineBasicBlock &MBB = *MI.getParent();
4042 unsigned Opc = MI.getOpcode();
4043
4044 // Handle MFMA.
4045 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4046 if (NewMFMAOpc != -1) {
4047 MachineInstrBuilder MIB =
4048 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4049 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4050 MIB.add(MI.getOperand(I));
4051 updateLiveVariables(LV, MI, *MIB);
4052 if (LIS) {
4053 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4054 // SlotIndex of defs needs to be updated when converting to early-clobber
4055 MachineOperand &Def = MIB->getOperand(0);
4056 if (Def.isEarlyClobber() && Def.isReg() &&
4057 LIS->hasInterval(Def.getReg())) {
4058 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
4059 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
4060 auto &LI = LIS->getInterval(Def.getReg());
4061 auto UpdateDefIndex = [&](LiveRange &LR) {
4062 auto *S = LR.find(OldIndex);
4063 if (S != LR.end() && S->start == OldIndex) {
4064 assert(S->valno && S->valno->def == OldIndex);
4065 S->start = NewIndex;
4066 S->valno->def = NewIndex;
4067 }
4068 };
4069 UpdateDefIndex(LI);
4070 for (auto &SR : LI.subranges())
4071 UpdateDefIndex(SR);
4072 }
4073 }
4074 return MIB;
4075 }
4076
4077 if (SIInstrInfo::isWMMA(MI)) {
4078 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4079 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4080 .setMIFlags(MI.getFlags());
4081 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4082 MIB->addOperand(MI.getOperand(I));
4083
4084 updateLiveVariables(LV, MI, *MIB);
4085 if (LIS)
4086 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4087
4088 return MIB;
4089 }
4090
4091 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4092 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4093 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4094 "present pre-RA");
4095
4096 // Handle MAC/FMAC.
4097 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4098 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4099 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4100 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4101 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4102 bool Src0Literal = false;
4103
4104 switch (Opc) {
4105 default:
4106 return nullptr;
4107 case AMDGPU::V_MAC_F16_e64:
4108 case AMDGPU::V_FMAC_F16_e64:
4109 case AMDGPU::V_FMAC_F16_t16_e64:
4110 case AMDGPU::V_FMAC_F16_fake16_e64:
4111 case AMDGPU::V_MAC_F32_e64:
4112 case AMDGPU::V_MAC_LEGACY_F32_e64:
4113 case AMDGPU::V_FMAC_F32_e64:
4114 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4115 case AMDGPU::V_FMAC_F64_e64:
4116 break;
4117 case AMDGPU::V_MAC_F16_e32:
4118 case AMDGPU::V_FMAC_F16_e32:
4119 case AMDGPU::V_MAC_F32_e32:
4120 case AMDGPU::V_MAC_LEGACY_F32_e32:
4121 case AMDGPU::V_FMAC_F32_e32:
4122 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4123 case AMDGPU::V_FMAC_F64_e32: {
4124 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4125 AMDGPU::OpName::src0);
4126 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4127 if (!Src0->isReg() && !Src0->isImm())
4128 return nullptr;
4129
4130 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4131 Src0Literal = true;
4132
4133 break;
4134 }
4135 }
4136
4137 MachineInstrBuilder MIB;
4138 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4139 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4140 const MachineOperand *Src0Mods =
4141 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4142 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4143 const MachineOperand *Src1Mods =
4144 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4145 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4146 const MachineOperand *Src2Mods =
4147 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4148 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4149 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4150 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4151
4152 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4153 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4154 // If we have an SGPR input, we will violate the constant bus restriction.
4155 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4156 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4157 MachineInstr *DefMI;
4158 const auto killDef = [&]() -> void {
4159 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4160 // The only user is the instruction which will be killed.
4161 Register DefReg = DefMI->getOperand(0).getReg();
4162
4163 if (MRI.hasOneNonDBGUse(DefReg)) {
4164 // We cannot just remove the DefMI here; the calling pass would crash.
4165 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
4166 DefMI->getOperand(0).setIsDead(true);
4167 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
4168 DefMI->removeOperand(I);
4169 if (LV)
4170 LV->getVarInfo(DefReg).AliveBlocks.clear();
4171 }
4172
4173 if (LIS) {
4174 LiveInterval &DefLI = LIS->getInterval(DefReg);
4175
4176 // We cannot delete the original instruction here, so hack out the use
4177 // in the original instruction with a dummy register so we can use
4178 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4179 // not have the complexity of deleting a use to consider here.
4180 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4181 for (MachineOperand &MIOp : MI.uses()) {
4182 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4183 MIOp.setIsUndef(true);
4184 MIOp.setReg(DummyReg);
4185 }
4186 }
4187
4188 LIS->shrinkToUses(&DefLI);
4189 }
4190 };
4191
4192 int64_t Imm;
4193 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4194 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4195 if (pseudoToMCOpcode(NewOpc) != -1) {
4196 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4197 .add(*Dst)
4198 .add(*Src0)
4199 .add(*Src1)
4200 .addImm(Imm)
4201 .setMIFlags(MI.getFlags());
4202 updateLiveVariables(LV, MI, *MIB);
4203 if (LIS)
4204 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4205 killDef();
4206 return MIB;
4207 }
4208 }
4209 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4210 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4211 if (pseudoToMCOpcode(NewOpc) != -1) {
4212 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4213 .add(*Dst)
4214 .add(*Src0)
4215 .addImm(Imm)
4216 .add(*Src2)
4217 .setMIFlags(MI.getFlags());
4218 updateLiveVariables(LV, MI, *MIB);
4219
4220 if (LIS)
4221 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4222 killDef();
4223 return MIB;
4224 }
4225 }
4226 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4227 if (Src0Literal) {
4228 Imm = Src0->getImm();
4229 DefMI = nullptr;
4230 }
4231 if (pseudoToMCOpcode(NewOpc) != -1 &&
4232 isOperandLegal(
4233 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4234 Src1)) {
4235 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4236 .add(*Dst)
4237 .add(*Src1)
4238 .addImm(Imm)
4239 .add(*Src2)
4240 .setMIFlags(MI.getFlags());
4241 updateLiveVariables(LV, MI, *MIB);
4242
4243 if (LIS)
4244 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4245 if (DefMI)
4246 killDef();
4247 return MIB;
4248 }
4249 }
4250 }
4251
4252 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4253 // if VOP3 does not allow a literal operand.
4254 if (Src0Literal && !ST.hasVOP3Literal())
4255 return nullptr;
4256
4257 unsigned NewOpc = getNewFMAInst(ST, Opc);
4258
4259 if (pseudoToMCOpcode(NewOpc) == -1)
4260 return nullptr;
4261
4262 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4263 .add(*Dst)
4264 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4265 .add(*Src0)
4266 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4267 .add(*Src1)
4268 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4269 .add(*Src2)
4270 .addImm(Clamp ? Clamp->getImm() : 0)
4271 .addImm(Omod ? Omod->getImm() : 0)
4272 .setMIFlags(MI.getFlags());
4273 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4274 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4275 updateLiveVariables(LV, MI, *MIB);
4276 if (LIS)
4277 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4278 return MIB;
4279}
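// Illustrative conversion performed above: a two-address MAC/FMAC such as
//   v_fmac_f32 v0, v1, v2     // v0 is both the accumulator input and the result
// is rebuilt either as v_fmaak/v_fmamk when one input is a foldable literal,
// or as the untied three-address form
//   v_fma_f32 v0, v1, v2, v0
// (with explicit modifier/clamp/omod operands in the VOP3 encoding), which
// removes the tied-operand constraint between the destination and src2.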
4280
4281// It's not generally safe to move VALU instructions across these since it will
4282// start using the register as a base index rather than directly.
4283// XXX - Why isn't hasSideEffects sufficient for these?
4284static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4285 switch (MI.getOpcode()) {
4286 case AMDGPU::S_SET_GPR_IDX_ON:
4287 case AMDGPU::S_SET_GPR_IDX_MODE:
4288 case AMDGPU::S_SET_GPR_IDX_OFF:
4289 return true;
4290 default:
4291 return false;
4292 }
4293}
4294
4295bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4296 const MachineBasicBlock *MBB,
4297 const MachineFunction &MF) const {
4298 // Skipping the check for SP writes in the base implementation. It was
4299 // apparently added due to compile-time concerns.
4300 //
4301 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4302 // but is probably avoidable.
4303
4304 // Copied from base implementation.
4305 // Terminators and labels can't be scheduled around.
4306 if (MI.isTerminator() || MI.isPosition())
4307 return true;
4308
4309 // INLINEASM_BR can jump to another block
4310 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4311 return true;
4312
4313 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4314 return true;
4315
4316 // Target-independent instructions do not have an implicit-use of EXEC, even
4317 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4318 // boundaries prevents incorrect movements of such instructions.
4319 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4320 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4321 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4322 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4323 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4324 changesVGPRIndexingMode(MI);
4325}
4326
4327bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4328 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4329 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4330 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4331}
4332
4333bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
4334 if (!isFLAT(MI) || isFLATGlobal(MI))
4335 return false;
4336
4337 // If scratch is not initialized, we can never access it.
4338 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4339 return false;
4340
4341 // SCRATCH instructions always access scratch.
4342 if (isFLATScratch(MI))
4343 return true;
4344
4345 // If there are no memory operands then conservatively assume the flat
4346 // operation may access scratch.
4347 if (MI.memoperands_empty())
4348 return true;
4349
4350 // See if any memory operand specifies an address space that involves scratch.
4351 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4352 unsigned AS = Memop->getAddrSpace();
4353 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4354 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4355 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4356 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4357 }
4358 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4359 });
4360}
4361
4362bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4363 // Skip the full operand and register alias search modifiesRegister
4364 // does. There's only a handful of instructions that touch this, it's only an
4365 // implicit def, and doesn't alias any other registers.
4366 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4367}
4368
4369bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4370 unsigned Opcode = MI.getOpcode();
4371
4372 if (MI.mayStore() && isSMRD(MI))
4373 return true; // scalar store or atomic
4374
4375 // This will terminate the function when other lanes may need to continue.
4376 if (MI.isReturn())
4377 return true;
4378
4379 // These instructions cause shader I/O that may cause hardware lockups
4380 // when executed with an empty EXEC mask.
4381 //
4382 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4383 // EXEC = 0, but checking for that case here seems not worth it
4384 // given the typical code patterns.
4385 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4386 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4387 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4388 return true;
4389
4390 if (MI.isCall() || MI.isInlineAsm())
4391 return true; // conservative assumption
4392
4393 // Assume that barrier interactions are only intended with active lanes.
4394 if (isBarrier(Opcode))
4395 return true;
4396
4397 // A mode change is a scalar operation that influences vector instructions.
4399 return true;
4400
4401 // These are like SALU instructions in terms of effects, so it's questionable
4402 // whether we should return true for those.
4403 //
4404 // However, executing them with EXEC = 0 causes them to operate on undefined
4405 // data, which we avoid by returning true here.
4406 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4407 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4408 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4409 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4410 return true;
4411
4412 return false;
4413}
4414
4415bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4416 const MachineInstr &MI) const {
4417 if (MI.isMetaInstruction())
4418 return false;
4419
4420 // This won't read exec if this is an SGPR->SGPR copy.
4421 if (MI.isCopyLike()) {
4422 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4423 return true;
4424
4425 // Make sure this isn't copying exec as a normal operand
4426 return MI.readsRegister(AMDGPU::EXEC, &RI);
4427 }
4428
4429 // Make a conservative assumption about the callee.
4430 if (MI.isCall())
4431 return true;
4432
4433 // Be conservative with any unhandled generic opcodes.
4434 if (!isTargetSpecificOpcode(MI.getOpcode()))
4435 return true;
4436
4437 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4438}
4439
4440bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4441 switch (Imm.getBitWidth()) {
4442 case 1: // This likely will be a condition code mask.
4443 return true;
4444
4445 case 32:
4446 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4447 ST.hasInv2PiInlineImm());
4448 case 64:
4449 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4450 ST.hasInv2PiInlineImm());
4451 case 16:
4452 return ST.has16BitInsts() &&
4453 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4454 ST.hasInv2PiInlineImm());
4455 default:
4456 llvm_unreachable("invalid bitwidth");
4457 }
4458}
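// For reference: the 32-bit inline-constant set checked above consists of the
// integers -16..64 plus a small set of FP bit patterns (+/-0.5, +/-1.0,
// +/-2.0, +/-4.0, and 1/(2*pi) when the subtarget supports it), so e.g.
// APInt(32, 64) is inline while APInt(32, 65) must be emitted as a literal.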
4459
4460bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4461 APInt IntImm = Imm.bitcastToAPInt();
4462 int64_t IntImmVal = IntImm.getSExtValue();
4463 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4464 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4465 default:
4466 llvm_unreachable("invalid fltSemantics");
4467 case APFloat::S_IEEEsingle:
4468 case APFloat::S_IEEEdouble:
4469 return isInlineConstant(IntImm);
4470 case APFloat::S_BFloat:
4471 return ST.has16BitInsts() &&
4472 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4473 case APFloat::S_IEEEhalf:
4474 return ST.has16BitInsts() &&
4475 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4476 }
4477}
4478
4479bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4480 // MachineOperand provides no way to tell the true operand size, since it only
4481 // records a 64-bit value. We need to know the size to determine if a 32-bit
4482 // floating point immediate bit pattern is legal for an integer immediate. It
4483 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4484 switch (OperandType) {
4494 int32_t Trunc = static_cast<int32_t>(Imm);
4495 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4496 }
4502 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4505 // We would expect inline immediates to not be concerned with an integer/fp
4506 // distinction. However, in the case of 16-bit integer operations, the
4507 // "floating point" values appear to not work. It seems read the low 16-bits
4508 // of 32-bit immediates, which happens to always work for the integer
4509 // values.
4510 //
4511 // See llvm bugzilla 46302.
4512 //
4513 // TODO: Theoretically we could use op-sel to use the high bits of the
4514 // 32-bit FP values.
4526 return false;
4529 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4530 // A few special case instructions have 16-bit operands on subtargets
4531 // where 16-bit instructions are not legal.
4532 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4533 // constants in these cases
4534 int16_t Trunc = static_cast<int16_t>(Imm);
4535 return ST.has16BitInsts() &&
4536 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4537 }
4538
4539 return false;
4540 }
4543 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4544 int16_t Trunc = static_cast<int16_t>(Imm);
4545 return ST.has16BitInsts() &&
4546 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4547 }
4548 return false;
4549 }
4553 return false;
4555 return isLegalAV64PseudoImm(Imm);
4558 // Always embedded in the instruction for free.
4559 return true;
4569 // Just ignore anything else.
4570 return true;
4571 default:
4572 llvm_unreachable("invalid operand type");
4573 }
4574}
4575
4576static bool compareMachineOp(const MachineOperand &Op0,
4577 const MachineOperand &Op1) {
4578 if (Op0.getType() != Op1.getType())
4579 return false;
4580
4581 switch (Op0.getType()) {
4582 case MachineOperand::MO_Register:
4583 return Op0.getReg() == Op1.getReg();
4584 case MachineOperand::MO_Immediate:
4585 return Op0.getImm() == Op1.getImm();
4586 default:
4587 llvm_unreachable("Didn't expect to be comparing these operand types");
4588 }
4589}
4590
4591bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4592 const MCOperandInfo &OpInfo) const {
4593 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4594 return true;
4595
4596 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4597 return false;
4598
4599 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4600 return true;
4601
4602 return ST.hasVOP3Literal();
4603}
4604
4605bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4606 int64_t ImmVal) const {
4607 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4608 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4609 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4610 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4611 AMDGPU::OpName::src2))
4612 return false;
4613 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4614 }
4615
4616 return isLiteralOperandLegal(InstDesc, OpInfo);
4617}
4618
4619bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4620 const MachineOperand &MO) const {
4621 if (MO.isImm())
4622 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4623
4624 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4625 "unexpected imm-like operand kind");
4626 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4627 return isLiteralOperandLegal(InstDesc, OpInfo);
4628}
4629
4630bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
4631 // 2 32-bit inline constants packed into one.
4632 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4633 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4634}
4635
4636bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4637 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4638 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4639 return false;
4640
4641 int Op32 = AMDGPU::getVOPe32(Opcode);
4642 if (Op32 == -1)
4643 return false;
4644
4645 return pseudoToMCOpcode(Op32) != -1;
4646}
4647
4648bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4649 // The src0_modifier operand is present on all instructions
4650 // that have modifiers.
4651
4652 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4653}
4654
4655bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4656 AMDGPU::OpName OpName) const {
4657 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4658 return Mods && Mods->getImm();
4659}
4660
4661bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4662 return any_of(ModifierOpNames,
4663 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4664}
4665
4666bool SIInstrInfo::canShrink(const MachineInstr &MI,
4667 const MachineRegisterInfo &MRI) const {
4668 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4669 // Can't shrink instruction with three operands.
4670 if (Src2) {
4671 switch (MI.getOpcode()) {
4672 default: return false;
4673
4674 case AMDGPU::V_ADDC_U32_e64:
4675 case AMDGPU::V_SUBB_U32_e64:
4676 case AMDGPU::V_SUBBREV_U32_e64: {
4677 const MachineOperand *Src1
4678 = getNamedOperand(MI, AMDGPU::OpName::src1);
4679 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4680 return false;
4681 // Additional verification is needed for sdst/src2.
4682 return true;
4683 }
4684 case AMDGPU::V_MAC_F16_e64:
4685 case AMDGPU::V_MAC_F32_e64:
4686 case AMDGPU::V_MAC_LEGACY_F32_e64:
4687 case AMDGPU::V_FMAC_F16_e64:
4688 case AMDGPU::V_FMAC_F16_t16_e64:
4689 case AMDGPU::V_FMAC_F16_fake16_e64:
4690 case AMDGPU::V_FMAC_F32_e64:
4691 case AMDGPU::V_FMAC_F64_e64:
4692 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4693 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4694 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4695 return false;
4696 break;
4697
4698 case AMDGPU::V_CNDMASK_B32_e64:
4699 break;
4700 }
4701 }
4702
4703 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4704 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4705 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4706 return false;
4707
4708 // We don't need to check src0, all input types are legal, so just make sure
4709 // src0 isn't using any modifiers.
4710 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4711 return false;
4712
4713 // Can it be shrunk to a valid 32 bit opcode?
4714 if (!hasVALU32BitEncoding(MI.getOpcode()))
4715 return false;
4716
4717 // Check output modifiers
4718 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4719 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4720 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4721 // TODO: Can we avoid checking bound_ctrl/fi here?
4722 // They are only used by permlane*_swap special case.
4723 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4724 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4725}
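// Example for canShrink(): a VOP3 "V_ADD_F32_e64 dst, src0, src1" with no
// source modifiers, clamp or omod and a VGPR src1 can be shrunk to the VOP2
// encoding V_ADD_F32_e32, while the same instruction with omod set or with an
// SGPR in src1 has to stay in the 64-bit encoding.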
4726
4727// Set VCC operand with all flags from \p Orig, except for setting it as
4728// implicit.
4729static void copyFlagsToImplicitVCC(MachineInstr &MI,
4730 const MachineOperand &Orig) {
4731
4732 for (MachineOperand &Use : MI.implicit_operands()) {
4733 if (Use.isUse() &&
4734 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4735 Use.setIsUndef(Orig.isUndef());
4736 Use.setIsKill(Orig.isKill());
4737 return;
4738 }
4739 }
4740}
4741
4742MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4743 unsigned Op32) const {
4744 MachineBasicBlock *MBB = MI.getParent();
4745
4746 const MCInstrDesc &Op32Desc = get(Op32);
4747 MachineInstrBuilder Inst32 =
4748 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4749 .setMIFlags(MI.getFlags());
4750
4751 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4752 // For VOPC instructions, this is replaced by an implicit def of vcc.
4753
4754 // We assume the defs of the shrunk opcode are in the same order, and the
4755 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4756 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4757 Inst32.add(MI.getOperand(I));
4758
4759 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4760
4761 int Idx = MI.getNumExplicitDefs();
4762 for (const MachineOperand &Use : MI.explicit_uses()) {
4763 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4764 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4765 continue;
4766
4767 if (&Use == Src2) {
4768 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4769 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4770 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4771 // of vcc was already added during the initial BuildMI, but we
4772 // 1) may need to change vcc to vcc_lo to preserve the original register
4773 // 2) have to preserve the original flags.
4774 copyFlagsToImplicitVCC(*Inst32, *Src2);
4775 continue;
4776 }
4777 }
4778
4779 Inst32.add(Use);
4780 }
4781
4782 // FIXME: Losing implicit operands
4783 fixImplicitOperands(*Inst32);
4784 return Inst32;
4785}
4786
4787bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
4788 // Null is free
4789 Register Reg = RegOp.getReg();
4790 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
4791 return false;
4792
4793 // SGPRs use the constant bus
4794
4795 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
4796 // physical register operands should also count, except for exec.
4797 if (RegOp.isImplicit())
4798 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
4799
4800 // SGPRs use the constant bus
4801 return AMDGPU::SReg_32RegClass.contains(Reg) ||
4802 AMDGPU::SReg_64RegClass.contains(Reg);
4803}
4804
4806 const MachineRegisterInfo &MRI) const {
4807 Register Reg = RegOp.getReg();
4808 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4809 : physRegUsesConstantBus(RegOp);
4810}
4811
4812bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4813 const MachineOperand &MO,
4814 const MCOperandInfo &OpInfo) const {
4815 // Literal constants use the constant bus.
4816 if (!MO.isReg())
4817 return !isInlineConstant(MO, OpInfo);
4818
4819 Register Reg = MO.getReg();
4820 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4821 : physRegUsesConstantBus(MO);
4822}
4823
4824static Register findImplicitSGPRRead(const MachineInstr &MI) {
4825 for (const MachineOperand &MO : MI.implicit_operands()) {
4826 // We only care about reads.
4827 if (MO.isDef())
4828 continue;
4829
4830 switch (MO.getReg()) {
4831 case AMDGPU::VCC:
4832 case AMDGPU::VCC_LO:
4833 case AMDGPU::VCC_HI:
4834 case AMDGPU::M0:
4835 case AMDGPU::FLAT_SCR:
4836 return MO.getReg();
4837
4838 default:
4839 break;
4840 }
4841 }
4842
4843 return Register();
4844}
4845
4846static bool shouldReadExec(const MachineInstr &MI) {
4847 if (SIInstrInfo::isVALU(MI)) {
4848 switch (MI.getOpcode()) {
4849 case AMDGPU::V_READLANE_B32:
4850 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4851 case AMDGPU::V_WRITELANE_B32:
4852 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4853 return false;
4854 }
4855
4856 return true;
4857 }
4858
4859 if (MI.isPreISelOpcode() ||
4860 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4861 SIInstrInfo::isSALU(MI) ||
4862 SIInstrInfo::isSMRD(MI))
4863 return false;
4864
4865 return true;
4866}
4867
4868static bool isRegOrFI(const MachineOperand &MO) {
4869 return MO.isReg() || MO.isFI();
4870}
4871
4872static bool isSubRegOf(const SIRegisterInfo &TRI,
4873 const MachineOperand &SuperVec,
4874 const MachineOperand &SubReg) {
4875 if (SubReg.getReg().isPhysical())
4876 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4877
4878 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4879 SubReg.getReg() == SuperVec.getReg();
4880}
4881
4882// Verify the illegal copy from vector register to SGPR for generic opcode COPY
4883bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
4884 const MachineRegisterInfo &MRI,
4885 StringRef &ErrInfo) const {
4886 Register DstReg = MI.getOperand(0).getReg();
4887 Register SrcReg = MI.getOperand(1).getReg();
4888 // This is a check for copy from vector register to SGPR
4889 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
4890 ErrInfo = "illegal copy from vector register to SGPR";
4891 return false;
4892 }
4893 return true;
4894}
4895
4896bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4897 StringRef &ErrInfo) const {
4898 uint16_t Opcode = MI.getOpcode();
4899 const MachineFunction *MF = MI.getParent()->getParent();
4900 const MachineRegisterInfo &MRI = MF->getRegInfo();
4901
4902 // FIXME: At this point the COPY verify is done only for non-ssa forms.
4903 // Find a better property to recognize the point where instruction selection
4904 // is just done.
4905 // We can only enforce this check after SIFixSGPRCopies pass so that the
4906 // illegal copies are legalized and thereafter we don't expect a pass
4907 // inserting similar copies.
4908 if (!MRI.isSSA() && MI.isCopy())
4909 return verifyCopy(MI, MRI, ErrInfo);
4910
4911 if (SIInstrInfo::isGenericOpcode(Opcode))
4912 return true;
4913
4914 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4915 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4916 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4917 int Src3Idx = -1;
4918 if (Src0Idx == -1) {
4919 // VOPD V_DUAL_* instructions use different operand names.
4920 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4921 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4922 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4923 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4924 }
4925
4926 // Make sure the number of operands is correct.
4927 const MCInstrDesc &Desc = get(Opcode);
4928 if (!Desc.isVariadic() &&
4929 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4930 ErrInfo = "Instruction has wrong number of operands.";
4931 return false;
4932 }
4933
4934 if (MI.isInlineAsm()) {
4935 // Verify register classes for inlineasm constraints.
4936 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4937 I != E; ++I) {
4938 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4939 if (!RC)
4940 continue;
4941
4942 const MachineOperand &Op = MI.getOperand(I);
4943 if (!Op.isReg())
4944 continue;
4945
4946 Register Reg = Op.getReg();
4947 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4948 ErrInfo = "inlineasm operand has incorrect register class.";
4949 return false;
4950 }
4951 }
4952
4953 return true;
4954 }
4955
4956 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4957 ErrInfo = "missing memory operand from image instruction.";
4958 return false;
4959 }
4960
4961 // Make sure the register classes are correct.
4962 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4963 const MachineOperand &MO = MI.getOperand(i);
4964 if (MO.isFPImm()) {
4965 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4966 "all fp values to integers.";
4967 return false;
4968 }
4969
4970 int RegClass = Desc.operands()[i].RegClass;
4971
4972 const MCOperandInfo &OpInfo = Desc.operands()[i];
4973 switch (OpInfo.OperandType) {
4974 case MCOI::OPERAND_REGISTER:
4975 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4976 ErrInfo = "Illegal immediate value for operand.";
4977 return false;
4978 }
4979 break;
4992 break;
4994 break;
4995 break;
5009 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5010 ErrInfo = "Illegal immediate value for operand.";
5011 return false;
5012 }
5013 break;
5014 }
5016 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5017 ErrInfo = "Expected inline constant for operand.";
5018 return false;
5019 }
5020 break;
5024 break;
5029 // Check if this operand is an immediate.
5030 // FrameIndex operands will be replaced by immediates, so they are
5031 // allowed.
5032 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5033 ErrInfo = "Expected immediate, but got non-immediate";
5034 return false;
5035 }
5036 break;
5040 break;
5041 default:
5042 if (OpInfo.isGenericType())
5043 continue;
5044 break;
5045 }
5046
5047 if (!MO.isReg())
5048 continue;
5049 Register Reg = MO.getReg();
5050 if (!Reg)
5051 continue;
5052
5053 // FIXME: Ideally we would have separate instruction definitions with the
5054 // aligned register constraint.
5055 // FIXME: We do not verify inline asm operands, but custom inline asm
5056 // verification is broken anyway
5057 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
5058 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5059 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5060 if (const TargetRegisterClass *SubRC =
5061 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5062 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5063 if (RC)
5064 RC = SubRC;
5065 }
5066 }
5067
5068 // Check that this is the aligned version of the class.
5069 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5070 ErrInfo = "Subtarget requires even aligned vector registers";
5071 return false;
5072 }
5073 }
5074
5075 if (RegClass != -1) {
5076 if (Reg.isVirtual())
5077 continue;
5078
5079 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5080 if (!RC->contains(Reg)) {
5081 ErrInfo = "Operand has incorrect register class.";
5082 return false;
5083 }
5084 }
5085 }
5086
5087 // Verify SDWA
5088 if (isSDWA(MI)) {
5089 if (!ST.hasSDWA()) {
5090 ErrInfo = "SDWA is not supported on this target";
5091 return false;
5092 }
5093
5094 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5095 AMDGPU::OpName::dst_sel}) {
5096 const MachineOperand *MO = getNamedOperand(MI, Op);
5097 if (!MO)
5098 continue;
5099 int64_t Imm = MO->getImm();
5100 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5101 ErrInfo = "Invalid SDWA selection";
5102 return false;
5103 }
5104 }
5105
5106 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5107
5108 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5109 if (OpIdx == -1)
5110 continue;
5111 const MachineOperand &MO = MI.getOperand(OpIdx);
5112
5113 if (!ST.hasSDWAScalar()) {
5114 // Only VGPRS on VI
5115 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5116 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5117 return false;
5118 }
5119 } else {
5120 // No immediates on GFX9
5121 if (!MO.isReg()) {
5122 ErrInfo =
5123 "Only reg allowed as operands in SDWA instructions on GFX9+";
5124 return false;
5125 }
5126 }
5127 }
5128
5129 if (!ST.hasSDWAOmod()) {
5130 // No omod allowed on VI
5131 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5132 if (OMod != nullptr &&
5133 (!OMod->isImm() || OMod->getImm() != 0)) {
5134 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5135 return false;
5136 }
5137 }
5138
5139 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5140 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5141 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5142 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5143 const MachineOperand *Src0ModsMO =
5144 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5145 unsigned Mods = Src0ModsMO->getImm();
5146 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5147 Mods & SISrcMods::SEXT) {
5148 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5149 return false;
5150 }
5151 }
5152
5153 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5154 if (isVOPC(BasicOpcode)) {
5155 if (!ST.hasSDWASdst() && DstIdx != -1) {
5156 // Only vcc allowed as dst on VI for VOPC
5157 const MachineOperand &Dst = MI.getOperand(DstIdx);
5158 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5159 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5160 return false;
5161 }
5162 } else if (!ST.hasSDWAOutModsVOPC()) {
5163 // No clamp allowed on GFX9 for VOPC
5164 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5165 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5166 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5167 return false;
5168 }
5169
5170 // No omod allowed on GFX9 for VOPC
5171 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5172 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5173 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5174 return false;
5175 }
5176 }
5177 }
5178
5179 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5180 if (DstUnused && DstUnused->isImm() &&
5181 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5182 const MachineOperand &Dst = MI.getOperand(DstIdx);
5183 if (!Dst.isReg() || !Dst.isTied()) {
5184 ErrInfo = "Dst register should have tied register";
5185 return false;
5186 }
5187
5188 const MachineOperand &TiedMO =
5189 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5190 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5191 ErrInfo =
5192 "Dst register should be tied to implicit use of preserved register";
5193 return false;
5194 }
5195 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5196 ErrInfo = "Dst register should use same physical register as preserved";
5197 return false;
5198 }
5199 }
5200 }
5201
5202 // Verify MIMG / VIMAGE / VSAMPLE
5203 if (isImage(Opcode) && !MI.mayStore()) {
5204 // Ensure that the return type used is large enough for all the options
5205 // being used TFE/LWE require an extra result register.
5206 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5207 if (DMask) {
5208 uint64_t DMaskImm = DMask->getImm();
5209 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5210 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5211 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5212 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5213
5214 // Adjust for packed 16 bit values
5215 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5216 RegCount = divideCeil(RegCount, 2);
5217
5218 // Adjust if using LWE or TFE
5219 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5220 RegCount += 1;
5221
5222 const uint32_t DstIdx =
5223 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5224 const MachineOperand &Dst = MI.getOperand(DstIdx);
5225 if (Dst.isReg()) {
5226 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5227 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5228 if (RegCount > DstSize) {
5229 ErrInfo = "Image instruction returns too many registers for dst "
5230 "register class";
5231 return false;
5232 }
5233 }
5234 }
5235 }
5236
5237 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5238 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5239 unsigned ConstantBusCount = 0;
5240 bool UsesLiteral = false;
5241 const MachineOperand *LiteralVal = nullptr;
5242
5243 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5244 if (ImmIdx != -1) {
5245 ++ConstantBusCount;
5246 UsesLiteral = true;
5247 LiteralVal = &MI.getOperand(ImmIdx);
5248 }
5249
5250 SmallVector<Register, 2> SGPRsUsed;
5251 Register SGPRUsed;
5252
5253 // Only look at the true operands. Only a real operand can use the constant
5254 // bus, and we don't want to check pseudo-operands like the source modifier
5255 // flags.
5256 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5257 if (OpIdx == -1)
5258 continue;
5259 const MachineOperand &MO = MI.getOperand(OpIdx);
5260 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5261 if (MO.isReg()) {
5262 SGPRUsed = MO.getReg();
5263 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5264 ++ConstantBusCount;
5265 SGPRsUsed.push_back(SGPRUsed);
5266 }
5267 } else if (!MO.isFI()) { // Treat FI like a register.
5268 if (!UsesLiteral) {
5269 ++ConstantBusCount;
5270 UsesLiteral = true;
5271 LiteralVal = &MO;
5272 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5273 assert(isVOP2(MI) || isVOP3(MI));
5274 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5275 return false;
5276 }
5277 }
5278 }
5279 }
5280
5281 SGPRUsed = findImplicitSGPRRead(MI);
5282 if (SGPRUsed) {
5283 // Implicit uses may safely overlap true operands
5284 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5285 return !RI.regsOverlap(SGPRUsed, SGPR);
5286 })) {
5287 ++ConstantBusCount;
5288 SGPRsUsed.push_back(SGPRUsed);
5289 }
5290 }
5291
5292 // v_writelane_b32 is an exception to the constant bus restriction: vsrc0
5293 // can be an SGPR, constant, or m0, and the lane select can be an SGPR, m0, or an inline constant.
5294 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5295 Opcode != AMDGPU::V_WRITELANE_B32) {
5296 ErrInfo = "VOP* instruction violates constant bus restriction";
5297 return false;
5298 }
5299
5300 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5301 ErrInfo = "VOP3 instruction uses literal";
5302 return false;
5303 }
5304 }
5305
5306 // Special case for writelane - this can break the multiple constant bus rule,
5307 // but still can't use more than one SGPR register
5308 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5309 unsigned SGPRCount = 0;
5310 Register SGPRUsed;
5311
5312 for (int OpIdx : {Src0Idx, Src1Idx}) {
5313 if (OpIdx == -1)
5314 break;
5315
5316 const MachineOperand &MO = MI.getOperand(OpIdx);
5317
5318 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5319 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5320 if (MO.getReg() != SGPRUsed)
5321 ++SGPRCount;
5322 SGPRUsed = MO.getReg();
5323 }
5324 }
5325 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5326 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5327 return false;
5328 }
5329 }
5330 }
5331
5332 // Verify misc. restrictions on specific instructions.
5333 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5334 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5335 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5336 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5337 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5338 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5339 if (!compareMachineOp(Src0, Src1) &&
5340 !compareMachineOp(Src0, Src2)) {
5341 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5342 return false;
5343 }
5344 }
5345 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5346 SISrcMods::ABS) ||
5347 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5348 SISrcMods::ABS) ||
5349 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5350 SISrcMods::ABS)) {
5351 ErrInfo = "ABS not allowed in VOP3B instructions";
5352 return false;
5353 }
5354 }
5355
5356 if (isSOP2(MI) || isSOPC(MI)) {
5357 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5358 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5359
5360 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5361 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5362 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5363 !Src0.isIdenticalTo(Src1)) {
5364 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5365 return false;
5366 }
5367 }
5368
5369 if (isSOPK(MI)) {
5370 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5371 if (Desc.isBranch()) {
5372 if (!Op->isMBB()) {
5373 ErrInfo = "invalid branch target for SOPK instruction";
5374 return false;
5375 }
5376 } else {
5377 uint64_t Imm = Op->getImm();
5378 if (sopkIsZext(Opcode)) {
5379 if (!isUInt<16>(Imm)) {
5380 ErrInfo = "invalid immediate for SOPK instruction";
5381 return false;
5382 }
5383 } else {
5384 if (!isInt<16>(Imm)) {
5385 ErrInfo = "invalid immediate for SOPK instruction";
5386 return false;
5387 }
5388 }
5389 }
5390 }
5391
5392 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5393 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5394 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5395 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5396 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5397 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5398
5399 const unsigned StaticNumOps =
5400 Desc.getNumOperands() + Desc.implicit_uses().size();
5401 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5402
5403 // Allow additional implicit operands. This allows a fixup done by the post
5404 // RA scheduler where the main implicit operand is killed and implicit-defs
5405 // are added for sub-registers that remain live after this instruction.
5406 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5407 ErrInfo = "missing implicit register operands";
5408 return false;
5409 }
5410
5411 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5412 if (IsDst) {
5413 if (!Dst->isUse()) {
5414 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5415 return false;
5416 }
5417
5418 unsigned UseOpIdx;
5419 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5420 UseOpIdx != StaticNumOps + 1) {
5421 ErrInfo = "movrel implicit operands should be tied";
5422 return false;
5423 }
5424 }
5425
5426 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5427 const MachineOperand &ImpUse
5428 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5429 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5430 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5431 ErrInfo = "src0 should be subreg of implicit vector use";
5432 return false;
5433 }
5434 }
5435
5436 // Make sure we aren't losing exec uses in the td files. This mostly requires
5437 // being careful when using let Uses to try to add other use registers.
5438 if (shouldReadExec(MI)) {
5439 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5440 ErrInfo = "VALU instruction does not implicitly read exec mask";
5441 return false;
5442 }
5443 }
5444
5445 if (isSMRD(MI)) {
5446 if (MI.mayStore() &&
5447 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5448 // The register offset form of scalar stores may only use m0 as the
5449 // soffset register.
5450 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5451 if (Soff && Soff->getReg() != AMDGPU::M0) {
5452 ErrInfo = "scalar stores must use m0 as offset register";
5453 return false;
5454 }
5455 }
5456 }
5457
5458 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5459 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5460 if (Offset->getImm() != 0) {
5461 ErrInfo = "subtarget does not support offsets in flat instructions";
5462 return false;
5463 }
5464 }
5465
5466 if (isDS(MI) && !ST.hasGDS()) {
5467 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5468 if (GDSOp && GDSOp->getImm() != 0) {
5469 ErrInfo = "GDS is not supported on this subtarget";
5470 return false;
5471 }
5472 }
5473
5474 if (isImage(MI)) {
5475 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5476 if (DimOp) {
5477 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5478 AMDGPU::OpName::vaddr0);
5479 AMDGPU::OpName RSrcOpName =
5480 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5481 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5482 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5483 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5484 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5485 const AMDGPU::MIMGDimInfo *Dim =
5486 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5487
5488 if (!Dim) {
5489 ErrInfo = "dim is out of range";
5490 return false;
5491 }
5492
5493 bool IsA16 = false;
5494 if (ST.hasR128A16()) {
5495 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5496 IsA16 = R128A16->getImm() != 0;
5497 } else if (ST.hasA16()) {
5498 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5499 IsA16 = A16->getImm() != 0;
5500 }
5501
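 // If more than one vaddr operand sits between vaddr0 and the resource
 // operand, the instruction uses the NSA (non-sequential address) encoding.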
5502 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5503
5504 unsigned AddrWords =
5505 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5506
5507 unsigned VAddrWords;
5508 if (IsNSA) {
5509 VAddrWords = RsrcIdx - VAddr0Idx;
5510 if (ST.hasPartialNSAEncoding() &&
5511 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5512 unsigned LastVAddrIdx = RsrcIdx - 1;
5513 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5514 }
5515 } else {
5516 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5517 if (AddrWords > 12)
5518 AddrWords = 16;
5519 }
5520
5521 if (VAddrWords != AddrWords) {
5522 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5523 << " but got " << VAddrWords << "\n");
5524 ErrInfo = "bad vaddr size";
5525 return false;
5526 }
5527 }
5528 }
5529
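// Verify the dpp_ctrl operand: reject unused/illegal encodings and controls
// that are not available on the current generation.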
5530 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5531 if (DppCt) {
5532 using namespace AMDGPU::DPP;
5533
5534 unsigned DC = DppCt->getImm();
5535 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5536 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5537 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5538 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5539 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5540 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5541 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5542 ErrInfo = "Invalid dpp_ctrl value";
5543 return false;
5544 }
5545 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5546 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5547 ErrInfo = "Invalid dpp_ctrl value: "
5548 "wavefront shifts are not supported on GFX10+";
5549 return false;
5550 }
5551 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5552 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5553 ErrInfo = "Invalid dpp_ctrl value: "
5554 "broadcasts are not supported on GFX10+";
5555 return false;
5556 }
5557 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5558 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5559 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5560 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5561 !ST.hasGFX90AInsts()) {
5562 ErrInfo = "Invalid dpp_ctrl value: "
5563 "row_newbroadcast/row_share is not supported before "
5564 "GFX90A/GFX10";
5565 return false;
5566 }
5567 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5568 ErrInfo = "Invalid dpp_ctrl value: "
5569 "row_share and row_xmask are not supported before GFX10";
5570 return false;
5571 }
5572 }
5573
5574 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5577 ErrInfo = "Invalid dpp_ctrl value: "
5578 "DP ALU dpp only support row_newbcast";
5579 return false;
5580 }
5581 }
5582
5583 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5584 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5585 AMDGPU::OpName DataName =
5586 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5587 const MachineOperand *Data = getNamedOperand(MI, DataName);
5588 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5589 if (Data && !Data->isReg())
5590 Data = nullptr;
5591
5592 if (ST.hasGFX90AInsts()) {
5593 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5594 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5595 ErrInfo = "Invalid register class: "
5596 "vdata and vdst should be both VGPR or AGPR";
5597 return false;
5598 }
5599 if (Data && Data2 &&
5600 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5601 ErrInfo = "Invalid register class: "
5602 "both data operands should be VGPR or AGPR";
5603 return false;
5604 }
5605 } else {
5606 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5607 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5608 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5609 ErrInfo = "Invalid register class: "
5610 "agpr loads and stores not supported on this GPU";
5611 return false;
5612 }
5613 }
5614 }
5615
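 // On subtargets that require even-aligned VGPR tuples, the operands checked
 // below must start at an even-numbered register.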
5616 if (ST.needsAlignedVGPRs()) {
5617 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5618 const MachineOperand *Op = getNamedOperand(MI, OpName);
5619 if (!Op)
5620 return true;
5621 Register Reg = Op->getReg();
5622 if (Reg.isPhysical())
5623 return !(RI.getHWRegIndex(Reg) & 1);
5624 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5625 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5626 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5627 };
5628
5629 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5630 Opcode == AMDGPU::DS_GWS_BARRIER) {
5631
5632 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5633 ErrInfo = "Subtarget requires even aligned vector registers "
5634 "for DS_GWS instructions";
5635 return false;
5636 }
5637 }
5638
5639 if (isMIMG(MI)) {
5640 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5641 ErrInfo = "Subtarget requires even aligned vector registers "
5642 "for vaddr operand of image instructions";
5643 return false;
5644 }
5645 }
5646 }
5647
5648 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5649 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5650 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5651 ErrInfo = "Invalid register class: "
5652 "v_accvgpr_write with an SGPR is not supported on this GPU";
5653 return false;
5654 }
5655 }
5656
5657 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5658 const MachineOperand &SrcOp = MI.getOperand(1);
5659 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5660 ErrInfo = "pseudo expects only physical SGPRs";
5661 return false;
5662 }
5663 }
5664
5665 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5666 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5667 if (!ST.hasScaleOffset()) {
5668 ErrInfo = "Subtarget does not support offset scaling";
5669 return false;
5670 }
5671 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5672 ErrInfo = "Instruction does not support offset scaling";
5673 return false;
5674 }
5675 }
5676 }
5677
5678 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5679 // information.
5680 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5681 for (unsigned I = 0; I < 3; ++I) {
5682 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
5683 return false;
5684 }
5685 }
5686
5687 return true;
5688}
5689
5690// It is more readable to list mapped opcodes on the same line.
5691// clang-format off
5692
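// Map a scalar (SALU) opcode to its VALU equivalent, or return
// INSTRUCTION_LIST_END if there is no direct counterpart.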
5693 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5694 switch (MI.getOpcode()) {
5695 default: return AMDGPU::INSTRUCTION_LIST_END;
5696 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5697 case AMDGPU::COPY: return AMDGPU::COPY;
5698 case AMDGPU::PHI: return AMDGPU::PHI;
5699 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5700 case AMDGPU::WQM: return AMDGPU::WQM;
5701 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5702 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5703 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5704 case AMDGPU::S_MOV_B32: {
5705 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5706 return MI.getOperand(1).isReg() ||
5707 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5708 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5709 }
5710 case AMDGPU::S_ADD_I32:
5711 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5712 case AMDGPU::S_ADDC_U32:
5713 return AMDGPU::V_ADDC_U32_e32;
5714 case AMDGPU::S_SUB_I32:
5715 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5716 // FIXME: These are not consistently handled, and selected when the carry is
5717 // used.
5718 case AMDGPU::S_ADD_U32:
5719 return AMDGPU::V_ADD_CO_U32_e32;
5720 case AMDGPU::S_SUB_U32:
5721 return AMDGPU::V_SUB_CO_U32_e32;
5722 case AMDGPU::S_ADD_U64_PSEUDO:
5723 return AMDGPU::V_ADD_U64_PSEUDO;
5724 case AMDGPU::S_SUB_U64_PSEUDO:
5725 return AMDGPU::V_SUB_U64_PSEUDO;
5726 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5727 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5728 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5729 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5730 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5731 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5732 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5733 case AMDGPU::S_XNOR_B32:
5734 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5735 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5736 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5737 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5738 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5739 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5740 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5741 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5742 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5743 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5744 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5745 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5746 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5747 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5748 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5749 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5750 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5751 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5752 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5753 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5754 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5755 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5756 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5757 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5758 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5759 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5760 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5761 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5762 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5763 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5764 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5765 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5766 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5767 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5768 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5769 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5770 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5771 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5772 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5773 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5774 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5775 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5776 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5777 case AMDGPU::S_CVT_F32_F16:
5778 case AMDGPU::S_CVT_HI_F32_F16:
5779 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5780 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5781 case AMDGPU::S_CVT_F16_F32:
5782 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5783 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5784 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5785 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5786 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5787 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5788 case AMDGPU::S_CEIL_F16:
5789 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5790 : AMDGPU::V_CEIL_F16_fake16_e64;
5791 case AMDGPU::S_FLOOR_F16:
5792 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5793 : AMDGPU::V_FLOOR_F16_fake16_e64;
5794 case AMDGPU::S_TRUNC_F16:
5795 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
5796 : AMDGPU::V_TRUNC_F16_fake16_e64;
5797 case AMDGPU::S_RNDNE_F16:
5798 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
5799 : AMDGPU::V_RNDNE_F16_fake16_e64;
5800 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5801 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5802 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5803 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5804 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5805 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5806 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5807 case AMDGPU::S_ADD_F16:
5808 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
5809 : AMDGPU::V_ADD_F16_fake16_e64;
5810 case AMDGPU::S_SUB_F16:
5811 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
5812 : AMDGPU::V_SUB_F16_fake16_e64;
5813 case AMDGPU::S_MIN_F16:
5814 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
5815 : AMDGPU::V_MIN_F16_fake16_e64;
5816 case AMDGPU::S_MAX_F16:
5817 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
5818 : AMDGPU::V_MAX_F16_fake16_e64;
5819 case AMDGPU::S_MINIMUM_F16:
5820 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
5821 : AMDGPU::V_MINIMUM_F16_fake16_e64;
5822 case AMDGPU::S_MAXIMUM_F16:
5823 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
5824 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
5825 case AMDGPU::S_MUL_F16:
5826 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
5827 : AMDGPU::V_MUL_F16_fake16_e64;
5828 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5829 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5830 case AMDGPU::S_FMAC_F16:
5831 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
5832 : AMDGPU::V_FMAC_F16_fake16_e64;
5833 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5834 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5835 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5836 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5837 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5838 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5839 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5840 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5841 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5842 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5843 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5844 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5845 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5846 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5847 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5848 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5849 case AMDGPU::S_CMP_LT_F16:
5850 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
5851 : AMDGPU::V_CMP_LT_F16_fake16_e64;
5852 case AMDGPU::S_CMP_EQ_F16:
5853 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
5854 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
5855 case AMDGPU::S_CMP_LE_F16:
5856 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
5857 : AMDGPU::V_CMP_LE_F16_fake16_e64;
5858 case AMDGPU::S_CMP_GT_F16:
5859 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
5860 : AMDGPU::V_CMP_GT_F16_fake16_e64;
5861 case AMDGPU::S_CMP_LG_F16:
5862 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
5863 : AMDGPU::V_CMP_LG_F16_fake16_e64;
5864 case AMDGPU::S_CMP_GE_F16:
5865 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
5866 : AMDGPU::V_CMP_GE_F16_fake16_e64;
5867 case AMDGPU::S_CMP_O_F16:
5868 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
5869 : AMDGPU::V_CMP_O_F16_fake16_e64;
5870 case AMDGPU::S_CMP_U_F16:
5871 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
5872 : AMDGPU::V_CMP_U_F16_fake16_e64;
5873 case AMDGPU::S_CMP_NGE_F16:
5874 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
5875 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
5876 case AMDGPU::S_CMP_NLG_F16:
5877 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
5878 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
5879 case AMDGPU::S_CMP_NGT_F16:
5880 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
5881 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
5882 case AMDGPU::S_CMP_NLE_F16:
5883 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
5884 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
5885 case AMDGPU::S_CMP_NEQ_F16:
5886 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
5887 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
5888 case AMDGPU::S_CMP_NLT_F16:
5889 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
5890 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
5891 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5892 case AMDGPU::V_S_EXP_F16_e64:
5893 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
5894 : AMDGPU::V_EXP_F16_fake16_e64;
5895 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5896 case AMDGPU::V_S_LOG_F16_e64:
5897 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
5898 : AMDGPU::V_LOG_F16_fake16_e64;
5899 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5900 case AMDGPU::V_S_RCP_F16_e64:
5901 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
5902 : AMDGPU::V_RCP_F16_fake16_e64;
5903 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5904 case AMDGPU::V_S_RSQ_F16_e64:
5905 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
5906 : AMDGPU::V_RSQ_F16_fake16_e64;
5907 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5908 case AMDGPU::V_S_SQRT_F16_e64:
5909 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
5910 : AMDGPU::V_SQRT_F16_fake16_e64;
5911 }
5913 "Unexpected scalar opcode without corresponding vector one!");
5914}
5915
5916// clang-format on
5917
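// Save the current exec mask into Reg and then set all bits of exec. When SCC
// is live, use plain S_MOV copies so SCC is not clobbered by S_OR_SAVEEXEC.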
5918 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5919 MachineBasicBlock &MBB,
5920 MachineBasicBlock::iterator MBBI,
5921 const DebugLoc &DL, Register Reg,
5922 bool IsSCCLive,
5923 SlotIndexes *Indexes) const {
5924 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5925 const SIInstrInfo *TII = ST.getInstrInfo();
5926 bool IsWave32 = ST.isWave32();
5927 if (IsSCCLive) {
5928 // Insert two move instructions, one to save the original value of EXEC and
5929 // the other to turn on all bits in EXEC. This is required as we can't use
5930 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
5931 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5932 MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5933 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
5934 .addReg(Exec, RegState::Kill);
5935 auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
5936 if (Indexes) {
5937 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5938 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5939 }
5940 } else {
5941 const unsigned OrSaveExec =
5942 IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
5943 auto SaveExec =
5944 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
5945 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5946 if (Indexes)
5947 Indexes->insertMachineInstrInMaps(*SaveExec);
5948 }
5949}
5950
5951 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
5952 MachineBasicBlock::iterator MBBI,
5953 const DebugLoc &DL, Register Reg,
5954 SlotIndexes *Indexes) const {
5955 unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5956 MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5957 auto ExecRestoreMI =
5958 BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
5959 if (Indexes)
5960 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
5961}
5962
5966 "Not a whole wave func");
5967 MachineBasicBlock &MBB = *MF.begin();
5968 for (MachineInstr &MI : MBB)
5969 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
5970 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
5971 return &MI;
5972
5973 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
5974}
5975
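// Loads and stores may not use AGPRs before gfx90a, so narrow combined AV_*
// register classes to their VGPR-only counterparts for such instructions.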
5976static const TargetRegisterClass *
5977 adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
5978 const MCInstrDesc &TID, unsigned RCID) {
5979 if (!ST.hasGFX90AInsts() && (TID.mayLoad() || TID.mayStore())) {
5980 switch (RCID) {
5981 case AMDGPU::AV_32RegClassID:
5982 RCID = AMDGPU::VGPR_32RegClassID;
5983 break;
5984 case AMDGPU::AV_64RegClassID:
5985 RCID = AMDGPU::VReg_64RegClassID;
5986 break;
5987 case AMDGPU::AV_96RegClassID:
5988 RCID = AMDGPU::VReg_96RegClassID;
5989 break;
5990 case AMDGPU::AV_128RegClassID:
5991 RCID = AMDGPU::VReg_128RegClassID;
5992 break;
5993 case AMDGPU::AV_160RegClassID:
5994 RCID = AMDGPU::VReg_160RegClassID;
5995 break;
5996 case AMDGPU::AV_512RegClassID:
5997 RCID = AMDGPU::VReg_512RegClassID;
5998 break;
5999 default:
6000 break;
6001 }
6002 }
6003
6004 return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
6005}
6006
6007const TargetRegisterClass *
6008SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum,
6009 const TargetRegisterInfo *TRI) const {
6010 if (OpNum >= TID.getNumOperands())
6011 return nullptr;
6012 auto RegClass = TID.operands()[OpNum].RegClass;
6013 // Special pseudos have no alignment requirement.
6014 if (TID.getOpcode() == AMDGPU::AV_MOV_B64_IMM_PSEUDO || isSpill(TID))
6015 return RI.getRegClass(RegClass);
6016
6017 return adjustAllocatableRegClass(ST, RI, TID, RegClass);
6018}
6019
6020 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6021 unsigned OpNo) const {
6022 const MCInstrDesc &Desc = get(MI.getOpcode());
6023 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6024 Desc.operands()[OpNo].RegClass == -1) {
6025 Register Reg = MI.getOperand(OpNo).getReg();
6026
6027 if (Reg.isVirtual()) {
6028 const MachineRegisterInfo &MRI =
6029 MI.getParent()->getParent()->getRegInfo();
6030 return MRI.getRegClass(Reg);
6031 }
6032 return RI.getPhysRegBaseClass(Reg);
6033 }
6034
6035 unsigned RCID = Desc.operands()[OpNo].RegClass;
6036 return adjustAllocatableRegClass(ST, RI, Desc, RCID);
6037}
6038
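// Legalize operand OpIdx by materializing its value into a fresh virtual
// register of the equivalent VGPR class with a copy or move, then rewriting
// the operand to use that register.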
6039 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
6040 MachineBasicBlock::iterator I = MI;
6041 MachineBasicBlock *MBB = MI.getParent();
6042 MachineOperand &MO = MI.getOperand(OpIdx);
6043 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6044 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
6045 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6046 unsigned Size = RI.getRegSizeInBits(*RC);
6047 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6048 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6049 : AMDGPU::V_MOV_B32_e32;
6050 if (MO.isReg())
6051 Opcode = AMDGPU::COPY;
6052 else if (RI.isSGPRClass(RC))
6053 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6054
6055 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6056 Register Reg = MRI.createVirtualRegister(VRC);
6057 DebugLoc DL = MBB->findDebugLoc(I);
6058 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6059 MO.ChangeToRegister(Reg, false);
6060}
6061
6064 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6065 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6066 if (!SuperReg.getReg().isVirtual())
6067 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6068
6069 MachineBasicBlock *MBB = MI->getParent();
6070 const DebugLoc &DL = MI->getDebugLoc();
6071 Register SubReg = MRI.createVirtualRegister(SubRC);
6072
6073 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6074 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6075 .addReg(SuperReg.getReg(), 0, NewSubIdx);
6076 return SubReg;
6077}
6078
6079 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
6080 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
6081 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6082 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6083 if (Op.isImm()) {
6084 if (SubIdx == AMDGPU::sub0)
6085 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6086 if (SubIdx == AMDGPU::sub1)
6087 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6088
6089 llvm_unreachable("Unhandled register index for immediate");
6090 }
6091
6092 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6093 SubIdx, SubRC);
6094 return MachineOperand::CreateReg(SubReg, false);
6095}
6096
6097// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6098void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6099 assert(Inst.getNumExplicitOperands() == 3);
6100 MachineOperand Op1 = Inst.getOperand(1);
6101 Inst.removeOperand(1);
6102 Inst.addOperand(Op1);
6103}
6104
6105 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6106 const MCOperandInfo &OpInfo,
6107 const MachineOperand &MO) const {
6108 if (!MO.isReg())
6109 return false;
6110
6111 Register Reg = MO.getReg();
6112
6113 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
6114 if (Reg.isPhysical())
6115 return DRC->contains(Reg);
6116
6117 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6118
6119 if (MO.getSubReg()) {
6120 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
6121 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6122 if (!SuperRC)
6123 return false;
6124 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6125 }
6126
6127 return RI.getCommonSubClass(DRC, RC) != nullptr;
6128}
6129
6130 bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6131 const MachineOperand &MO) const {
6132 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6133 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6134 unsigned Opc = MI.getOpcode();
6135
6136 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6137 // information.
6138 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6139 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6140 constexpr const AMDGPU::OpName OpNames[] = {
6141 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6142
6143 for (auto [I, OpName] : enumerate(OpNames)) {
6144 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6145 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6146 !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
6147 return false;
6148 }
6149 }
6150
6151 if (!isLegalRegOperand(MRI, OpInfo, MO))
6152 return false;
6153
6154 // check Accumulate GPR operand
6155 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6156 if (IsAGPR && !ST.hasMAIInsts())
6157 return false;
6158 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6159 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6160 return false;
6161 // Atomics should have both vdst and vdata either vgpr or agpr.
6162 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6163 const int DataIdx = AMDGPU::getNamedOperandIdx(
6164 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6165 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6166 MI.getOperand(DataIdx).isReg() &&
6167 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6168 return false;
6169 if ((int)OpIdx == DataIdx) {
6170 if (VDstIdx != -1 &&
6171 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6172 return false;
6173 // DS instructions with 2 src operands also must have tied RC.
6174 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6175 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6176 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6177 return false;
6178 }
6179
6180 // Check V_ACCVGPR_WRITE_B32_e64
6181 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6182 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6183 RI.isSGPRReg(MRI, MO.getReg()))
6184 return false;
6185 return true;
6186}
6187
6189 const MCOperandInfo &OpInfo,
6190 const MachineOperand &MO) const {
6191 if (MO.isReg())
6192 return isLegalRegOperand(MRI, OpInfo, MO);
6193
6194 // Handle non-register types that are treated like immediates.
6195 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6196 return true;
6197}
6198
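// For packed FP32 instructions on gfx12+, an SGPR source operand is only
// treated as legal here when neither OP_SEL_0 nor OP_SEL_1 is set in its
// source modifiers.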
6199 bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6200 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6201 const MachineOperand *MO) const {
6202 constexpr const unsigned NumOps = 3;
6203 constexpr const AMDGPU::OpName OpNames[NumOps * 2] = {
6204 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6205 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6206 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6207
6208 assert(SrcN < NumOps);
6209
6210 if (!MO) {
6211 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6212 if (SrcIdx == -1)
6213 return true;
6214 MO = &MI.getOperand(SrcIdx);
6215 }
6216
6217 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6218 return true;
6219
6220 int ModsIdx =
6221 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6222 if (ModsIdx == -1)
6223 return true;
6224
6225 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6226 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6227 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6228
6229 return !OpSel && !OpSelHi;
6230}
6231
6233 const MachineOperand *MO) const {
6234 const MachineFunction &MF = *MI.getParent()->getParent();
6235 const MachineRegisterInfo &MRI = MF.getRegInfo();
6236 const MCInstrDesc &InstDesc = MI.getDesc();
6237 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6238 const TargetRegisterClass *DefinedRC =
6239 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
6240 if (!MO)
6241 MO = &MI.getOperand(OpIdx);
6242
6243 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6244
6245 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6246 const MachineOperand *UsedLiteral = nullptr;
6247
6248 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6249 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6250
6251 // TODO: Be more permissive with frame indexes.
6252 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6253 if (!LiteralLimit--)
6254 return false;
6255
6256 UsedLiteral = MO;
6257 }
6258
6259 SmallDenseSet<RegSubRegPair> SGPRsUsed;
6260 if (MO->isReg())
6261 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6262
6263 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6264 if (i == OpIdx)
6265 continue;
6266 const MachineOperand &Op = MI.getOperand(i);
6267 if (Op.isReg()) {
6268 if (Op.isUse()) {
6269 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6270 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6271 if (--ConstantBusLimit <= 0)
6272 return false;
6273 }
6274 }
6275 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6276 !isInlineConstant(Op, InstDesc.operands()[i])) {
6277 // The same literal may be used multiple times.
6278 if (!UsedLiteral)
6279 UsedLiteral = &Op;
6280 else if (UsedLiteral->isIdenticalTo(Op))
6281 continue;
6282
6283 if (!LiteralLimit--)
6284 return false;
6285 if (--ConstantBusLimit <= 0)
6286 return false;
6287 }
6288 }
6289 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6290 // There can be at most one literal operand, but it can be repeated.
6291 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6292 if (i == OpIdx)
6293 continue;
6294 const MachineOperand &Op = MI.getOperand(i);
6295 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6296 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6297 !Op.isIdenticalTo(*MO))
6298 return false;
6299
6300 // Do not fold a non-inlineable and non-register operand into an
6301 // instruction that already has a frame index. The frame index handling
6302 // code could not handle well when a frame index co-exists with another
6303 // non-register operand, unless that operand is an inlineable immediate.
6304 if (Op.isFI())
6305 return false;
6306 }
6307 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6308 isF16PseudoScalarTrans(MI.getOpcode())) {
6309 return false;
6310 }
6311
6312 if (MO->isReg()) {
6313 if (!DefinedRC)
6314 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6315 return isLegalRegOperand(MI, OpIdx, *MO);
6316 }
6317
6318 if (MO->isImm()) {
6319 uint64_t Imm = MO->getImm();
6320 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6321 bool Is64BitOp = Is64BitFPOp ||
6322 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6323 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6324 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6325 if (Is64BitOp &&
6326 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6327 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6328 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6329 return false;
6330
6331 // FIXME: We can use sign extended 64-bit literals, but only for signed
6332 // operands. At the moment we do not know if an operand is signed.
6333 // Such operand will be encoded as its low 32 bits and then either
6334 // correctly sign extended or incorrectly zero extended by HW.
6335 // If 64-bit literals are supported and the literal will be encoded
6336 // as full 64 bit we still can use it.
6337 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6338 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6339 return false;
6340 }
6341 }
6342
6343 // Handle non-register types that are treated like immediates.
6344 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6345
6346 if (!DefinedRC) {
6347 // This operand expects an immediate.
6348 return true;
6349 }
6350
6351 return isImmOperandLegal(MI, OpIdx, *MO);
6352}
6353
6354 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6355 MachineInstr &MI) const {
6356 unsigned Opc = MI.getOpcode();
6357 const MCInstrDesc &InstrDesc = get(Opc);
6358
6359 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6360 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6361
6362 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6363 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6364
6365 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6366 // we need to only have one constant bus use before GFX10.
6367 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6368 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6369 RI.isSGPRReg(MRI, Src0.getReg()))
6370 legalizeOpWithMove(MI, Src0Idx);
6371
6372 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6373 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6374 // src0/src1 with V_READFIRSTLANE.
6375 if (Opc == AMDGPU::V_WRITELANE_B32) {
6376 const DebugLoc &DL = MI.getDebugLoc();
6377 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6378 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6379 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6380 .add(Src0);
6381 Src0.ChangeToRegister(Reg, false);
6382 }
6383 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6384 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6385 const DebugLoc &DL = MI.getDebugLoc();
6386 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6387 .add(Src1);
6388 Src1.ChangeToRegister(Reg, false);
6389 }
6390 return;
6391 }
6392
6393 // No VOP2 instructions support AGPRs.
6394 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
6395 legalizeOpWithMove(MI, Src0Idx);
6396
6397 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
6398 legalizeOpWithMove(MI, Src1Idx);
6399
6400 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6401 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6402 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6403 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6404 legalizeOpWithMove(MI, Src2Idx);
6405 }
6406
6407 // VOP2 src0 instructions support all operand types, so we don't need to check
6408 // their legality. If src1 is already legal, we don't need to do anything.
6409 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6410 return;
6411
6412 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6413 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6414 // select is uniform.
6415 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6416 RI.isVGPR(MRI, Src1.getReg())) {
6417 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6418 const DebugLoc &DL = MI.getDebugLoc();
6419 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6420 .add(Src1);
6421 Src1.ChangeToRegister(Reg, false);
6422 return;
6423 }
6424
6425 // We do not use commuteInstruction here because it is too aggressive and will
6426 // commute if it is possible. We only want to commute here if it improves
6427 // legality. This can be called a fairly large number of times so don't waste
6428 // compile time pointlessly swapping and checking legality again.
6429 if (HasImplicitSGPR || !MI.isCommutable()) {
6430 legalizeOpWithMove(MI, Src1Idx);
6431 return;
6432 }
6433
6434 // If src0 can be used as src1, commuting will make the operands legal.
6435 // Otherwise we have to give up and insert a move.
6436 //
6437 // TODO: Other immediate-like operand kinds could be commuted if there was a
6438 // MachineOperand::ChangeTo* for them.
6439 if ((!Src1.isImm() && !Src1.isReg()) ||
6440 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6441 legalizeOpWithMove(MI, Src1Idx);
6442 return;
6443 }
6444
6445 int CommutedOpc = commuteOpcode(MI);
6446 if (CommutedOpc == -1) {
6447 legalizeOpWithMove(MI, Src1Idx);
6448 return;
6449 }
6450
6451 MI.setDesc(get(CommutedOpc));
6452
6453 Register Src0Reg = Src0.getReg();
6454 unsigned Src0SubReg = Src0.getSubReg();
6455 bool Src0Kill = Src0.isKill();
6456
6457 if (Src1.isImm())
6458 Src0.ChangeToImmediate(Src1.getImm());
6459 else if (Src1.isReg()) {
6460 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6461 Src0.setSubReg(Src1.getSubReg());
6462 } else
6463 llvm_unreachable("Should only have register or immediate operands");
6464
6465 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6466 Src1.setSubReg(Src0SubReg);
6467 fixImplicitOperands(MI);
6468}
6469
6470// Legalize VOP3 operands. All operand types are supported for any operand,
6471// but only one literal constant is allowed, and only starting from GFX10.
6472 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6473 MachineInstr &MI) const {
6474 unsigned Opc = MI.getOpcode();
6475
6476 int VOP3Idx[3] = {
6477 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6478 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6479 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6480 };
6481
6482 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6483 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6484 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6485 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6486 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6487 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6488 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6489 // src1 and src2 must be scalar
6490 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6491 const DebugLoc &DL = MI.getDebugLoc();
6492 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6493 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6494 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6495 .add(Src1);
6496 Src1.ChangeToRegister(Reg, false);
6497 }
6498 if (VOP3Idx[2] != -1) {
6499 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6500 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6501 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6502 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6503 .add(Src2);
6504 Src2.ChangeToRegister(Reg, false);
6505 }
6506 }
6507 }
6508
6509 // Find the one SGPR operand we are allowed to use.
6510 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6511 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6512 SmallDenseSet<unsigned> SGPRsUsed;
6513 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6514 if (SGPRReg) {
6515 SGPRsUsed.insert(SGPRReg);
6516 --ConstantBusLimit;
6517 }
6518
6519 for (int Idx : VOP3Idx) {
6520 if (Idx == -1)
6521 break;
6522 MachineOperand &MO = MI.getOperand(Idx);
6523
6524 if (!MO.isReg()) {
6525 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6526 continue;
6527
6528 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6529 --LiteralLimit;
6530 --ConstantBusLimit;
6531 continue;
6532 }
6533
6534 --LiteralLimit;
6535 --ConstantBusLimit;
6536 legalizeOpWithMove(MI, Idx);
6537 continue;
6538 }
6539
6540 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
6541 !isOperandLegal(MI, Idx, &MO)) {
6542 legalizeOpWithMove(MI, Idx);
6543 continue;
6544 }
6545
6546 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6547 continue; // VGPRs are legal
6548
6549 // We can use one SGPR in each VOP3 instruction prior to GFX10
6550 // and two starting from GFX10.
6551 if (SGPRsUsed.count(MO.getReg()))
6552 continue;
6553 if (ConstantBusLimit > 0) {
6554 SGPRsUsed.insert(MO.getReg());
6555 --ConstantBusLimit;
6556 continue;
6557 }
6558
6559 // If we make it this far, then the operand is not legal and we must
6560 // legalize it.
6561 legalizeOpWithMove(MI, Idx);
6562 }
6563
6564 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6565 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6566 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6567 legalizeOpWithMove(MI, VOP3Idx[2]);
6568
6569 if (isWMMA(MI)) {
6570 // scale_src has a register class restricted to low 256 VGPRs, we may need
6571 // to insert a copy to the restricted VGPR class.
6572 int ScaleSrc0Idx =
6573 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::scale_src0);
6574 if (ScaleSrc0Idx != -1) {
6575 int ScaleSrc1Idx =
6576 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::scale_src1);
6577 if (!isOperandLegal(MI, ScaleSrc0Idx))
6578 legalizeOpWithMove(MI, ScaleSrc0Idx);
6579 if (!isOperandLegal(MI, ScaleSrc1Idx))
6580 legalizeOpWithMove(MI, ScaleSrc1Idx);
6581 }
6582 }
6583
6584 // Fix the register class of packed FP32 instructions on gfx12+. See
6585 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6586 if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
6587 for (unsigned I = 0; I < 3; ++I) {
6588 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
6589 legalizeOpWithMove(MI, VOP3Idx[I]);
6590 }
6591 }
6592}
6593
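// Copy the value of SrcReg into SGPRs by emitting one V_READFIRSTLANE_B32 per
// 32-bit sub-register and reassembling the pieces with a REG_SEQUENCE; AGPR
// inputs are first copied to VGPRs.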
6594 Register SIInstrInfo::readlaneVGPRToSGPR(
6595 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6596 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6597 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6598 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6599 if (DstRC)
6600 SRC = RI.getCommonSubClass(SRC, DstRC);
6601
6602 Register DstReg = MRI.createVirtualRegister(SRC);
6603 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6604
6605 if (RI.hasAGPRs(VRC)) {
6606 VRC = RI.getEquivalentVGPRClass(VRC);
6607 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6608 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6609 get(TargetOpcode::COPY), NewSrcReg)
6610 .addReg(SrcReg);
6611 SrcReg = NewSrcReg;
6612 }
6613
6614 if (SubRegs == 1) {
6615 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6616 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6617 .addReg(SrcReg);
6618 return DstReg;
6619 }
6620
6621 SmallVector<Register, 8> SRegs;
6622 for (unsigned i = 0; i < SubRegs; ++i) {
6623 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6624 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6625 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6626 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6627 SRegs.push_back(SGPR);
6628 }
6629
6630 MachineInstrBuilder MIB =
6631 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6632 get(AMDGPU::REG_SEQUENCE), DstReg);
6633 for (unsigned i = 0; i < SubRegs; ++i) {
6634 MIB.addReg(SRegs[i]);
6635 MIB.addImm(RI.getSubRegFromChannel(i));
6636 }
6637 return DstReg;
6638}
6639
6640 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6641 MachineInstr &MI) const {
6642
6643 // If the pointer is stored in VGPRs, then we need to move it to
6644 // SGPRs using v_readfirstlane. This is safe because we only select
6645 // loads with uniform pointers to SMRD instructions, so we know the
6646 // pointer value is uniform.
6647 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6648 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6649 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6650 SBase->setReg(SGPR);
6651 }
6652 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6653 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6654 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6655 SOff->setReg(SGPR);
6656 }
6657}
6658
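// If the saddr operand of a FLAT/GLOBAL instruction was selected into a VGPR,
// try to rewrite the instruction to its vaddr form, folding away a zero vaddr
// if one is present. Returns true if the instruction was rewritten.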
6659 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6660 unsigned Opc = Inst.getOpcode();
6661 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6662 if (OldSAddrIdx < 0)
6663 return false;
6664
6665 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6666
6667 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6668 if (NewOpc < 0)
6669 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6670 if (NewOpc < 0)
6671 return false;
6672
6673 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6674 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6675 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6676 return false;
6677
6678 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6679 if (NewVAddrIdx < 0)
6680 return false;
6681
6682 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6683
6684 // Check vaddr, it shall be zero or absent.
6685 MachineInstr *VAddrDef = nullptr;
6686 if (OldVAddrIdx >= 0) {
6687 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6688 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6689 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6690 !VAddrDef->getOperand(1).isImm() ||
6691 VAddrDef->getOperand(1).getImm() != 0)
6692 return false;
6693 }
6694
6695 const MCInstrDesc &NewDesc = get(NewOpc);
6696 Inst.setDesc(NewDesc);
6697
6698 // Callers expect iterator to be valid after this call, so modify the
6699 // instruction in place.
6700 if (OldVAddrIdx == NewVAddrIdx) {
6701 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6702 // Clear use list from the old vaddr holding a zero register.
6703 MRI.removeRegOperandFromUseList(&NewVAddr);
6704 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6705 Inst.removeOperand(OldSAddrIdx);
6706 // Update the use list with the pointer we have just moved from vaddr to
6707 // saddr position. Otherwise new vaddr will be missing from the use list.
6708 MRI.removeRegOperandFromUseList(&NewVAddr);
6709 MRI.addRegOperandToUseList(&NewVAddr);
6710 } else {
6711 assert(OldSAddrIdx == NewVAddrIdx);
6712
6713 if (OldVAddrIdx >= 0) {
6714 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6715 AMDGPU::OpName::vdst_in);
6716
6717 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6718 // it asserts. Untie the operands for now and retie them afterwards.
6719 if (NewVDstIn != -1) {
6720 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6721 Inst.untieRegOperand(OldVDstIn);
6722 }
6723
6724 Inst.removeOperand(OldVAddrIdx);
6725
6726 if (NewVDstIn != -1) {
6727 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6728 Inst.tieOperands(NewVDst, NewVDstIn);
6729 }
6730 }
6731 }
6732
6733 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6734 VAddrDef->eraseFromParent();
6735
6736 return true;
6737}
6738
6739// FIXME: Remove this when SelectionDAG is obsoleted.
6740 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6741 MachineInstr &MI) const {
6742 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
6743 return;
6744
6745 // Fix up SGPR operands in VGPRs. We only select these when the DAG divergence
6746 // analysis thinks they are uniform, so a readfirstlane should be valid.
6747 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6748 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6749 return;
6750
6751 if (moveFlatAddrToVGPR(MI))
6752 return;
6753
6754 const TargetRegisterClass *DeclaredRC =
6755 getRegClass(MI.getDesc(), SAddr->getOperandNo(), &RI);
6756
6757 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6758 SAddr->setReg(ToSGPR);
6759}
6760
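// Make Op compatible with register class DstRC by inserting a COPY into a new
// virtual register before I, folding a move-immediate source into the copy
// when possible.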
6761 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6762 MachineBasicBlock::iterator I,
6763 const TargetRegisterClass *DstRC,
6764 MachineOperand &Op,
6765 MachineRegisterInfo &MRI,
6766 const DebugLoc &DL) const {
6767 Register OpReg = Op.getReg();
6768 unsigned OpSubReg = Op.getSubReg();
6769
6770 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6771 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6772
6773 // Check if operand is already the correct register class.
6774 if (DstRC == OpRC)
6775 return;
6776
6777 Register DstReg = MRI.createVirtualRegister(DstRC);
6778 auto Copy =
6779 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6780 Op.setReg(DstReg);
6781
6782 MachineInstr *Def = MRI.getVRegDef(OpReg);
6783 if (!Def)
6784 return;
6785
6786 // Try to eliminate the copy if it is copying an immediate value.
6787 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6788 foldImmediate(*Copy, *Def, OpReg, &MRI);
6789
6790 bool ImpDef = Def->isImplicitDef();
6791 while (!ImpDef && Def && Def->isCopy()) {
6792 if (Def->getOperand(1).getReg().isPhysical())
6793 break;
6794 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6795 ImpDef = Def && Def->isImplicitDef();
6796 }
6797 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6798 !ImpDef)
6799 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6800}
6801
6802// Emit the actual waterfall loop, executing the wrapped instruction for each
6803// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6804// iteration, in the worst case we execute 64 (once per lane).
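// Roughly, for each 32-bit scalar operand the loop body emitted below is
// (a sketch of the structure, not the exact MIR):
//   %cur  = V_READFIRSTLANE_B32 %vgpr_op    ; value from the first active lane
//   %cond = V_CMP_EQ_U32 %cur, %vgpr_op     ; lanes holding the same value
//   (all per-operand %cond values are AND'ed together)
//   %save = S_AND_SAVEEXEC %cond            ; restrict EXEC to those lanes
//   EXEC  = S_XOR_term EXEC, %save          ; mark those lanes as done
//   SI_WATERFALL_LOOP %LoopBB               ; loop while any lanes remain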
6805static void
 6806emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
 6807 MachineRegisterInfo &MRI,
 6808 MachineBasicBlock &LoopBB,
6809 MachineBasicBlock &BodyBB,
6810 const DebugLoc &DL,
6811 ArrayRef<MachineOperand *> ScalarOps) {
6812 MachineFunction &MF = *LoopBB.getParent();
6813 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6814 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6815 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6816 unsigned SaveExecOpc =
6817 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
6818 unsigned XorTermOpc =
6819 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
6820 unsigned AndOpc =
6821 ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6822 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6823
 6824 MachineBasicBlock::iterator I = LoopBB.begin();
 6825 Register CondReg;
6826
6827 for (MachineOperand *ScalarOp : ScalarOps) {
6828 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6829 unsigned NumSubRegs = RegSize / 32;
6830 Register VScalarOp = ScalarOp->getReg();
6831
6832 if (NumSubRegs == 1) {
6833 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6834
6835 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6836 .addReg(VScalarOp);
6837
6838 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6839
6840 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6841 .addReg(CurReg)
6842 .addReg(VScalarOp);
6843
6844 // Combine the comparison results with AND.
6845 if (!CondReg) // First.
6846 CondReg = NewCondReg;
6847 else { // If not the first, we create an AND.
6848 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6849 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6850 .addReg(CondReg)
6851 .addReg(NewCondReg);
6852 CondReg = AndReg;
6853 }
6854
6855 // Update ScalarOp operand to use the SGPR ScalarOp.
6856 ScalarOp->setReg(CurReg);
6857 ScalarOp->setIsKill();
6858 } else {
6859 SmallVector<Register, 8> ReadlanePieces;
6860 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6861 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6862 "Unhandled register size");
6863
6864 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6865 Register CurRegLo =
6866 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6867 Register CurRegHi =
6868 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6869
6870 // Read the next variant <- also loop target.
6871 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6872 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6873
6874 // Read the next variant <- also loop target.
6875 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6876 .addReg(VScalarOp, VScalarOpUndef,
6877 TRI->getSubRegFromChannel(Idx + 1));
6878
6879 ReadlanePieces.push_back(CurRegLo);
6880 ReadlanePieces.push_back(CurRegHi);
6881
6882 // Comparison is to be done as 64-bit.
6883 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6884 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6885 .addReg(CurRegLo)
6886 .addImm(AMDGPU::sub0)
6887 .addReg(CurRegHi)
6888 .addImm(AMDGPU::sub1);
6889
6890 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6891 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6892 NewCondReg)
6893 .addReg(CurReg);
6894 if (NumSubRegs <= 2)
6895 Cmp.addReg(VScalarOp);
6896 else
6897 Cmp.addReg(VScalarOp, VScalarOpUndef,
6898 TRI->getSubRegFromChannel(Idx, 2));
6899
6900 // Combine the comparison results with AND.
6901 if (!CondReg) // First.
6902 CondReg = NewCondReg;
6903 else { // If not the first, we create an AND.
6904 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6905 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6906 .addReg(CondReg)
6907 .addReg(NewCondReg);
6908 CondReg = AndReg;
6909 }
6910 } // End for loop.
6911
6912 const auto *SScalarOpRC =
6913 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6914 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6915
6916 // Build scalar ScalarOp.
6917 auto Merge =
6918 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6919 unsigned Channel = 0;
6920 for (Register Piece : ReadlanePieces) {
6921 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6922 }
6923
6924 // Update ScalarOp operand to use the SGPR ScalarOp.
6925 ScalarOp->setReg(SScalarOp);
6926 ScalarOp->setIsKill();
6927 }
6928 }
6929
6930 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6931 MRI.setSimpleHint(SaveExec, CondReg);
6932
6933 // Update EXEC to matching lanes, saving original to SaveExec.
6934 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
6935 .addReg(CondReg, RegState::Kill);
6936
6937 // The original instruction is here; we insert the terminators after it.
6938 I = BodyBB.end();
6939
6940 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6941 BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
6942 .addReg(Exec)
6943 .addReg(SaveExec);
6944
6945 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6946}
6947
6948// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6949// with SGPRs by iterating over all unique values across all lanes.
6950// Returns the loop basic block that now contains \p MI.
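// If \p Begin / \p End are not provided they default to just \p MI; the
// SI_CALL legalization below passes a wider range so that the call-frame
// setup / destroy instructions and the surrounding copies also end up inside
// the loop.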
6951static MachineBasicBlock *
 6952loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
 6953 ArrayRef<MachineOperand *> ScalarOps,
 6954 MachineDominatorTree *MDT,
 6955 MachineBasicBlock::iterator Begin = nullptr,
6956 MachineBasicBlock::iterator End = nullptr) {
6957 MachineBasicBlock &MBB = *MI.getParent();
6958 MachineFunction &MF = *MBB.getParent();
6959 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6960 const SIRegisterInfo *TRI = ST.getRegisterInfo();
 6961 MachineRegisterInfo &MRI = MF.getRegInfo();
 6962 if (!Begin.isValid())
6963 Begin = &MI;
6964 if (!End.isValid()) {
6965 End = &MI;
6966 ++End;
6967 }
6968 const DebugLoc &DL = MI.getDebugLoc();
6969 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6970 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6971 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6972
6973 // Save SCC. Waterfall Loop may overwrite SCC.
6974 Register SaveSCCReg;
6975
6976 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
 6977 // rather than doing an unlimited scan everywhere.
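 // SCC cannot simply be copied, so when it is live here it is materialized
 // into an SGPR with S_CSELECT_B32 1, 0 and re-created after the loop with
 // S_CMP_LG_U32 <saved>, 0 (see the restore at the end of this function).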
6978 bool SCCNotDead =
6979 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
6980 std::numeric_limits<unsigned>::max()) !=
 6981 MachineBasicBlock::LQR_Dead;
 6982 if (SCCNotDead) {
6983 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6984 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
6985 .addImm(1)
6986 .addImm(0);
6987 }
6988
6989 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6990
6991 // Save the EXEC mask
6992 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
6993
6994 // Killed uses in the instruction we are waterfalling around will be
6995 // incorrect due to the added control-flow.
 6996 MachineBasicBlock::iterator AfterMI = MI;
 6997 ++AfterMI;
6998 for (auto I = Begin; I != AfterMI; I++) {
6999 for (auto &MO : I->all_uses())
7000 MRI.clearKillFlags(MO.getReg());
7001 }
7002
7003 // To insert the loop we need to split the block. Move everything after this
7004 // point to a new block, and insert a new empty block between the two.
 7005 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
 7006 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
 7007 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
 7008 MachineFunction::iterator MBBI(MBB);
 7009 ++MBBI;
7010
7011 MF.insert(MBBI, LoopBB);
7012 MF.insert(MBBI, BodyBB);
7013 MF.insert(MBBI, RemainderBB);
7014
7015 LoopBB->addSuccessor(BodyBB);
7016 BodyBB->addSuccessor(LoopBB);
7017 BodyBB->addSuccessor(RemainderBB);
7018
 7019 // Move the instructions in the range [Begin, End) to BodyBB, and the
 7020 // remainder of the block to RemainderBB.
7021 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7022 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7023 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7024
7025 MBB.addSuccessor(LoopBB);
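 // The resulting control flow is, roughly:
 //   MBB -> LoopBB -> BodyBB -> RemainderBB (which takes over MBB's successors)
 // with a back-edge from BodyBB to LoopBB taken while any lanes remain.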
7026
7027 // Update dominators. We know that MBB immediately dominates LoopBB, that
7028 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7029 // RemainderBB. RemainderBB immediately dominates all of the successors
7030 // transferred to it from MBB that MBB used to properly dominate.
7031 if (MDT) {
7032 MDT->addNewBlock(LoopBB, &MBB);
7033 MDT->addNewBlock(BodyBB, LoopBB);
7034 MDT->addNewBlock(RemainderBB, BodyBB);
7035 for (auto &Succ : RemainderBB->successors()) {
7036 if (MDT->properlyDominates(&MBB, Succ)) {
7037 MDT->changeImmediateDominator(Succ, RemainderBB);
7038 }
7039 }
7040 }
7041
7042 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
7043
7044 MachineBasicBlock::iterator First = RemainderBB->begin();
7045 // Restore SCC
7046 if (SCCNotDead) {
7047 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7048 .addReg(SaveSCCReg, RegState::Kill)
7049 .addImm(0);
7050 }
7051
7052 // Restore the EXEC mask
7053 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
7054 return BodyBB;
7055}
7056
7057// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
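// The replacement descriptor keeps only the default data format: words 0-1
// (the 64-bit base pointer) are zeroed and words 2-3 hold the value returned
// by getDefaultRsrcDataFormat(), so callers must add the extracted pointer to
// the address themselves.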
7058static std::tuple<unsigned, unsigned>
 7059extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
 7060 MachineBasicBlock &MBB = *MI.getParent();
 7061 MachineFunction &MF = *MBB.getParent();
 7062 MachineRegisterInfo &MRI = MF.getRegInfo();
 7063
7064 // Extract the ptr from the resource descriptor.
7065 unsigned RsrcPtr =
7066 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7067 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7068
7069 // Create an empty resource descriptor
7070 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7071 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7072 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7073 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7074 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7075
7076 // Zero64 = 0
7077 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7078 .addImm(0);
7079
7080 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7081 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7082 .addImm(Lo_32(RsrcDataFormat));
7083
7084 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7085 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7086 .addImm(Hi_32(RsrcDataFormat));
7087
7088 // NewSRsrc = {Zero64, SRsrcFormat}
7089 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7090 .addReg(Zero64)
7091 .addImm(AMDGPU::sub0_sub1)
7092 .addReg(SRsrcFormatLo)
7093 .addImm(AMDGPU::sub2)
7094 .addReg(SRsrcFormatHi)
7095 .addImm(AMDGPU::sub3);
7096
7097 return std::tuple(RsrcPtr, NewSRsrc);
7098}
7099
 7100MachineBasicBlock *
 7101SIInstrInfo::legalizeOperands(MachineInstr &MI,
 7102 MachineDominatorTree *MDT) const {
 7103 MachineFunction &MF = *MI.getParent()->getParent();
 7104 MachineRegisterInfo &MRI = MF.getRegInfo();
 7105 MachineBasicBlock *CreatedBB = nullptr;
7106
7107 // Legalize VOP2
7108 if (isVOP2(MI) || isVOPC(MI)) {
 7109 legalizeOperandsVOP2(MRI, MI);
 7110 return CreatedBB;
7111 }
7112
7113 // Legalize VOP3
7114 if (isVOP3(MI)) {
 7115 legalizeOperandsVOP3(MRI, MI);
 7116 return CreatedBB;
7117 }
7118
7119 // Legalize SMRD
7120 if (isSMRD(MI)) {
 7121 legalizeOperandsSMRD(MRI, MI);
 7122 return CreatedBB;
7123 }
7124
7125 // Legalize FLAT
7126 if (isFLAT(MI)) {
 7127 legalizeOperandsFLAT(MRI, MI);
 7128 return CreatedBB;
7129 }
7130
7131 // Legalize REG_SEQUENCE and PHI
 7132 // The register class of the operands must be the same type as the register
7133 // class of the output.
7134 if (MI.getOpcode() == AMDGPU::PHI) {
7135 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
7136 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
7137 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
7138 continue;
7139 const TargetRegisterClass *OpRC =
7140 MRI.getRegClass(MI.getOperand(i).getReg());
7141 if (RI.hasVectorRegisters(OpRC)) {
7142 VRC = OpRC;
7143 } else {
7144 SRC = OpRC;
7145 }
7146 }
7147
 7148 // If any of the operands are VGPR registers, then they all must be VGPRs,
 7149 // otherwise we will create illegal VGPR->SGPR copies when legalizing
7150 // them.
7151 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
7152 if (!VRC) {
7153 assert(SRC);
7154 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
7155 VRC = &AMDGPU::VReg_1RegClass;
7156 } else
7157 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7158 ? RI.getEquivalentAGPRClass(SRC)
7159 : RI.getEquivalentVGPRClass(SRC);
7160 } else {
7161 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7162 ? RI.getEquivalentAGPRClass(VRC)
7163 : RI.getEquivalentVGPRClass(VRC);
7164 }
7165 RC = VRC;
7166 } else {
7167 RC = SRC;
7168 }
7169
7170 // Update all the operands so they have the same type.
7171 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7172 MachineOperand &Op = MI.getOperand(I);
7173 if (!Op.isReg() || !Op.getReg().isVirtual())
7174 continue;
7175
7176 // MI is a PHI instruction.
7177 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
 7178 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
 7179
7180 // Avoid creating no-op copies with the same src and dst reg class. These
7181 // confuse some of the machine passes.
7182 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
7183 }
7184 }
7185
7186 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7187 // VGPR dest type and SGPR sources, insert copies so all operands are
7188 // VGPRs. This seems to help operand folding / the register coalescer.
7189 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7190 MachineBasicBlock *MBB = MI.getParent();
7191 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7192 if (RI.hasVGPRs(DstRC)) {
7193 // Update all the operands so they are VGPR register classes. These may
7194 // not be the same register class because REG_SEQUENCE supports mixing
7195 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7196 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7197 MachineOperand &Op = MI.getOperand(I);
7198 if (!Op.isReg() || !Op.getReg().isVirtual())
7199 continue;
7200
7201 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7202 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7203 if (VRC == OpRC)
7204 continue;
7205
7206 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7207 Op.setIsKill();
7208 }
7209 }
7210
7211 return CreatedBB;
7212 }
7213
7214 // Legalize INSERT_SUBREG
7215 // src0 must have the same register class as dst
7216 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7217 Register Dst = MI.getOperand(0).getReg();
7218 Register Src0 = MI.getOperand(1).getReg();
7219 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7220 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7221 if (DstRC != Src0RC) {
7222 MachineBasicBlock *MBB = MI.getParent();
7223 MachineOperand &Op = MI.getOperand(1);
7224 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7225 }
7226 return CreatedBB;
7227 }
7228
7229 // Legalize SI_INIT_M0
7230 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7231 MachineOperand &Src = MI.getOperand(0);
7232 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7233 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7234 return CreatedBB;
7235 }
7236
7237 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7238 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7239 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7240 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7241 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7242 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7243 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7244 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7245 MachineOperand &Src = MI.getOperand(1);
7246 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7247 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7248 return CreatedBB;
7249 }
7250
7251 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7252 //
7253 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7254 // scratch memory access. In both cases, the legalization never involves
7255 // conversion to the addr64 form.
 7256 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
 7257 (isMUBUF(MI) || isMTBUF(MI)))) {
7258 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7259 ? AMDGPU::OpName::rsrc
7260 : AMDGPU::OpName::srsrc;
7261 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7262 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7263 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7264
7265 AMDGPU::OpName SampOpName =
7266 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7267 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7268 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7269 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7270
7271 return CreatedBB;
7272 }
7273
7274 // Legalize SI_CALL
7275 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7276 MachineOperand *Dest = &MI.getOperand(0);
7277 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
 7278 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, and the
 7279 // following copies, into the loop block; copies from and to physical
 7280 // registers also need to be moved into it.
7281 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7282 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7283
7284 // Also move the copies to physical registers into the loop block
7285 MachineBasicBlock &MBB = *MI.getParent();
 7286 MachineBasicBlock::iterator Start(&MI);
 7287 while (Start->getOpcode() != FrameSetupOpcode)
 7288 --Start;
 7289 MachineBasicBlock::iterator End(&MI);
 7290 while (End->getOpcode() != FrameDestroyOpcode)
7291 ++End;
7292 // Also include following copies of the return value
7293 ++End;
7294 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7295 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7296 ++End;
7297 CreatedBB =
7298 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7299 }
7300 }
7301
7302 // Legalize s_sleep_var.
7303 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7304 const DebugLoc &DL = MI.getDebugLoc();
7305 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7306 int Src0Idx =
7307 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7308 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7309 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7310 .add(Src0);
7311 Src0.ChangeToRegister(Reg, false);
7312 return nullptr;
7313 }
7314
7315 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7316 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7317 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7318 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7319 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7320 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7321 for (MachineOperand &Src : MI.explicit_operands()) {
7322 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7323 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7324 }
7325 return CreatedBB;
7326 }
7327
7328 // Legalize MUBUF instructions.
7329 bool isSoffsetLegal = true;
7330 int SoffsetIdx =
7331 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7332 if (SoffsetIdx != -1) {
7333 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7334 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7335 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7336 isSoffsetLegal = false;
7337 }
7338 }
7339
7340 bool isRsrcLegal = true;
7341 int RsrcIdx =
7342 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7343 if (RsrcIdx != -1) {
7344 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7345 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7346 isRsrcLegal = false;
7347 }
7348
7349 // The operands are legal.
7350 if (isRsrcLegal && isSoffsetLegal)
7351 return CreatedBB;
7352
7353 if (!isRsrcLegal) {
7354 // Legalize a VGPR Rsrc
7355 //
7356 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7357 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7358 // a zero-value SRsrc.
7359 //
7360 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7361 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7362 // above.
7363 //
7364 // Otherwise we are on non-ADDR64 hardware, and/or we have
7365 // idxen/offen/bothen and we fall back to a waterfall loop.
7366
7367 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7368 MachineBasicBlock &MBB = *MI.getParent();
7369
7370 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7371 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7372 // This is already an ADDR64 instruction so we need to add the pointer
7373 // extracted from the resource descriptor to the current value of VAddr.
7374 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7375 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7376 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7377
7378 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7379 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7380 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7381
7382 unsigned RsrcPtr, NewSRsrc;
7383 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7384
7385 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7386 const DebugLoc &DL = MI.getDebugLoc();
7387 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7388 .addDef(CondReg0)
7389 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7390 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
7391 .addImm(0);
7392
7393 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7394 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7395 .addDef(CondReg1, RegState::Dead)
7396 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7397 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
7398 .addReg(CondReg0, RegState::Kill)
7399 .addImm(0);
7400
7401 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7402 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7403 .addReg(NewVAddrLo)
7404 .addImm(AMDGPU::sub0)
7405 .addReg(NewVAddrHi)
7406 .addImm(AMDGPU::sub1);
7407
7408 VAddr->setReg(NewVAddr);
7409 Rsrc->setReg(NewSRsrc);
7410 } else if (!VAddr && ST.hasAddr64()) {
 7411 // This instruction is the _OFFSET variant, so we need to convert it to
7412 // ADDR64.
7413 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7414 "FIXME: Need to emit flat atomics here");
7415
7416 unsigned RsrcPtr, NewSRsrc;
7417 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7418
7419 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7420 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7421 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7422 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7423 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7424
7425 // Atomics with return have an additional tied operand and are
7426 // missing some of the special bits.
7427 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7428 MachineInstr *Addr64;
7429
7430 if (!VDataIn) {
7431 // Regular buffer load / store.
 7432 MachineInstrBuilder MIB =
 7433 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7434 .add(*VData)
7435 .addReg(NewVAddr)
7436 .addReg(NewSRsrc)
7437 .add(*SOffset)
7438 .add(*Offset);
7439
7440 if (const MachineOperand *CPol =
7441 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7442 MIB.addImm(CPol->getImm());
7443 }
7444
7445 if (const MachineOperand *TFE =
7446 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7447 MIB.addImm(TFE->getImm());
7448 }
7449
7450 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7451
7452 MIB.cloneMemRefs(MI);
7453 Addr64 = MIB;
7454 } else {
7455 // Atomics with return.
7456 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7457 .add(*VData)
7458 .add(*VDataIn)
7459 .addReg(NewVAddr)
7460 .addReg(NewSRsrc)
7461 .add(*SOffset)
7462 .add(*Offset)
7463 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7464 .cloneMemRefs(MI);
7465 }
7466
7467 MI.removeFromParent();
7468
7469 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7470 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7471 NewVAddr)
7472 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7473 .addImm(AMDGPU::sub0)
7474 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7475 .addImm(AMDGPU::sub1);
7476 } else {
7477 // Legalize a VGPR Rsrc and soffset together.
7478 if (!isSoffsetLegal) {
7479 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7480 CreatedBB =
7481 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7482 return CreatedBB;
7483 }
7484 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7485 return CreatedBB;
7486 }
7487 }
7488
7489 // Legalize a VGPR soffset.
7490 if (!isSoffsetLegal) {
7491 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7492 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7493 return CreatedBB;
7494 }
7495 return CreatedBB;
7496}
7497
 7498void SIInstrWorklist::insert(MachineInstr *MI) {
 7499 InstrList.insert(MI);
 7500 // Add MBUF instructions to the deferred list.
7501 int RsrcIdx =
7502 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7503 if (RsrcIdx != -1) {
7504 DeferredList.insert(MI);
7505 }
7506}
7507
 7508bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
 7509 return DeferredList.contains(MI);
7510}
7511
7512// Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7513// lowering (change sgpr to vgpr).
7514// This is mainly caused by 16-bit SALU and 16-bit VALU using registers with
7515// different sizes. We need to legalize the size of the operands during the
7516// vgpr lowering chain. This can be removed once sgpr16 is in place.
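// For example, a VGPR_16 value feeding an operand that expects a 32-bit VGPR
// is wrapped in a REG_SEQUENCE with an undef hi16 half, while a VGPR_32 value
// feeding a 16-bit operand is accessed through its lo16 subregister.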
7517void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
 7518 MachineRegisterInfo &MRI) const {
7519 if (!ST.useRealTrue16Insts())
7520 return;
7521
7522 unsigned Opcode = MI.getOpcode();
7523 MachineBasicBlock *MBB = MI.getParent();
7524 // Legalize operands and check for size mismatch
7525 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7526 OpIdx >= get(Opcode).getNumOperands() ||
7527 get(Opcode).operands()[OpIdx].RegClass == -1)
7528 return;
7529
7530 MachineOperand &Op = MI.getOperand(OpIdx);
7531 if (!Op.isReg() || !Op.getReg().isVirtual())
7532 return;
7533
7534 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7535 if (!RI.isVGPRClass(CurrRC))
7536 return;
7537
7538 unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
7539 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7540 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7541 Op.setSubReg(AMDGPU::lo16);
7542 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7543 const DebugLoc &DL = MI.getDebugLoc();
7544 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7545 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7546 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7547 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7548 .addReg(Op.getReg())
7549 .addImm(AMDGPU::lo16)
7550 .addReg(Undef)
7551 .addImm(AMDGPU::hi16);
7552 Op.setReg(NewDstReg);
7553 }
7554}
7555void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
 7556 MachineRegisterInfo &MRI) const {
 7557 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
 7558 legalizeOperandsVALUt16(MI, OpIdx, MRI);
 7559}
7560
7561void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
 7562 MachineDominatorTree *MDT) const {
7563
7564 while (!Worklist.empty()) {
7565 MachineInstr &Inst = *Worklist.top();
7566 Worklist.erase_top();
7567 // Skip MachineInstr in the deferred list.
7568 if (Worklist.isDeferred(&Inst))
7569 continue;
7570 moveToVALUImpl(Worklist, MDT, Inst);
7571 }
7572
7573 // Deferred list of instructions will be processed once
7574 // all the MachineInstr in the worklist are done.
7575 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7576 moveToVALUImpl(Worklist, MDT, *Inst);
7577 assert(Worklist.empty() &&
7578 "Deferred MachineInstr are not supposed to re-populate worklist");
7579 }
7580}
7581
7582void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
 7583 MachineDominatorTree *MDT,
 7584 MachineInstr &Inst) const {
 7585
 7586 MachineBasicBlock *MBB = Inst.getParent();
 7587 if (!MBB)
7588 return;
7589 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7590 unsigned Opcode = Inst.getOpcode();
7591 unsigned NewOpcode = getVALUOp(Inst);
7592 // Handle some special cases
7593 switch (Opcode) {
7594 default:
7595 break;
7596 case AMDGPU::S_ADD_I32:
7597 case AMDGPU::S_SUB_I32: {
7598 // FIXME: The u32 versions currently selected use the carry.
7599 bool Changed;
7600 MachineBasicBlock *CreatedBBTmp = nullptr;
7601 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7602 if (Changed)
7603 return;
7604
7605 // Default handling
7606 break;
7607 }
7608
7609 case AMDGPU::S_MUL_U64:
7610 if (ST.hasVectorMulU64()) {
7611 NewOpcode = AMDGPU::V_MUL_U64_e64;
7612 break;
7613 }
7614 // Split s_mul_u64 in 32-bit vector multiplications.
7615 splitScalarSMulU64(Worklist, Inst, MDT);
7616 Inst.eraseFromParent();
7617 return;
7618
7619 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7620 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7621 // This is a special case of s_mul_u64 where all the operands are either
7622 // zero extended or sign extended.
7623 splitScalarSMulPseudo(Worklist, Inst, MDT);
7624 Inst.eraseFromParent();
7625 return;
7626
7627 case AMDGPU::S_AND_B64:
7628 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7629 Inst.eraseFromParent();
7630 return;
7631
7632 case AMDGPU::S_OR_B64:
7633 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7634 Inst.eraseFromParent();
7635 return;
7636
7637 case AMDGPU::S_XOR_B64:
7638 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7639 Inst.eraseFromParent();
7640 return;
7641
7642 case AMDGPU::S_NAND_B64:
7643 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7644 Inst.eraseFromParent();
7645 return;
7646
7647 case AMDGPU::S_NOR_B64:
7648 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7649 Inst.eraseFromParent();
7650 return;
7651
7652 case AMDGPU::S_XNOR_B64:
7653 if (ST.hasDLInsts())
7654 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7655 else
7656 splitScalar64BitXnor(Worklist, Inst, MDT);
7657 Inst.eraseFromParent();
7658 return;
7659
7660 case AMDGPU::S_ANDN2_B64:
7661 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7662 Inst.eraseFromParent();
7663 return;
7664
7665 case AMDGPU::S_ORN2_B64:
7666 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7667 Inst.eraseFromParent();
7668 return;
7669
7670 case AMDGPU::S_BREV_B64:
7671 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7672 Inst.eraseFromParent();
7673 return;
7674
7675 case AMDGPU::S_NOT_B64:
7676 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7677 Inst.eraseFromParent();
7678 return;
7679
7680 case AMDGPU::S_BCNT1_I32_B64:
7681 splitScalar64BitBCNT(Worklist, Inst);
7682 Inst.eraseFromParent();
7683 return;
7684
7685 case AMDGPU::S_BFE_I64:
7686 splitScalar64BitBFE(Worklist, Inst);
7687 Inst.eraseFromParent();
7688 return;
7689
7690 case AMDGPU::S_FLBIT_I32_B64:
7691 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7692 Inst.eraseFromParent();
7693 return;
7694 case AMDGPU::S_FF1_I32_B64:
7695 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7696 Inst.eraseFromParent();
7697 return;
7698
7699 case AMDGPU::S_LSHL_B32:
7700 if (ST.hasOnlyRevVALUShifts()) {
7701 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7702 swapOperands(Inst);
7703 }
7704 break;
7705 case AMDGPU::S_ASHR_I32:
7706 if (ST.hasOnlyRevVALUShifts()) {
7707 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7708 swapOperands(Inst);
7709 }
7710 break;
7711 case AMDGPU::S_LSHR_B32:
7712 if (ST.hasOnlyRevVALUShifts()) {
7713 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7714 swapOperands(Inst);
7715 }
7716 break;
7717 case AMDGPU::S_LSHL_B64:
7718 if (ST.hasOnlyRevVALUShifts()) {
7719 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7720 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7721 : AMDGPU::V_LSHLREV_B64_e64;
7722 swapOperands(Inst);
7723 }
7724 break;
7725 case AMDGPU::S_ASHR_I64:
7726 if (ST.hasOnlyRevVALUShifts()) {
7727 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7728 swapOperands(Inst);
7729 }
7730 break;
7731 case AMDGPU::S_LSHR_B64:
7732 if (ST.hasOnlyRevVALUShifts()) {
7733 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7734 swapOperands(Inst);
7735 }
7736 break;
7737
7738 case AMDGPU::S_ABS_I32:
7739 lowerScalarAbs(Worklist, Inst);
7740 Inst.eraseFromParent();
7741 return;
7742
7743 case AMDGPU::S_CBRANCH_SCC0:
7744 case AMDGPU::S_CBRANCH_SCC1: {
7745 // Clear unused bits of vcc
7746 Register CondReg = Inst.getOperand(1).getReg();
7747 bool IsSCC = CondReg == AMDGPU::SCC;
7748 Register VCC = RI.getVCC();
7749 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
7750 unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
7751 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
7752 .addReg(EXEC)
7753 .addReg(IsSCC ? VCC : CondReg);
7754 Inst.removeOperand(1);
7755 } break;
7756
7757 case AMDGPU::S_BFE_U64:
7758 case AMDGPU::S_BFM_B64:
7759 llvm_unreachable("Moving this op to VALU not implemented");
7760
7761 case AMDGPU::S_PACK_LL_B32_B16:
7762 case AMDGPU::S_PACK_LH_B32_B16:
7763 case AMDGPU::S_PACK_HL_B32_B16:
7764 case AMDGPU::S_PACK_HH_B32_B16:
7765 movePackToVALU(Worklist, MRI, Inst);
7766 Inst.eraseFromParent();
7767 return;
7768
7769 case AMDGPU::S_XNOR_B32:
7770 lowerScalarXnor(Worklist, Inst);
7771 Inst.eraseFromParent();
7772 return;
7773
7774 case AMDGPU::S_NAND_B32:
7775 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7776 Inst.eraseFromParent();
7777 return;
7778
7779 case AMDGPU::S_NOR_B32:
7780 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7781 Inst.eraseFromParent();
7782 return;
7783
7784 case AMDGPU::S_ANDN2_B32:
7785 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7786 Inst.eraseFromParent();
7787 return;
7788
7789 case AMDGPU::S_ORN2_B32:
7790 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7791 Inst.eraseFromParent();
7792 return;
7793
7794 // TODO: remove as soon as everything is ready
7795 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7796 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7797 // can only be selected from the uniform SDNode.
7798 case AMDGPU::S_ADD_CO_PSEUDO:
7799 case AMDGPU::S_SUB_CO_PSEUDO: {
7800 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7801 ? AMDGPU::V_ADDC_U32_e64
7802 : AMDGPU::V_SUBB_U32_e64;
7803 const auto *CarryRC = RI.getWaveMaskRegClass();
7804
7805 Register CarryInReg = Inst.getOperand(4).getReg();
7806 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7807 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7808 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7809 .addReg(CarryInReg);
7810 }
7811
7812 Register CarryOutReg = Inst.getOperand(1).getReg();
7813
7814 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7815 MRI.getRegClass(Inst.getOperand(0).getReg())));
7816 MachineInstr *CarryOp =
7817 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7818 .addReg(CarryOutReg, RegState::Define)
7819 .add(Inst.getOperand(2))
7820 .add(Inst.getOperand(3))
7821 .addReg(CarryInReg)
7822 .addImm(0);
7823 legalizeOperands(*CarryOp);
7824 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7825 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7826 Inst.eraseFromParent();
7827 }
7828 return;
7829 case AMDGPU::S_UADDO_PSEUDO:
7830 case AMDGPU::S_USUBO_PSEUDO: {
7831 const DebugLoc &DL = Inst.getDebugLoc();
7832 MachineOperand &Dest0 = Inst.getOperand(0);
7833 MachineOperand &Dest1 = Inst.getOperand(1);
7834 MachineOperand &Src0 = Inst.getOperand(2);
7835 MachineOperand &Src1 = Inst.getOperand(3);
7836
7837 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7838 ? AMDGPU::V_ADD_CO_U32_e64
7839 : AMDGPU::V_SUB_CO_U32_e64;
7840 const TargetRegisterClass *NewRC =
7841 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7842 Register DestReg = MRI.createVirtualRegister(NewRC);
7843 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7844 .addReg(Dest1.getReg(), RegState::Define)
7845 .add(Src0)
7846 .add(Src1)
7847 .addImm(0); // clamp bit
7848
7849 legalizeOperands(*NewInstr, MDT);
7850 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7851 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7852 Worklist);
7853 Inst.eraseFromParent();
7854 }
7855 return;
7856
7857 case AMDGPU::S_CSELECT_B32:
7858 case AMDGPU::S_CSELECT_B64:
7859 lowerSelect(Worklist, Inst, MDT);
7860 Inst.eraseFromParent();
7861 return;
7862 case AMDGPU::S_CMP_EQ_I32:
7863 case AMDGPU::S_CMP_LG_I32:
7864 case AMDGPU::S_CMP_GT_I32:
7865 case AMDGPU::S_CMP_GE_I32:
7866 case AMDGPU::S_CMP_LT_I32:
7867 case AMDGPU::S_CMP_LE_I32:
7868 case AMDGPU::S_CMP_EQ_U32:
7869 case AMDGPU::S_CMP_LG_U32:
7870 case AMDGPU::S_CMP_GT_U32:
7871 case AMDGPU::S_CMP_GE_U32:
7872 case AMDGPU::S_CMP_LT_U32:
7873 case AMDGPU::S_CMP_LE_U32:
7874 case AMDGPU::S_CMP_EQ_U64:
7875 case AMDGPU::S_CMP_LG_U64:
7876 case AMDGPU::S_CMP_LT_F32:
7877 case AMDGPU::S_CMP_EQ_F32:
7878 case AMDGPU::S_CMP_LE_F32:
7879 case AMDGPU::S_CMP_GT_F32:
7880 case AMDGPU::S_CMP_LG_F32:
7881 case AMDGPU::S_CMP_GE_F32:
7882 case AMDGPU::S_CMP_O_F32:
7883 case AMDGPU::S_CMP_U_F32:
7884 case AMDGPU::S_CMP_NGE_F32:
7885 case AMDGPU::S_CMP_NLG_F32:
7886 case AMDGPU::S_CMP_NGT_F32:
7887 case AMDGPU::S_CMP_NLE_F32:
7888 case AMDGPU::S_CMP_NEQ_F32:
7889 case AMDGPU::S_CMP_NLT_F32: {
7890 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7891 auto NewInstr =
7892 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7893 .setMIFlags(Inst.getFlags());
7894 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
7895 0) {
7896 NewInstr
7897 .addImm(0) // src0_modifiers
7898 .add(Inst.getOperand(0)) // src0
7899 .addImm(0) // src1_modifiers
7900 .add(Inst.getOperand(1)) // src1
7901 .addImm(0); // clamp
7902 } else {
7903 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
7904 }
7905 legalizeOperands(*NewInstr, MDT);
7906 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7907 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7908 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7909 Inst.eraseFromParent();
7910 return;
7911 }
7912 case AMDGPU::S_CMP_LT_F16:
7913 case AMDGPU::S_CMP_EQ_F16:
7914 case AMDGPU::S_CMP_LE_F16:
7915 case AMDGPU::S_CMP_GT_F16:
7916 case AMDGPU::S_CMP_LG_F16:
7917 case AMDGPU::S_CMP_GE_F16:
7918 case AMDGPU::S_CMP_O_F16:
7919 case AMDGPU::S_CMP_U_F16:
7920 case AMDGPU::S_CMP_NGE_F16:
7921 case AMDGPU::S_CMP_NLG_F16:
7922 case AMDGPU::S_CMP_NGT_F16:
7923 case AMDGPU::S_CMP_NLE_F16:
7924 case AMDGPU::S_CMP_NEQ_F16:
7925 case AMDGPU::S_CMP_NLT_F16: {
7926 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7927 auto NewInstr =
7928 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7929 .setMIFlags(Inst.getFlags());
7930 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
7931 NewInstr
7932 .addImm(0) // src0_modifiers
7933 .add(Inst.getOperand(0)) // src0
7934 .addImm(0) // src1_modifiers
7935 .add(Inst.getOperand(1)) // src1
7936 .addImm(0); // clamp
7937 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
7938 NewInstr.addImm(0); // op_sel0
7939 } else {
7940 NewInstr
7941 .add(Inst.getOperand(0))
7942 .add(Inst.getOperand(1));
7943 }
7944 legalizeOperandsVALUt16(*NewInstr, MRI);
7945 legalizeOperands(*NewInstr, MDT);
7946 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7947 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7948 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7949 Inst.eraseFromParent();
7950 return;
7951 }
7952 case AMDGPU::S_CVT_HI_F32_F16: {
7953 const DebugLoc &DL = Inst.getDebugLoc();
7954 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7955 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7956 if (ST.useRealTrue16Insts()) {
7957 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
7958 .add(Inst.getOperand(1));
7959 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7960 .addImm(0) // src0_modifiers
7961 .addReg(TmpReg, 0, AMDGPU::hi16)
7962 .addImm(0) // clamp
7963 .addImm(0) // omod
7964 .addImm(0); // op_sel0
7965 } else {
7966 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7967 .addImm(16)
7968 .add(Inst.getOperand(1));
7969 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7970 .addImm(0) // src0_modifiers
7971 .addReg(TmpReg)
7972 .addImm(0) // clamp
7973 .addImm(0); // omod
7974 }
7975
7976 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7977 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7978 Inst.eraseFromParent();
7979 return;
7980 }
7981 case AMDGPU::S_MINIMUM_F32:
7982 case AMDGPU::S_MAXIMUM_F32: {
7983 const DebugLoc &DL = Inst.getDebugLoc();
7984 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7985 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7986 .addImm(0) // src0_modifiers
7987 .add(Inst.getOperand(1))
7988 .addImm(0) // src1_modifiers
7989 .add(Inst.getOperand(2))
7990 .addImm(0) // clamp
7991 .addImm(0); // omod
7992 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7993
7994 legalizeOperands(*NewInstr, MDT);
7995 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7996 Inst.eraseFromParent();
7997 return;
7998 }
7999 case AMDGPU::S_MINIMUM_F16:
8000 case AMDGPU::S_MAXIMUM_F16: {
8001 const DebugLoc &DL = Inst.getDebugLoc();
8002 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8003 ? &AMDGPU::VGPR_16RegClass
8004 : &AMDGPU::VGPR_32RegClass);
8005 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8006 .addImm(0) // src0_modifiers
8007 .add(Inst.getOperand(1))
8008 .addImm(0) // src1_modifiers
8009 .add(Inst.getOperand(2))
8010 .addImm(0) // clamp
8011 .addImm(0) // omod
8012 .addImm(0); // opsel0
8013 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8014 legalizeOperandsVALUt16(*NewInstr, MRI);
8015 legalizeOperands(*NewInstr, MDT);
8016 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8017 Inst.eraseFromParent();
8018 return;
8019 }
8020 case AMDGPU::V_S_EXP_F16_e64:
8021 case AMDGPU::V_S_LOG_F16_e64:
8022 case AMDGPU::V_S_RCP_F16_e64:
8023 case AMDGPU::V_S_RSQ_F16_e64:
8024 case AMDGPU::V_S_SQRT_F16_e64: {
8025 const DebugLoc &DL = Inst.getDebugLoc();
8026 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8027 ? &AMDGPU::VGPR_16RegClass
8028 : &AMDGPU::VGPR_32RegClass);
8029 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8030 .add(Inst.getOperand(1)) // src0_modifiers
8031 .add(Inst.getOperand(2))
8032 .add(Inst.getOperand(3)) // clamp
8033 .add(Inst.getOperand(4)) // omod
8034 .setMIFlags(Inst.getFlags());
8035 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8036 NewInstr.addImm(0); // opsel0
8037 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8038 legalizeOperandsVALUt16(*NewInstr, MRI);
8039 legalizeOperands(*NewInstr, MDT);
8040 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8041 Inst.eraseFromParent();
8042 return;
8043 }
8044 }
8045
8046 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8047 // We cannot move this instruction to the VALU, so we should try to
8048 // legalize its operands instead.
8049 legalizeOperands(Inst, MDT);
8050 return;
8051 }
8052 // Handle converting generic instructions like COPY-to-SGPR into
8053 // COPY-to-VGPR.
8054 if (NewOpcode == Opcode) {
8055 Register DstReg = Inst.getOperand(0).getReg();
8056 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8057
8058 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8059 // hope for the best.
8060 if (Inst.isCopy() && DstReg.isPhysical() &&
8061 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8062 // TODO: Only works for 32 bit registers.
8063 if (MRI.constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass)) {
8064 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8065 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
8066 .add(Inst.getOperand(1));
8067 } else {
8068 Register NewDst =
8069 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8070 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8071 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
8072 .add(Inst.getOperand(1));
8073 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
8074 DstReg)
8075 .addReg(NewDst);
8076 }
8077 Inst.eraseFromParent();
8078 return;
8079 }
8080
8081 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
8082 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
8083 // Instead of creating a copy where src and dst are the same register
8084 // class, we just replace all uses of dst with src. These kinds of
8085 // copies interfere with the heuristics MachineSink uses to decide
 8086 // whether or not to split a critical edge, since the pass assumes
8087 // that copies will end up as machine instructions and not be
8088 // eliminated.
8089 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8090 Register NewDstReg = Inst.getOperand(1).getReg();
8091 MRI.replaceRegWith(DstReg, NewDstReg);
8092 MRI.clearKillFlags(NewDstReg);
8093 Inst.getOperand(0).setReg(DstReg);
8094 Inst.eraseFromParent();
8095 // Legalize t16 operand since replaceReg is called after addUsersToVALU
8096 for (MachineOperand &MO :
8097 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8098 legalizeOperandsVALUt16(*MO.getParent(), MRI);
8099 }
8100 return;
8101 }
8102
 8103 // If this is a v2s copy between a 16-bit and a 32-bit reg, replace the
 8104 // vgpr copy with a reg_sequence/extract_subreg.
 8105 // This can be removed after we have sgpr16 in place.
8106 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8107 Inst.getOperand(1).getReg().isVirtual() &&
8108 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8109 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8110 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8111 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8112 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8113 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8114 get(AMDGPU::IMPLICIT_DEF), Undef);
8115 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8116 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8117 .addReg(Inst.getOperand(1).getReg())
8118 .addImm(AMDGPU::lo16)
8119 .addReg(Undef)
8120 .addImm(AMDGPU::hi16);
8121 Inst.eraseFromParent();
8122 MRI.replaceRegWith(DstReg, NewDstReg);
8123 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8124 return;
8125 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8126 AMDGPU::lo16)) {
8127 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8128 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8129 MRI.replaceRegWith(DstReg, NewDstReg);
8130 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8131 return;
8132 }
8133 }
8134
8135 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8136 MRI.replaceRegWith(DstReg, NewDstReg);
8137 legalizeOperands(Inst, MDT);
8138 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8139 return;
8140 }
8141
8142 // Use the new VALU Opcode.
8143 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8144 .setMIFlags(Inst.getFlags());
8145 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8146 // Intersperse VOP3 modifiers among the SALU operands.
8147 NewInstr->addOperand(Inst.getOperand(0));
8148 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8149 AMDGPU::OpName::src0_modifiers) >= 0)
8150 NewInstr.addImm(0);
8151 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8152 MachineOperand Src = Inst.getOperand(1);
8153 NewInstr->addOperand(Src);
8154 }
8155
8156 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8157 // We are converting these to a BFE, so we need to add the missing
8158 // operands for the size and offset.
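 // E.g. S_SEXT_I32_I8 becomes a signed BFE of bits [7:0]: offset 0, width 8.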
8159 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8160 NewInstr.addImm(0);
8161 NewInstr.addImm(Size);
8162 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8163 // The VALU version adds the second operand to the result, so insert an
8164 // extra 0 operand.
8165 NewInstr.addImm(0);
8166 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8167 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8168 // If we need to move this to VGPRs, we need to unpack the second
8169 // operand back into the 2 separate ones for bit offset and width.
8170 assert(OffsetWidthOp.isImm() &&
8171 "Scalar BFE is only implemented for constant width and offset");
8172 uint32_t Imm = OffsetWidthOp.getImm();
8173
8174 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8175 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8176 NewInstr.addImm(Offset);
8177 NewInstr.addImm(BitWidth);
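 // E.g. a packed src1 of 0x00080010 unpacks to Offset = 16 and BitWidth = 8,
 // i.e. the BFE extracts bits [23:16] of src0.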
8178 } else {
8179 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8180 AMDGPU::OpName::src1_modifiers) >= 0)
8181 NewInstr.addImm(0);
8182 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8183 NewInstr->addOperand(Inst.getOperand(2));
8184 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8185 AMDGPU::OpName::src2_modifiers) >= 0)
8186 NewInstr.addImm(0);
8187 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8188 NewInstr->addOperand(Inst.getOperand(3));
8189 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8190 NewInstr.addImm(0);
8191 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8192 NewInstr.addImm(0);
8193 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8194 NewInstr.addImm(0);
8195 }
8196 } else {
8197 // Just copy the SALU operands.
8198 for (const MachineOperand &Op : Inst.explicit_operands())
8199 NewInstr->addOperand(Op);
8200 }
8201
8202 // Remove any references to SCC. Vector instructions can't read from it, and
 8203 // we're just about to add the implicit use / defs of VCC, and we don't want
8204 // both.
8205 for (MachineOperand &Op : Inst.implicit_operands()) {
8206 if (Op.getReg() == AMDGPU::SCC) {
8207 // Only propagate through live-def of SCC.
8208 if (Op.isDef() && !Op.isDead())
8209 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8210 if (Op.isUse())
8211 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8212 }
8213 }
8214 Inst.eraseFromParent();
8215 Register NewDstReg;
8216 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8217 Register DstReg = NewInstr->getOperand(0).getReg();
8218 assert(DstReg.isVirtual());
8219 // Update the destination register class.
8220 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8221 assert(NewDstRC);
8222 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8223 MRI.replaceRegWith(DstReg, NewDstReg);
8224 }
8225 fixImplicitOperands(*NewInstr);
8226
8227 legalizeOperandsVALUt16(*NewInstr, MRI);
8228
8229 // Legalize the operands
8230 legalizeOperands(*NewInstr, MDT);
8231 if (NewDstReg)
8232 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8233}
8234
8235// Add/sub require special handling to deal with carry outs.
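// When the subtarget has add-no-carry instructions, S_ADD_I32 / S_SUB_I32 can
// be rewritten in place to V_ADD_U32_e64 / V_SUB_U32_e64 (dropping the SCC def
// and appending the clamp operand); otherwise the caller falls back to the
// default carry-out lowering.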
8236std::pair<bool, MachineBasicBlock *>
8237SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8238 MachineDominatorTree *MDT) const {
8239 if (ST.hasAddNoCarry()) {
8240 // Assume there is no user of scc since we don't select this in that case.
8241 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8242 // is used.
8243
8244 MachineBasicBlock &MBB = *Inst.getParent();
8245 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8246
8247 Register OldDstReg = Inst.getOperand(0).getReg();
8248 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8249
8250 unsigned Opc = Inst.getOpcode();
8251 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8252
8253 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8254 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8255
8256 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8257 Inst.removeOperand(3);
8258
8259 Inst.setDesc(get(NewOpc));
8260 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8261 Inst.addImplicitDefUseOperands(*MBB.getParent());
8262 MRI.replaceRegWith(OldDstReg, ResultReg);
8263 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8264
8265 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8266 return std::pair(true, NewBB);
8267 }
8268
8269 return std::pair(false, nullptr);
8270}
8271
8272void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8273 MachineDominatorTree *MDT) const {
8274
8275 MachineBasicBlock &MBB = *Inst.getParent();
8276 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8277 MachineBasicBlock::iterator MII = Inst;
8278 DebugLoc DL = Inst.getDebugLoc();
8279
8280 MachineOperand &Dest = Inst.getOperand(0);
8281 MachineOperand &Src0 = Inst.getOperand(1);
8282 MachineOperand &Src1 = Inst.getOperand(2);
8283 MachineOperand &Cond = Inst.getOperand(3);
8284
8285 Register CondReg = Cond.getReg();
8286 bool IsSCC = (CondReg == AMDGPU::SCC);
8287
8288 // If this is a trivial select where the condition is effectively not SCC
8289 // (CondReg is a source of copy to SCC), then the select is semantically
 8290 // equivalent to copying CondReg. Hence, there is no need to create a
 8291 // V_CNDMASK; we can just use CondReg and bail out.
8292 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8293 (Src1.getImm() == 0)) {
8294 MRI.replaceRegWith(Dest.getReg(), CondReg);
8295 return;
8296 }
8297
8298 Register NewCondReg = CondReg;
8299 if (IsSCC) {
8300 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8301 NewCondReg = MRI.createVirtualRegister(TC);
8302
 8303 // Now look for the closest SCC def; if it is a copy, replace the
 8304 // CondReg with the COPY source register.
8305 bool CopyFound = false;
8306 for (MachineInstr &CandI :
 8307 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
 8308 Inst.getParent()->rend())) {
8309 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8310 -1) {
8311 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8312 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8313 .addReg(CandI.getOperand(1).getReg());
8314 CopyFound = true;
8315 }
8316 break;
8317 }
8318 }
8319 if (!CopyFound) {
8320 // SCC def is not a copy
8321 // Insert a trivial select instead of creating a copy, because a copy from
8322 // SCC would semantically mean just copying a single bit, but we may need
8323 // the result to be a vector condition mask that needs preserving.
8324 unsigned Opcode =
8325 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8326 auto NewSelect =
8327 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8328 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8329 }
8330 }
8331
8332 Register NewDestReg = MRI.createVirtualRegister(
8333 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8334 MachineInstr *NewInst;
8335 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8336 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8337 .addImm(0)
8338 .add(Src1) // False
8339 .addImm(0)
8340 .add(Src0) // True
8341 .addReg(NewCondReg);
8342 } else {
8343 NewInst =
8344 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8345 .add(Src1) // False
8346 .add(Src0) // True
8347 .addReg(NewCondReg);
8348 }
8349 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8350 legalizeOperands(*NewInst, MDT);
8351 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8352}
8353
8354void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8355 MachineInstr &Inst) const {
8356 MachineBasicBlock &MBB = *Inst.getParent();
8357 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8358 MachineBasicBlock::iterator MII = Inst;
8359 DebugLoc DL = Inst.getDebugLoc();
8360
8361 MachineOperand &Dest = Inst.getOperand(0);
8362 MachineOperand &Src = Inst.getOperand(1);
8363 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8364 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8365
8366 unsigned SubOp = ST.hasAddNoCarry() ?
8367 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8368
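 // abs(x) is lowered as max(x, 0 - x); e.g. for x = -5 this is max(-5, 5) = 5.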
8369 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8370 .addImm(0)
8371 .addReg(Src.getReg());
8372
8373 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8374 .addReg(Src.getReg())
8375 .addReg(TmpReg);
8376
8377 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8378 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8379}
8380
8381void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8382 MachineInstr &Inst) const {
8383 MachineBasicBlock &MBB = *Inst.getParent();
8384 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8385 MachineBasicBlock::iterator MII = Inst;
8386 const DebugLoc &DL = Inst.getDebugLoc();
8387
8388 MachineOperand &Dest = Inst.getOperand(0);
8389 MachineOperand &Src0 = Inst.getOperand(1);
8390 MachineOperand &Src1 = Inst.getOperand(2);
8391
8392 if (ST.hasDLInsts()) {
8393 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8394 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8395 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8396
8397 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8398 .add(Src0)
8399 .add(Src1);
8400
8401 MRI.replaceRegWith(Dest.getReg(), NewDest);
8402 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8403 } else {
8404 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8405 // invert either source and then perform the XOR. If either source is a
8406 // scalar register, then we can leave the inversion on the scalar unit to
8407 // achieve a better distribution of scalar and vector instructions.
8408 bool Src0IsSGPR = Src0.isReg() &&
8409 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8410 bool Src1IsSGPR = Src1.isReg() &&
8411 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8412 MachineInstr *Xor;
8413 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8414 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8415
8416 // Build a pair of scalar instructions and add them to the work list.
8417 // The next iteration over the work list will lower these to the vector
8418 // unit as necessary.
8419 if (Src0IsSGPR) {
8420 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8421 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8422 .addReg(Temp)
8423 .add(Src1);
8424 } else if (Src1IsSGPR) {
8425 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8426 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8427 .add(Src0)
8428 .addReg(Temp);
8429 } else {
8430 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8431 .add(Src0)
8432 .add(Src1);
8433 MachineInstr *Not =
8434 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8435 Worklist.insert(Not);
8436 }
8437
8438 MRI.replaceRegWith(Dest.getReg(), NewDest);
8439
8440 Worklist.insert(Xor);
8441
8442 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8443 }
8444}
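// Worked example of the XNOR identity used above (illustrative only), with
// 32-bit values:
//   x = 0x0000000F, y = 0x000000FF
//   ~(x ^ y) = ~0x000000F0            = 0xFFFFFF0F
//   (~x) ^ y = 0xFFFFFFF0 ^ 0x000000FF = 0xFFFFFF0F
// Inverting either operand before the XOR gives the same result, which is why
// the S_NOT_B32 can stay on the scalar unit when one source is an SGPR.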
8445
8446void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8447 MachineInstr &Inst,
8448 unsigned Opcode) const {
8449 MachineBasicBlock &MBB = *Inst.getParent();
8450 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8451 MachineBasicBlock::iterator MII = Inst;
8452 const DebugLoc &DL = Inst.getDebugLoc();
8453
8454 MachineOperand &Dest = Inst.getOperand(0);
8455 MachineOperand &Src0 = Inst.getOperand(1);
8456 MachineOperand &Src1 = Inst.getOperand(2);
8457
8458 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8459 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8460
8461 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8462 .add(Src0)
8463 .add(Src1);
8464
8465 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8466 .addReg(Interm);
8467
8468 Worklist.insert(&Op);
8469 Worklist.insert(&Not);
8470
8471 MRI.replaceRegWith(Dest.getReg(), NewDest);
8472 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8473}
8474
8475void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8476 MachineInstr &Inst,
8477 unsigned Opcode) const {
8478 MachineBasicBlock &MBB = *Inst.getParent();
8479 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8480 MachineBasicBlock::iterator MII = Inst;
8481 const DebugLoc &DL = Inst.getDebugLoc();
8482
8483 MachineOperand &Dest = Inst.getOperand(0);
8484 MachineOperand &Src0 = Inst.getOperand(1);
8485 MachineOperand &Src1 = Inst.getOperand(2);
8486
8487 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8488 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8489
8490 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8491 .add(Src1);
8492
8493 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8494 .add(Src0)
8495 .addReg(Interm);
8496
8497 Worklist.insert(&Not);
8498 Worklist.insert(&Op);
8499
8500 MRI.replaceRegWith(Dest.getReg(), NewDest);
8501 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8502}
8503
8504void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8505 MachineInstr &Inst, unsigned Opcode,
8506 bool Swap) const {
8507 MachineBasicBlock &MBB = *Inst.getParent();
8508 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8509
8510 MachineOperand &Dest = Inst.getOperand(0);
8511 MachineOperand &Src0 = Inst.getOperand(1);
8512 DebugLoc DL = Inst.getDebugLoc();
8513
8514 MachineBasicBlock::iterator MII = Inst;
8515
8516 const MCInstrDesc &InstDesc = get(Opcode);
8517 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8518 MRI.getRegClass(Src0.getReg()) :
8519 &AMDGPU::SGPR_32RegClass;
8520
8521 const TargetRegisterClass *Src0SubRC =
8522 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8523
8524 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8525 AMDGPU::sub0, Src0SubRC);
8526
8527 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8528 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8529 const TargetRegisterClass *NewDestSubRC =
8530 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8531
8532 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8533 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8534
8535 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8536 AMDGPU::sub1, Src0SubRC);
8537
8538 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8539 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8540
8541 if (Swap)
8542 std::swap(DestSub0, DestSub1);
8543
8544 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8545 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8546 .addReg(DestSub0)
8547 .addImm(AMDGPU::sub0)
8548 .addReg(DestSub1)
8549 .addImm(AMDGPU::sub1);
8550
8551 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8552
8553 Worklist.insert(&LoHalf);
8554 Worklist.insert(&HiHalf);
8555
8556 // We don't need to legalizeOperands here because for a single operand, src0
8557 // will support any kind of input.
8558
8559 // Move all users of this moved value.
8560 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8561}
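// Shape of this split, sketched with hypothetical register names: a 64-bit
// scalar unary op such as S_NOT_B64 becomes two 32-bit VALU ops, one per
// half, glued back together with a REG_SEQUENCE:
//   %lo:vgpr_32 = <32-bit op> %src.sub0
//   %hi:vgpr_32 = <32-bit op> %src.sub1
//   %d:vreg_64  = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1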
8562
8563// There is no vector equivalent of s_mul_u64. For this reason, we need to
8564// split the s_mul_u64 into 32-bit vector multiplications.
8565void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8566 MachineInstr &Inst,
8567 MachineDominatorTree *MDT) const {
8568 MachineBasicBlock &MBB = *Inst.getParent();
8569 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8570
8571 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8572 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8573 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8574
8575 MachineOperand &Dest = Inst.getOperand(0);
8576 MachineOperand &Src0 = Inst.getOperand(1);
8577 MachineOperand &Src1 = Inst.getOperand(2);
8578 const DebugLoc &DL = Inst.getDebugLoc();
8579 MachineBasicBlock::iterator MII = Inst;
8580
8581 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8582 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8583 const TargetRegisterClass *Src0SubRC =
8584 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8585 if (RI.isSGPRClass(Src0SubRC))
8586 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8587 const TargetRegisterClass *Src1SubRC =
8588 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8589 if (RI.isSGPRClass(Src1SubRC))
8590 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8591
8592 // First, we extract the low 32-bit and high 32-bit values from each of the
8593 // operands.
8594 MachineOperand Op0L =
8595 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8596 MachineOperand Op1L =
8597 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8598 MachineOperand Op0H =
8599 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8600 MachineOperand Op1H =
8601 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8602
8603 // The multiplication is done as follows:
8604 //
8605 // Op1H Op1L
8606 // * Op0H Op0L
8607 // --------------------
8608 // Op1H*Op0L Op1L*Op0L
8609 // + Op1H*Op0H Op1L*Op0H
8610 // -----------------------------------------
8611 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8612 //
8613 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
8614 // value and that would overflow.
8615 // The low 32-bit value is Op1L*Op0L.
8616 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
8617
8618 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8619 MachineInstr *Op1L_Op0H =
8620 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8621 .add(Op1L)
8622 .add(Op0H);
8623
8624 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8625 MachineInstr *Op1H_Op0L =
8626 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8627 .add(Op1H)
8628 .add(Op0L);
8629
8630 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8631 MachineInstr *Carry =
8632 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8633 .add(Op1L)
8634 .add(Op0L);
8635
8636 MachineInstr *LoHalf =
8637 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8638 .add(Op1L)
8639 .add(Op0L);
8640
8641 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8642 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8643 .addReg(Op1L_Op0H_Reg)
8644 .addReg(Op1H_Op0L_Reg);
8645
8646 MachineInstr *HiHalf =
8647 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8648 .addReg(AddReg)
8649 .addReg(CarryReg);
8650
8651 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8652 .addReg(DestSub0)
8653 .addImm(AMDGPU::sub0)
8654 .addReg(DestSub1)
8655 .addImm(AMDGPU::sub1);
8656
8657 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8658
8659 // Try to legalize the operands in case we need to swap the order to keep it
8660 // valid.
8661 legalizeOperands(*Op1L_Op0H, MDT);
8662 legalizeOperands(*Op1H_Op0L, MDT);
8663 legalizeOperands(*Carry, MDT);
8664 legalizeOperands(*LoHalf, MDT);
8665 legalizeOperands(*Add, MDT);
8666 legalizeOperands(*HiHalf, MDT);
8667
8668 // Move all users of this moved value.
8669 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8670}
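// Worked example for the schoolbook decomposition above (illustrative):
//   Src0 = 0x0000000200000003 (Op0H = 2, Op0L = 3)
//   Src1 = 0x0000000400000005 (Op1H = 4, Op1L = 5)
//   LoHalf = Op1L*Op0L                       = 15
//   Carry  = mul_hi_u32(5, 3)                = 0
//   HiHalf = Op1H*Op0L + Op1L*Op0H + Carry   = 12 + 10 + 0 = 22
//   Result = 0x000000160000000F = 22 * 2^32 + 15, which matches the low
//   64 bits of the full product; the dropped Op1H*Op0H term only affects
//   bits at position 64 and above.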
8671
8672// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8673// multiplications.
8674void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8675 MachineInstr &Inst,
8676 MachineDominatorTree *MDT) const {
8677 MachineBasicBlock &MBB = *Inst.getParent();
8678 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8679
8680 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8681 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8682 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8683
8684 MachineOperand &Dest = Inst.getOperand(0);
8685 MachineOperand &Src0 = Inst.getOperand(1);
8686 MachineOperand &Src1 = Inst.getOperand(2);
8687 const DebugLoc &DL = Inst.getDebugLoc();
8688 MachineBasicBlock::iterator MII = Inst;
8689
8690 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8691 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8692 const TargetRegisterClass *Src0SubRC =
8693 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8694 if (RI.isSGPRClass(Src0SubRC))
8695 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8696 const TargetRegisterClass *Src1SubRC =
8697 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8698 if (RI.isSGPRClass(Src1SubRC))
8699 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8700
8701 // First, we extract the low 32-bit and high 32-bit values from each of the
8702 // operands.
8703 MachineOperand Op0L =
8704 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8705 MachineOperand Op1L =
8706 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8707
8708 unsigned Opc = Inst.getOpcode();
8709 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8710 ? AMDGPU::V_MUL_HI_U32_e64
8711 : AMDGPU::V_MUL_HI_I32_e64;
8712 MachineInstr *HiHalf =
8713 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8714
8715 MachineInstr *LoHalf =
8716 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8717 .add(Op1L)
8718 .add(Op0L);
8719
8720 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8721 .addReg(DestSub0)
8722 .addImm(AMDGPU::sub0)
8723 .addReg(DestSub1)
8724 .addImm(AMDGPU::sub1);
8725
8726 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8727
8728 // Try to legalize the operands in case we need to swap the order to keep it
8729 // valid.
8730 legalizeOperands(*HiHalf, MDT);
8731 legalizeOperands(*LoHalf, MDT);
8732
8733 // Move all users of this moved value.
8734 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8735}
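// Illustrative example: as selected here, the unsigned pseudo multiplies the
// low 32 bits of each source into a 64-bit result, so
//   aL = 0x80000000, bL = 2  ->  lo = V_MUL_LO_U32 = 0x00000000,
//                                hi = V_MUL_HI_U32 = 0x00000001
// (full product 0x100000000). The signed pseudo uses V_MUL_HI_I32 instead,
// so the same inputs read as (-2^31) * 2 give hi = 0xFFFFFFFF, lo = 0.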
8736
8737void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8738 MachineInstr &Inst, unsigned Opcode,
8739 MachineDominatorTree *MDT) const {
8740 MachineBasicBlock &MBB = *Inst.getParent();
8741 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8742
8743 MachineOperand &Dest = Inst.getOperand(0);
8744 MachineOperand &Src0 = Inst.getOperand(1);
8745 MachineOperand &Src1 = Inst.getOperand(2);
8746 DebugLoc DL = Inst.getDebugLoc();
8747
8748 MachineBasicBlock::iterator MII = Inst;
8749
8750 const MCInstrDesc &InstDesc = get(Opcode);
8751 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8752 MRI.getRegClass(Src0.getReg()) :
8753 &AMDGPU::SGPR_32RegClass;
8754
8755 const TargetRegisterClass *Src0SubRC =
8756 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8757 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8758 MRI.getRegClass(Src1.getReg()) :
8759 &AMDGPU::SGPR_32RegClass;
8760
8761 const TargetRegisterClass *Src1SubRC =
8762 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8763
8764 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8765 AMDGPU::sub0, Src0SubRC);
8766 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8767 AMDGPU::sub0, Src1SubRC);
8768 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8769 AMDGPU::sub1, Src0SubRC);
8770 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8771 AMDGPU::sub1, Src1SubRC);
8772
8773 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8774 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8775 const TargetRegisterClass *NewDestSubRC =
8776 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8777
8778 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8779 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8780 .add(SrcReg0Sub0)
8781 .add(SrcReg1Sub0);
8782
8783 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8784 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8785 .add(SrcReg0Sub1)
8786 .add(SrcReg1Sub1);
8787
8788 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8789 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8790 .addReg(DestSub0)
8791 .addImm(AMDGPU::sub0)
8792 .addReg(DestSub1)
8793 .addImm(AMDGPU::sub1);
8794
8795 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8796
8797 Worklist.insert(&LoHalf);
8798 Worklist.insert(&HiHalf);
8799
8800 // Move all users of this moved value.
8801 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8802}
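// Shape of the split, with hypothetical register names: a 64-bit scalar
// bitwise op such as S_AND_B64 becomes two independent 32-bit VALU ops plus
// a REG_SEQUENCE:
//   %lo:vgpr_32 = V_AND_B32_e64 %a.sub0, %b.sub0
//   %hi:vgpr_32 = V_AND_B32_e64 %a.sub1, %b.sub1
//   %d:vreg_64  = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// This only works for operations that act independently on each 32-bit half;
// carry-propagating operations are handled by separate splitting code.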
8803
8804void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8805 MachineInstr &Inst,
8806 MachineDominatorTree *MDT) const {
8807 MachineBasicBlock &MBB = *Inst.getParent();
8808 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8809
8810 MachineOperand &Dest = Inst.getOperand(0);
8811 MachineOperand &Src0 = Inst.getOperand(1);
8812 MachineOperand &Src1 = Inst.getOperand(2);
8813 const DebugLoc &DL = Inst.getDebugLoc();
8814
8815 MachineBasicBlock::iterator MII = Inst;
8816
8817 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8818
8819 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8820
8821 MachineOperand* Op0;
8822 MachineOperand* Op1;
8823
8824 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8825 Op0 = &Src0;
8826 Op1 = &Src1;
8827 } else {
8828 Op0 = &Src1;
8829 Op1 = &Src0;
8830 }
8831
8832 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8833 .add(*Op0);
8834
8835 Register NewDest = MRI.createVirtualRegister(DestRC);
8836
8837 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8838 .addReg(Interm)
8839 .add(*Op1);
8840
8841 MRI.replaceRegWith(Dest.getReg(), NewDest);
8842
8843 Worklist.insert(&Xor);
8844}
8845
8846void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8847 MachineInstr &Inst) const {
8848 MachineBasicBlock &MBB = *Inst.getParent();
8849 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8850
8851 MachineBasicBlock::iterator MII = Inst;
8852 const DebugLoc &DL = Inst.getDebugLoc();
8853
8854 MachineOperand &Dest = Inst.getOperand(0);
8855 MachineOperand &Src = Inst.getOperand(1);
8856
8857 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8858 const TargetRegisterClass *SrcRC = Src.isReg() ?
8859 MRI.getRegClass(Src.getReg()) :
8860 &AMDGPU::SGPR_32RegClass;
8861
8862 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8863 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8864
8865 const TargetRegisterClass *SrcSubRC =
8866 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8867
8868 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8869 AMDGPU::sub0, SrcSubRC);
8870 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8871 AMDGPU::sub1, SrcSubRC);
8872
8873 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8874
8875 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8876
8877 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8878
8879 // We don't need to legalize operands here. src0 for either instruction can be
8880 // an SGPR, and the second input is unused or determined here.
8881 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8882}
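// Worked example (illustrative): a 64-bit population count of
// 0x0000000F000000FF is computed as
//   Mid    = V_BCNT_U32_B32(lo = 0x000000FF, 0)   = 8
//   Result = V_BCNT_U32_B32(hi = 0x0000000F, Mid) = 4 + 8 = 12
// because V_BCNT_U32_B32 adds the popcount of its first operand to its
// second operand.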
8883
8884void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8885 MachineInstr &Inst) const {
8886 MachineBasicBlock &MBB = *Inst.getParent();
8887 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8888 MachineBasicBlock::iterator MII = Inst;
8889 const DebugLoc &DL = Inst.getDebugLoc();
8890
8891 MachineOperand &Dest = Inst.getOperand(0);
8892 uint32_t Imm = Inst.getOperand(2).getImm();
8893 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8894 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8895
8896 (void) Offset;
8897
8898 // Only sext_inreg cases handled.
8899 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8900 Offset == 0 && "Not implemented");
8901
8902 if (BitWidth < 32) {
8903 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8904 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8905 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8906
8907 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8908 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8909 .addImm(0)
8910 .addImm(BitWidth);
8911
8912 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8913 .addImm(31)
8914 .addReg(MidRegLo);
8915
8916 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8917 .addReg(MidRegLo)
8918 .addImm(AMDGPU::sub0)
8919 .addReg(MidRegHi)
8920 .addImm(AMDGPU::sub1);
8921
8922 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8923 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8924 return;
8925 }
8926
8927 MachineOperand &Src = Inst.getOperand(1);
8928 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8929 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8930
8931 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8932 .addImm(31)
8933 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8934
8935 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8936 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8937 .addImm(AMDGPU::sub0)
8938 .addReg(TmpReg)
8939 .addImm(AMDGPU::sub1);
8940
8941 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8942 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8943}
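// Worked example (illustrative) for the BitWidth < 32 path: an S_BFE_I64
// sign-extending the low 8 bits of a value whose low dword is 0x00000080:
//   MidRegLo = V_BFE_I32(0x00000080, 0, 8)        = 0xFFFFFF80
//   MidRegHi = 0xFFFFFF80 >> 31 (arithmetic)      = 0xFFFFFFFF
// giving the 64-bit result 0xFFFFFFFFFFFFFF80, i.e. sext_inreg from i8.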
8944
8945void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8946 MachineInstr &Inst, unsigned Opcode,
8947 MachineDominatorTree *MDT) const {
8948 // (S_FLBIT_I32_B64 hi:lo) ->
8949 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8950 // (S_FF1_I32_B64 hi:lo) ->
8951 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
8952
8953 MachineBasicBlock &MBB = *Inst.getParent();
8954 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8955 MachineBasicBlock::iterator MII = Inst;
8956 const DebugLoc &DL = Inst.getDebugLoc();
8957
8958 MachineOperand &Dest = Inst.getOperand(0);
8959 MachineOperand &Src = Inst.getOperand(1);
8960
8961 const MCInstrDesc &InstDesc = get(Opcode);
8962
8963 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8964 unsigned OpcodeAdd =
8965 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8966
8967 const TargetRegisterClass *SrcRC =
8968 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
8969 const TargetRegisterClass *SrcSubRC =
8970 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8971
8972 MachineOperand SrcRegSub0 =
8973 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
8974 MachineOperand SrcRegSub1 =
8975 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
8976
8977 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8978 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8979 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8980 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8981
8982 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
8983
8984 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
8985
8986 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
8987 .addReg(IsCtlz ? MidReg1 : MidReg2)
8988 .addImm(32)
8989 .addImm(1); // enable clamp
8990
8991 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
8992 .addReg(MidReg3)
8993 .addReg(IsCtlz ? MidReg2 : MidReg1);
8994
8995 MRI.replaceRegWith(Dest.getReg(), MidReg4);
8996
8997 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
8998}
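// Worked example (illustrative) for the ctlz form: counting leading zeros of
// the 64-bit value 0x000000000000FF00 with the expansion above gives
//   ffbh(lo = 0x0000FF00) = 16, uaddsat(16, 32) = 48
//   ffbh(hi = 0x00000000) = 0xFFFFFFFF (no bit found)
//   umin(48, 0xFFFFFFFF)  = 48
// which matches the expected count of 48 leading zeros for the 64-bit value.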
8999
9000void SIInstrInfo::addUsersToMoveToVALUWorklist(
9001 Register DstReg, MachineRegisterInfo &MRI,
9002 SIInstrWorklist &Worklist) const {
9003 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9004 MachineInstr &UseMI = *MO.getParent();
9005
9006 unsigned OpNo = 0;
9007
9008 switch (UseMI.getOpcode()) {
9009 case AMDGPU::COPY:
9010 case AMDGPU::WQM:
9011 case AMDGPU::SOFT_WQM:
9012 case AMDGPU::STRICT_WWM:
9013 case AMDGPU::STRICT_WQM:
9014 case AMDGPU::REG_SEQUENCE:
9015 case AMDGPU::PHI:
9016 case AMDGPU::INSERT_SUBREG:
9017 break;
9018 default:
9019 OpNo = MO.getOperandNo();
9020 break;
9021 }
9022
9023 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo)))
9024 Worklist.insert(&UseMI);
9025 else
9026 // Legalization could change user list.
9028 }
9029}
9030
9031void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9032 MachineRegisterInfo &MRI,
9033 MachineInstr &Inst) const {
9034 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9035 MachineBasicBlock *MBB = Inst.getParent();
9036 MachineOperand &Src0 = Inst.getOperand(1);
9037 MachineOperand &Src1 = Inst.getOperand(2);
9038 const DebugLoc &DL = Inst.getDebugLoc();
9039
9040 switch (Inst.getOpcode()) {
9041 case AMDGPU::S_PACK_LL_B32_B16: {
9042 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9043 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9044
9045 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9046 // 0.
9047 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9048 .addImm(0xffff);
9049
9050 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9051 .addReg(ImmReg, RegState::Kill)
9052 .add(Src0);
9053
9054 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9055 .add(Src1)
9056 .addImm(16)
9057 .addReg(TmpReg, RegState::Kill);
9058 break;
9059 }
9060 case AMDGPU::S_PACK_LH_B32_B16: {
9061 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9062 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9063 .addImm(0xffff);
9064 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9065 .addReg(ImmReg, RegState::Kill)
9066 .add(Src0)
9067 .add(Src1);
9068 break;
9069 }
9070 case AMDGPU::S_PACK_HL_B32_B16: {
9071 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9072 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9073 .addImm(16)
9074 .add(Src0);
9075 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9076 .add(Src1)
9077 .addImm(16)
9078 .addReg(TmpReg, RegState::Kill);
9079 break;
9080 }
9081 case AMDGPU::S_PACK_HH_B32_B16: {
9082 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9083 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9084 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9085 .addImm(16)
9086 .add(Src0);
9087 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9088 .addImm(0xffff0000);
9089 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9090 .add(Src1)
9091 .addReg(ImmReg, RegState::Kill)
9092 .addReg(TmpReg, RegState::Kill);
9093 break;
9094 }
9095 default:
9096 llvm_unreachable("unhandled s_pack_* instruction");
9097 }
9098
9099 MachineOperand &Dest = Inst.getOperand(0);
9100 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9101 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9102}
9103
9104void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
9105 MachineInstr &SCCDefInst,
9106 SIInstrWorklist &Worklist,
9107 Register NewCond) const {
9108
9109 // Ensure that def inst defines SCC, which is still live.
9110 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9111 !Op.isDead() && Op.getParent() == &SCCDefInst);
9112 SmallVector<MachineInstr *, 4> CopyToDelete;
9113 // This assumes that all the users of SCC are in the same block
9114 // as the SCC def.
9115 for (MachineInstr &MI : // Skip the def inst itself.
9116 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9117 SCCDefInst.getParent()->end())) {
9118 // Check if SCC is used first.
9119 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9120 if (SCCIdx != -1) {
9121 if (MI.isCopy()) {
9122 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9123 Register DestReg = MI.getOperand(0).getReg();
9124
9125 MRI.replaceRegWith(DestReg, NewCond);
9126 CopyToDelete.push_back(&MI);
9127 } else {
9128
9129 if (NewCond.isValid())
9130 MI.getOperand(SCCIdx).setReg(NewCond);
9131
9132 Worklist.insert(&MI);
9133 }
9134 }
9135 // Exit if we find another SCC def.
9136 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9137 break;
9138 }
9139 for (auto &Copy : CopyToDelete)
9140 Copy->eraseFromParent();
9141}
9142
9143// Instructions that use SCC may be converted to VALU instructions. When that
9144// happens, the SCC register is changed to VCC_LO. The instruction that defines
9145// SCC must be changed to an instruction that defines VCC. This function makes
9146// sure that the instruction that defines SCC is added to the moveToVALU
9147// worklist.
9148void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9149 SIInstrWorklist &Worklist) const {
9150 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9151 // then there is nothing to do because the defining instruction has been
9152 // converted to a VALU already. If SCC then that instruction needs to be
9153 // converted to a VALU.
9154 for (MachineInstr &MI :
9155 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9156 SCCUseInst->getParent()->rend())) {
9157 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9158 break;
9159 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9160 Worklist.insert(&MI);
9161 break;
9162 }
9163 }
9164}
9165
9166const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9167 const MachineInstr &Inst) const {
9168 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9169
9170 switch (Inst.getOpcode()) {
9171 // For target instructions, getOpRegClass just returns the virtual register
9172 // class associated with the operand, so we need to find an equivalent VGPR
9173 // register class in order to move the instruction to the VALU.
9174 case AMDGPU::COPY:
9175 case AMDGPU::PHI:
9176 case AMDGPU::REG_SEQUENCE:
9177 case AMDGPU::INSERT_SUBREG:
9178 case AMDGPU::WQM:
9179 case AMDGPU::SOFT_WQM:
9180 case AMDGPU::STRICT_WWM:
9181 case AMDGPU::STRICT_WQM: {
9182 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9183 if (RI.isAGPRClass(SrcRC)) {
9184 if (RI.isAGPRClass(NewDstRC))
9185 return nullptr;
9186
9187 switch (Inst.getOpcode()) {
9188 case AMDGPU::PHI:
9189 case AMDGPU::REG_SEQUENCE:
9190 case AMDGPU::INSERT_SUBREG:
9191 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9192 break;
9193 default:
9194 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9195 }
9196
9197 if (!NewDstRC)
9198 return nullptr;
9199 } else {
9200 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9201 return nullptr;
9202
9203 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9204 if (!NewDstRC)
9205 return nullptr;
9206 }
9207
9208 return NewDstRC;
9209 }
9210 default:
9211 return NewDstRC;
9212 }
9213}
9214
9215// Find the one SGPR operand we are allowed to use.
9216Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9217 int OpIndices[3]) const {
9218 const MCInstrDesc &Desc = MI.getDesc();
9219
9220 // Find the one SGPR operand we are allowed to use.
9221 //
9222 // First we need to consider the instruction's operand requirements before
9223 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9224 // of VCC, but we are still bound by the constant bus requirement to only use
9225 // one.
9226 //
9227 // If the operand's class is an SGPR, we can never move it.
9228
9229 Register SGPRReg = findImplicitSGPRRead(MI);
9230 if (SGPRReg)
9231 return SGPRReg;
9232
9233 Register UsedSGPRs[3] = {Register()};
9234 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9235
9236 for (unsigned i = 0; i < 3; ++i) {
9237 int Idx = OpIndices[i];
9238 if (Idx == -1)
9239 break;
9240
9241 const MachineOperand &MO = MI.getOperand(Idx);
9242 if (!MO.isReg())
9243 continue;
9244
9245 // Is this operand statically required to be an SGPR based on the operand
9246 // constraints?
9247 const TargetRegisterClass *OpRC =
9248 RI.getRegClass(Desc.operands()[Idx].RegClass);
9249 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9250 if (IsRequiredSGPR)
9251 return MO.getReg();
9252
9253 // If this could be a VGPR or an SGPR, check the dynamic register class.
9254 Register Reg = MO.getReg();
9255 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9256 if (RI.isSGPRClass(RegRC))
9257 UsedSGPRs[i] = Reg;
9258 }
9259
9260 // We don't have a required SGPR operand, so we have a bit more freedom in
9261 // selecting operands to move.
9262
9263 // Try to select the most used SGPR. If an SGPR is equal to one of the
9264 // others, we choose that.
9265 //
9266 // e.g.
9267 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9268 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9269
9270 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9271 // prefer those.
9272
9273 if (UsedSGPRs[0]) {
9274 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9275 SGPRReg = UsedSGPRs[0];
9276 }
9277
9278 if (!SGPRReg && UsedSGPRs[1]) {
9279 if (UsedSGPRs[1] == UsedSGPRs[2])
9280 SGPRReg = UsedSGPRs[1];
9281 }
9282
9283 return SGPRReg;
9284}
9285
9286MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9287 AMDGPU::OpName OperandName) const {
9288 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9289 return nullptr;
9290
9291 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9292 if (Idx == -1)
9293 return nullptr;
9294
9295 return &MI.getOperand(Idx);
9296}
9297
9298uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9299 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9300 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9301 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9302 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9303 return (Format << 44) |
9304 (1ULL << 56) | // RESOURCE_LEVEL = 1
9305 (3ULL << 60); // OOB_SELECT = 3
9306 }
9307
9308 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9309 if (ST.isAmdHsaOS()) {
9310 // Set ATC = 1. GFX9 doesn't have this bit.
9311 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9312 RsrcDataFormat |= (1ULL << 56);
9313
9314 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9315 // BTW, it disables TC L2 and therefore decreases performance.
9316 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9317 RsrcDataFormat |= (2ULL << 59);
9318 }
9319
9320 return RsrcDataFormat;
9321}
9322
9326 0xffffffff; // Size;
9327
9328 // GFX9 doesn't have ELEMENT_SIZE.
9329 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9330 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9331 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9332 }
9333
9334 // IndexStride = 64 / 32.
9335 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9336 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9337
9338 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9339 // Clear them unless we want a huge stride.
9340 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9341 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9342 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9343
9344 return Rsrc23;
9345}
9346
9347bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
9348 unsigned Opc = MI.getOpcode();
9349
9350 return isSMRD(Opc);
9351}
9352
9353bool SIInstrInfo::isHighLatencyDef(int Opc) const {
9354 return get(Opc).mayLoad() &&
9355 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9356}
9357
9358Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
9359 int &FrameIndex) const {
9360 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9361 if (!Addr || !Addr->isFI())
9362 return Register();
9363
9364 assert(!MI.memoperands_empty() &&
9365 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9366
9367 FrameIndex = Addr->getIndex();
9368 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
9369}
9370
9371Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
9372 int &FrameIndex) const {
9373 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9374 assert(Addr && Addr->isFI());
9375 FrameIndex = Addr->getIndex();
9376 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
9377}
9378
9379Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9380 int &FrameIndex) const {
9381 if (!MI.mayLoad())
9382 return Register();
9383
9384 if (isMUBUF(MI) || isVGPRSpill(MI))
9385 return isStackAccess(MI, FrameIndex);
9386
9387 if (isSGPRSpill(MI))
9388 return isSGPRStackAccess(MI, FrameIndex);
9389
9390 return Register();
9391}
9392
9393Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9394 int &FrameIndex) const {
9395 if (!MI.mayStore())
9396 return Register();
9397
9398 if (isMUBUF(MI) || isVGPRSpill(MI))
9399 return isStackAccess(MI, FrameIndex);
9400
9401 if (isSGPRSpill(MI))
9402 return isSGPRStackAccess(MI, FrameIndex);
9403
9404 return Register();
9405}
9406
9407unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9408 unsigned Size = 0;
9409 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9410 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9411 while (++I != E && I->isInsideBundle()) {
9412 assert(!I->isBundle() && "No nested bundle!");
9413 Size += getInstSizeInBytes(*I);
9414 }
9415
9416 return Size;
9417}
9418
9419unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9420 unsigned Opc = MI.getOpcode();
9421 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
9422 unsigned DescSize = Desc.getSize();
9423
9424 // If we have a definitive size, we can use it. Otherwise we need to inspect
9425 // the operands to know the size.
9426 if (isFixedSize(MI)) {
9427 unsigned Size = DescSize;
9428
9429 // If we hit the buggy offset, an extra nop will be inserted in MC so
9430 // estimate the worst case.
9431 if (MI.isBranch() && ST.hasOffset3fBug())
9432 Size += 4;
9433
9434 return Size;
9435 }
9436
9437 // Instructions may have a 32-bit literal encoded after them. Check
9438 // operands that could ever be literals.
9439 if (isVALU(MI) || isSALU(MI)) {
9440 if (isDPP(MI))
9441 return DescSize;
9442 bool HasLiteral = false;
9443 unsigned LiteralSize = 4;
9444 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9445 const MachineOperand &Op = MI.getOperand(I);
9446 const MCOperandInfo &OpInfo = Desc.operands()[I];
9447 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9448 HasLiteral = true;
9449 if (ST.has64BitLiterals()) {
9450 switch (OpInfo.OperandType) {
9451 default:
9452 break;
9454 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9455 LiteralSize = 8;
9456 break;
9458 if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
9459 LiteralSize = 8;
9460 break;
9461 }
9462 }
9463 break;
9464 }
9465 }
9466 return HasLiteral ? DescSize + LiteralSize : DescSize;
9467 }
9468
9469 // Check whether we have extra NSA words.
9470 if (isMIMG(MI)) {
9471 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9472 if (VAddr0Idx < 0)
9473 return 8;
9474
9475 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9476 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9477 }
9478
9479 switch (Opc) {
9480 case TargetOpcode::BUNDLE:
9481 return getInstBundleSize(MI);
9482 case TargetOpcode::INLINEASM:
9483 case TargetOpcode::INLINEASM_BR: {
9484 const MachineFunction *MF = MI.getParent()->getParent();
9485 const char *AsmStr = MI.getOperand(0).getSymbolName();
9486 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9487 }
9488 default:
9489 if (MI.isMetaInstruction())
9490 return 0;
9491
9492 // If D16 Pseudo inst, get correct MC code size
9493 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9494 if (D16Info) {
9495 // Assume the d16_lo and d16_hi instructions are always the same size.
9496 unsigned LoInstOpcode = D16Info->LoOp;
9497 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9498 DescSize = Desc.getSize();
9499 }
9500
9501 return DescSize;
9502 }
9503}
9504
9506 if (!isFLAT(MI))
9507 return false;
9508
9509 if (MI.memoperands_empty())
9510 return true;
9511
9512 for (const MachineMemOperand *MMO : MI.memoperands()) {
9513 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9514 return true;
9515 }
9516 return false;
9517}
9518
9521 static const std::pair<int, const char *> TargetIndices[] = {
9522 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9523 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9524 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9525 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9526 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9527 return ArrayRef(TargetIndices);
9528}
9529
9530/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
9531/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9537
9538/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9539/// pass.
9544
9545// Called during:
9546// - pre-RA scheduling and post-RA scheduling
9547ScheduleHazardRecognizer *
9548SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
9549 const ScheduleDAGMI *DAG) const {
9550 // Borrowed from Arm Target
9551 // We would like to restrict this hazard recognizer to only
9552 // post-RA scheduling; we can tell that we're post-RA because we don't
9553 // track VRegLiveness.
9554 if (!DAG->hasVRegLiveness())
9555 return new GCNHazardRecognizer(DAG->MF);
9556 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
9557}
9558
9559std::pair<unsigned, unsigned>
9560SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9561 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9562}
9563
9566 static const std::pair<unsigned, const char *> TargetFlags[] = {
9567 {MO_GOTPCREL, "amdgpu-gotprel"},
9568 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9569 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9570 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9571 {MO_REL32_LO, "amdgpu-rel32-lo"},
9572 {MO_REL32_HI, "amdgpu-rel32-hi"},
9573 {MO_REL64, "amdgpu-rel64"},
9574 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9575 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9576 {MO_ABS64, "amdgpu-abs64"},
9577 };
9578
9579 return ArrayRef(TargetFlags);
9580}
9581
9584 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9585 {
9586 {MONoClobber, "amdgpu-noclobber"},
9587 {MOLastUse, "amdgpu-last-use"},
9588 {MOCooperative, "amdgpu-cooperative"},
9589 };
9590
9591 return ArrayRef(TargetFlags);
9592}
9593
9595 const MachineFunction &MF) const {
9597 assert(SrcReg.isVirtual());
9598 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9599 return AMDGPU::WWM_COPY;
9600
9601 return AMDGPU::COPY;
9602}
9603
9605 Register Reg) const {
9606 // We need to handle instructions which may be inserted during register
9607 // allocation to handle the prolog. The initial prolog instruction may have
9608 // been separated from the start of the block by spills and copies that were
9609 // inserted for the prolog. However, the insertions for scalar registers can
9610 // always be placed at the BB top as they are independent of the exec mask
9611 // value.
9612 const MachineFunction *MF = MI.getParent()->getParent();
9613 bool IsNullOrVectorRegister = true;
9614 if (Reg) {
9615 const MachineRegisterInfo &MRI = MF->getRegInfo();
9616 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9617 }
9618
9619 uint16_t Opcode = MI.getOpcode();
9621 return IsNullOrVectorRegister &&
9622 (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
9623 (Opcode == AMDGPU::IMPLICIT_DEF &&
9624 MFI->isWWMReg(MI.getOperand(0).getReg())) ||
9625 (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
9626 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9627}
9628
9632 const DebugLoc &DL,
9633 Register DestReg) const {
9634 if (ST.hasAddNoCarry())
9635 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9636
9637 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9638 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9639 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9640
9641 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9642 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9643}
9644
9647 const DebugLoc &DL,
9648 Register DestReg,
9649 RegScavenger &RS) const {
9650 if (ST.hasAddNoCarry())
9651 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9652
9653 // If available, prefer to use vcc.
9654 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9655 ? Register(RI.getVCC())
9657 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9658 0, /* AllowSpill */ false);
9659
9660 // TODO: Users need to deal with this.
9661 if (!UnusedCarry.isValid())
9662 return MachineInstrBuilder();
9663
9664 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9665 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9666}
9667
9668bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9669 switch (Opcode) {
9670 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9671 case AMDGPU::SI_KILL_I1_TERMINATOR:
9672 return true;
9673 default:
9674 return false;
9675 }
9676}
9677
9679 switch (Opcode) {
9680 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9681 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9682 case AMDGPU::SI_KILL_I1_PSEUDO:
9683 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9684 default:
9685 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9686 }
9687}
9688
9689bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9690 return Imm <= getMaxMUBUFImmOffset(ST);
9691}
9692
9694 // The GFX12 field is a 24-bit signed byte offset; only its non-negative range is usable here.
9695 const unsigned OffsetBits =
9696 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9697 return (1 << OffsetBits) - 1;
9698}
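// For reference, plugging the field widths into the formula above gives a
// maximum immediate offset of (1 << 12) - 1 = 4095 bytes before GFX12 and
// (1 << 23) - 1 = 8388607 bytes on GFX12.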
9699
9701 if (!ST.isWave32())
9702 return;
9703
9704 if (MI.isInlineAsm())
9705 return;
9706
9707 for (auto &Op : MI.implicit_operands()) {
9708 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9709 Op.setReg(AMDGPU::VCC_LO);
9710 }
9711}
9712
9714 if (!isSMRD(MI))
9715 return false;
9716
9717 // Check that it is using a buffer resource.
9718 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9719 if (Idx == -1) // e.g. s_memtime
9720 return false;
9721
9722 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
9723 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9724}
9725
9726// Given Imm, split it into the values to put into the SOffset and ImmOffset
9727// fields in an MUBUF instruction. Return false if it is not possible (due to a
9728// hardware bug needing a workaround).
9729//
9730// The required alignment ensures that individual address components remain
9731// aligned if they are aligned to begin with. It also ensures that additional
9732// offsets within the given alignment can be added to the resulting ImmOffset.
9734 uint32_t &ImmOffset, Align Alignment) const {
9735 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9736 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9737 uint32_t Overflow = 0;
9738
9739 if (Imm > MaxImm) {
9740 if (Imm <= MaxImm + 64) {
9741 // Use an SOffset inline constant for 4..64
9742 Overflow = Imm - MaxImm;
9743 Imm = MaxImm;
9744 } else {
9745 // Try to keep the same value in SOffset for adjacent loads, so that
9746 // the corresponding register contents can be re-used.
9747 //
9748 // Load values with all low-bits (except for alignment bits) set into
9749 // SOffset, so that a larger range of values can be covered using
9750 // s_movk_i32.
9751 //
9752 // Atomic operations fail to work correctly when individual address
9753 // components are unaligned, even if their sum is aligned.
9754 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9755 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9756 Imm = Low;
9757 Overflow = High - Alignment.value();
9758 }
9759 }
9760
9761 if (Overflow > 0) {
9762 // There is a hardware bug in SI and CI which prevents address clamping in
9763 // MUBUF instructions from working correctly with SOffsets. The immediate
9764 // offset is unaffected.
9765 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
9766 return false;
9767
9768 // It is not possible to set immediate in SOffset field on some targets.
9769 if (ST.hasRestrictedSOffset())
9770 return false;
9771 }
9772
9773 ImmOffset = Imm;
9774 SOffset = Overflow;
9775 return true;
9776}
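// Worked example (illustrative), assuming MaxOffset = 4095 and a 4-byte
// alignment, so MaxImm = 4092:
//   Imm = 4100: 4100 <= 4092 + 64, so SOffset = 8 and ImmOffset = 4092.
//   Imm = 9000: High = 9004 & ~4095 = 8192, Low = 9004 & 4095 = 812,
//               so ImmOffset = 812 and SOffset = 8192 - 4 = 8188.
// In both cases SOffset + ImmOffset reconstructs the original Imm.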
9777
9778// Depending on the used address space and instructions, some immediate offsets
9779// are allowed and some are not.
9780// Pre-GFX12, flat instruction offsets can only be non-negative, global and
9781// scratch instruction offsets can also be negative. On GFX12, offsets can be
9782// negative for all variants.
9783//
9784// There are several bugs related to these offsets:
9785// On gfx10.1, flat instructions that go into the global address space cannot
9786// use an offset.
9787//
9788// For scratch instructions, the address can be either an SGPR or a VGPR.
9789// The following offsets can be used, depending on the architecture (x means
9790// cannot be used):
9791// +----------------------------+------+------+
9792// | Address-Mode | SGPR | VGPR |
9793// +----------------------------+------+------+
9794// | gfx9 | | |
9795// | negative, 4-aligned offset | x | ok |
9796// | negative, unaligned offset | x | ok |
9797// +----------------------------+------+------+
9798// | gfx10 | | |
9799// | negative, 4-aligned offset | ok | ok |
9800// | negative, unaligned offset | ok | x |
9801// +----------------------------+------+------+
9802// | gfx10.3 | | |
9803// | negative, 4-aligned offset | ok | ok |
9804// | negative, unaligned offset | ok | ok |
9805// +----------------------------+------+------+
9806//
9807// This function ignores the addressing mode, so if an offset cannot be used in
9808// one addressing mode, it is considered illegal.
9809bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9810 uint64_t FlatVariant) const {
9811 // TODO: Should 0 be special cased?
9812 if (!ST.hasFlatInstOffsets())
9813 return false;
9814
9815 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9816 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9817 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9818 return false;
9819
9820 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9821 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9822 (Offset % 4) != 0) {
9823 return false;
9824 }
9825
9826 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9827 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9828 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9829}
9830
9831// See the comment on SIInstrInfo::isLegalFLATOffset for what is legal and what is not.
9832std::pair<int64_t, int64_t>
9833SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9834 uint64_t FlatVariant) const {
9835 int64_t RemainderOffset = COffsetVal;
9836 int64_t ImmField = 0;
9837
9838 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9839 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9840
9841 if (AllowNegative) {
9842 // Use signed division by a power of two to truncate towards 0.
9843 int64_t D = 1LL << NumBits;
9844 RemainderOffset = (COffsetVal / D) * D;
9845 ImmField = COffsetVal - RemainderOffset;
9846
9847 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9848 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9849 (ImmField % 4) != 0) {
9850 // Make ImmField a multiple of 4
9851 RemainderOffset += ImmField % 4;
9852 ImmField -= ImmField % 4;
9853 }
9854 } else if (COffsetVal >= 0) {
9855 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9856 RemainderOffset = COffsetVal - ImmField;
9857 }
9858
9859 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9860 assert(RemainderOffset + ImmField == COffsetVal);
9861 return {ImmField, RemainderOffset};
9862}
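// Worked example (illustrative), assuming a 13-bit signed immediate field so
// that NumBits = 12 and D = 4096 (the real field width depends on the
// subtarget and flat variant):
//   COffsetVal =  10000: RemainderOffset =  8192, ImmField = 1808
//   COffsetVal =  -5000: RemainderOffset = -4096, ImmField = -904
// In each case ImmField fits the signed field and
// RemainderOffset + ImmField equals the original offset.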
9863
9865 if (ST.hasNegativeScratchOffsetBug() &&
9866 FlatVariant == SIInstrFlags::FlatScratch)
9867 return false;
9868
9869 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9870}
9871
9872static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9873 switch (ST.getGeneration()) {
9874 default:
9875 break;
9878 return SIEncodingFamily::SI;
9881 return SIEncodingFamily::VI;
9887 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
9889 }
9890 llvm_unreachable("Unknown subtarget generation!");
9891}
9892
9893bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9894 switch(MCOp) {
9895 // These opcodes use indirect register addressing so
9896 // they need special handling by codegen (currently missing).
9897 // Therefore it is too risky to allow these opcodes
9898 // to be selected by dpp combiner or sdwa peepholer.
9899 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9900 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9901 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9902 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9903 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9904 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9905 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9906 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9907 return true;
9908 default:
9909 return false;
9910 }
9911}
9912
9913#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
9914 case OPCODE##_dpp: \
9915 case OPCODE##_e32: \
9916 case OPCODE##_e64: \
9917 case OPCODE##_e64_dpp: \
9918 case OPCODE##_sdwa:
9919
9920static bool isRenamedInGFX9(int Opcode) {
9921 switch (Opcode) {
9922 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
9923 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
9924 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
9925 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
9926 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
9927 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
9928 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
9929 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
9930 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
9931 //
9932 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
9933 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
9934 case AMDGPU::V_FMA_F16_gfx9_e64:
9935 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
9936 case AMDGPU::V_INTERP_P2_F16:
9937 case AMDGPU::V_MAD_F16_e64:
9938 case AMDGPU::V_MAD_U16_e64:
9939 case AMDGPU::V_MAD_I16_e64:
9940 return true;
9941 default:
9942 return false;
9943 }
9944}
9945
9946int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9947 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9948
9949 unsigned Gen = subtargetEncodingFamily(ST);
9950
9951 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
9953
9954 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9955 // subtarget has UnpackedD16VMem feature.
9956 // TODO: remove this when we discard GFX80 encoding.
9957 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
9959
9960 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
9961 switch (ST.getGeneration()) {
9962 default:
9964 break;
9967 break;
9970 break;
9971 }
9972 }
9973
9974 if (isMAI(Opcode)) {
9975 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
9976 if (MFMAOp != -1)
9977 Opcode = MFMAOp;
9978 }
9979
9980 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
9981
9982 if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
9984
9985 // -1 means that Opcode is already a native instruction.
9986 if (MCOp == -1)
9987 return Opcode;
9988
9989 if (ST.hasGFX90AInsts()) {
9990 uint16_t NMCOp = (uint16_t)-1;
9991 if (ST.hasGFX940Insts())
9993 if (NMCOp == (uint16_t)-1)
9995 if (NMCOp == (uint16_t)-1)
9997 if (NMCOp != (uint16_t)-1)
9998 MCOp = NMCOp;
9999 }
10000
10001 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
10002 // no encoding in the given subtarget generation.
10003 if (MCOp == (uint16_t)-1)
10004 return -1;
10005
10006 if (isAsmOnlyOpcode(MCOp))
10007 return -1;
10008
10009 return MCOp;
10010}
10011
10012static
10014 assert(RegOpnd.isReg());
10015 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10016 getRegSubRegPair(RegOpnd);
10017}
10018
10021 assert(MI.isRegSequence());
10022 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10023 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10024 auto &RegOp = MI.getOperand(1 + 2 * I);
10025 return getRegOrUndef(RegOp);
10026 }
10028}
10029
10030// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10031// Following a subreg of reg:subreg isn't supported
10034 if (!RSR.SubReg)
10035 return false;
10036 switch (MI.getOpcode()) {
10037 default: break;
10038 case AMDGPU::REG_SEQUENCE:
10039 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10040 return true;
10041 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
10042 case AMDGPU::INSERT_SUBREG:
10043 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10044 // inserted the subreg we're looking for
10045 RSR = getRegOrUndef(MI.getOperand(2));
10046 else { // the subreg in the rest of the reg
10047 auto R1 = getRegOrUndef(MI.getOperand(1));
10048 if (R1.SubReg) // subreg of subreg isn't supported
10049 return false;
10050 RSR.Reg = R1.Reg;
10051 }
10052 return true;
10053 }
10054 return false;
10055}
10056
10059 assert(MRI.isSSA());
10060 if (!P.Reg.isVirtual())
10061 return nullptr;
10062
10063 auto RSR = P;
10064 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10065 while (auto *MI = DefInst) {
10066 DefInst = nullptr;
10067 switch (MI->getOpcode()) {
10068 case AMDGPU::COPY:
10069 case AMDGPU::V_MOV_B32_e32: {
10070 auto &Op1 = MI->getOperand(1);
10071 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10072 if (Op1.isUndef())
10073 return nullptr;
10074 RSR = getRegSubRegPair(Op1);
10075 DefInst = MRI.getVRegDef(RSR.Reg);
10076 }
10077 break;
10078 }
10079 default:
10080 if (followSubRegDef(*MI, RSR)) {
10081 if (!RSR.Reg)
10082 return nullptr;
10083 DefInst = MRI.getVRegDef(RSR.Reg);
10084 }
10085 }
10086 if (!DefInst)
10087 return MI;
10088 }
10089 return nullptr;
10090}
10091
10093 Register VReg,
10094 const MachineInstr &DefMI,
10095 const MachineInstr &UseMI) {
10096 assert(MRI.isSSA() && "Must be run on SSA");
10097
10098 auto *TRI = MRI.getTargetRegisterInfo();
10099 auto *DefBB = DefMI.getParent();
10100
10101 // Don't bother searching between blocks, although it is possible this block
10102 // doesn't modify exec.
10103 if (UseMI.getParent() != DefBB)
10104 return true;
10105
10106 const int MaxInstScan = 20;
10107 int NumInst = 0;
10108
10109 // Stop scan at the use.
10110 auto E = UseMI.getIterator();
10111 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10112 if (I->isDebugInstr())
10113 continue;
10114
10115 if (++NumInst > MaxInstScan)
10116 return true;
10117
10118 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10119 return true;
10120 }
10121
10122 return false;
10123}
10124
10125 bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
10126 Register VReg,
10127 const MachineInstr &DefMI) {
10128 assert(MRI.isSSA() && "Must be run on SSA");
10129
10130 auto *TRI = MRI.getTargetRegisterInfo();
10131 auto *DefBB = DefMI.getParent();
10132
10133 const int MaxUseScan = 10;
10134 int NumUse = 0;
10135
10136 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10137 auto &UseInst = *Use.getParent();
10138 // Don't bother searching between blocks, although it is possible this block
10139 // doesn't modify exec.
10140 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10141 return true;
10142
10143 if (++NumUse > MaxUseScan)
10144 return true;
10145 }
10146
10147 if (NumUse == 0)
10148 return false;
10149
10150 const int MaxInstScan = 20;
10151 int NumInst = 0;
10152
10153 // Stop scan when we have seen all the uses.
10154 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10155 assert(I != DefBB->end());
10156
10157 if (I->isDebugInstr())
10158 continue;
10159
10160 if (++NumInst > MaxInstScan)
10161 return true;
10162
10163 for (const MachineOperand &Op : I->operands()) {
10164 // We don't check reg masks here as they're used only on calls:
10165 // 1. EXEC is only considered const within one BB
10166 // 2. Call should be a terminator instruction if present in a BB
10167
10168 if (!Op.isReg())
10169 continue;
10170
10171 Register Reg = Op.getReg();
10172 if (Op.isUse()) {
10173 if (Reg == VReg && --NumUse == 0)
10174 return false;
10175 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10176 return true;
10177 }
10178 }
10179}
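// Editorial note: both EXEC-scan helpers above are intentionally conservative.
// They only inspect a small window (at most 20 instructions, 10 uses) inside a
// single basic block; returning true means "a modification of EXEC before a
// use could not be ruled out", not that EXEC is definitely modified.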
10180
10181 MachineInstr *SIInstrInfo::createPHIDestinationCopy(
10182 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
10183 const DebugLoc &DL, Register Src, Register Dst) const {
10184 auto Cur = MBB.begin();
10185 if (Cur != MBB.end())
10186 do {
10187 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10188 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10189 ++Cur;
10190 } while (Cur != MBB.end() && Cur != LastPHIIt);
10191
10192 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10193 Dst);
10194}
10195
10196 MachineInstr *SIInstrInfo::createPHISourceCopy(
10197 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
10198 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10199 if (InsPt != MBB.end() &&
10200 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10201 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10202 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10203 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10204 InsPt++;
10205 return BuildMI(MBB, InsPt, DL,
10206 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
10207 : AMDGPU::S_MOV_B64_term),
10208 Dst)
10209 .addReg(Src, 0, SrcSubReg)
10210 .addReg(AMDGPU::EXEC, RegState::Implicit);
10211 }
10212 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10213 Dst);
10214}
10215
10216bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10217
10218 MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
10219 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
10220 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10221 VirtRegMap *VRM) const {
10222 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10223 //
10224 // %0:sreg_32 = COPY $m0
10225 //
10226 // We explicitly chose SReg_32 for the virtual register so such a copy might
10227 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10228 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10229 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10230 // TargetInstrInfo::foldMemoryOperand() is going to try.
10231 // A similar issue also exists with spilling and reloading $exec registers.
10232 //
10233 // To prevent that, constrain the %0 register class here.
10234 if (isFullCopyInstr(MI)) {
10235 Register DstReg = MI.getOperand(0).getReg();
10236 Register SrcReg = MI.getOperand(1).getReg();
10237 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10238 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10239 MachineRegisterInfo &MRI = MF.getRegInfo();
10240 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10241 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10242 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10243 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10244 return nullptr;
10245 }
10246 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10247 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10248 return nullptr;
10249 }
10250 }
10251 }
10252
10253 return nullptr;
10254}
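// Editorial note: this hook never folds anything itself (it always returns
// nullptr). Its only effect is the register-class constraint applied to full
// copies between a virtual register and a physical SGPR, which, per the
// comment above, is what keeps the generic folder from producing an $m0 or
// $exec spill.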
10255
10256 unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
10257 const MachineInstr &MI,
10258 unsigned *PredCost) const {
10259 if (MI.isBundle()) {
10260 MachineBasicBlock::const_instr_iterator I(MI.getIterator());
10261 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10262 unsigned Lat = 0, Count = 0;
10263 for (++I; I != E && I->isBundledWithPred(); ++I) {
10264 ++Count;
10265 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10266 }
10267 return Lat + Count - 1;
10268 }
10269
10270 return SchedModel.computeInstrLatency(&MI);
10271}
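// Editorial example (illustrative): for a bundle containing three instructions
// with latencies 4, 2 and 4, the code above returns max(4, 2, 4) + 3 - 1 = 6,
// i.e. the longest latency plus one cycle per extra bundled instruction.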
10272
10273 InstructionUniformity
10274 SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
10275 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10276 unsigned Opcode = MI.getOpcode();
10277
10278 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10279 Register Dst = MI.getOperand(0).getReg();
10280 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10281 : MI.getOperand(1).getReg();
10282 LLT DstTy = MRI.getType(Dst);
10283 LLT SrcTy = MRI.getType(Src);
10284 unsigned DstAS = DstTy.getAddressSpace();
10285 unsigned SrcAS = SrcTy.getAddressSpace();
10286 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10287 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10288 ST.hasGloballyAddressableScratch()
10289 ? InstructionUniformity::NeverUniform
10290 : InstructionUniformity::Default;
10291 };
10292
10293 // If the target supports globally addressable scratch, the mapping from
10294 // scratch memory to the flat aperture changes; therefore an address space cast
10295 // is no longer uniform.
10296 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10297 return HandleAddrSpaceCast(MI);
10298
10299 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10300 auto IID = GI->getIntrinsicID();
10301 if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
10302 return InstructionUniformity::NeverUniform;
10303 if (AMDGPU::isIntrinsicAlwaysUniform(IID))
10304 return InstructionUniformity::AlwaysUniform;
10305
10306 switch (IID) {
10307 case Intrinsic::amdgcn_addrspacecast_nonnull:
10308 return HandleAddrSpaceCast(MI);
10309 case Intrinsic::amdgcn_if:
10310 case Intrinsic::amdgcn_else:
10311 // FIXME: Uniform if second result
10312 break;
10313 }
10314
10315 return InstructionUniformity::Default;
10316 }
10317
10318 // Loads from the private and flat address spaces are divergent, because
10319 // threads can execute the load instruction with the same inputs and get
10320 // different results.
10321 //
10322 // All other loads are not divergent, because if threads issue loads with the
10323 // same arguments, they will always get the same result.
10324 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10325 Opcode == AMDGPU::G_SEXTLOAD) {
10326 if (MI.memoperands_empty())
10327 return InstructionUniformity::NeverUniform; // conservative assumption
10328
10329 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10330 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10331 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10332 })) {
10333 // At least one MMO in a non-global address space.
10334 return InstructionUniformity::NeverUniform;
10335 }
10336 return InstructionUniformity::Default;
10337 }
10338
10339 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10340 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10341 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10342 AMDGPU::isGenericAtomic(Opcode)) {
10343 return InstructionUniformity::NeverUniform;
10344 }
10345 return InstructionUniformity::Default;
10346}
10347
10348 InstructionUniformity
10349 SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
10350
10351 if (isNeverUniform(MI))
10352 return InstructionUniformity::NeverUniform;
10353
10354 unsigned opcode = MI.getOpcode();
10355 if (opcode == AMDGPU::V_READLANE_B32 ||
10356 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10357 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10358 return InstructionUniformity::AlwaysUniform;
10359
10360 if (isCopyInstr(MI)) {
10361 const MachineOperand &srcOp = MI.getOperand(1);
10362 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10363 const TargetRegisterClass *regClass =
10364 RI.getPhysRegBaseClass(srcOp.getReg());
10365 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10366 : InstructionUniformity::NeverUniform;
10367 }
10368 return InstructionUniformity::Default;
10369 }
10370
10371 // GMIR handling
10372 if (MI.isPreISelOpcode())
10373 return getGenericInstructionUniformity(MI);
10374
10375 // Atomics are divergent because they are executed sequentially: when an
10376 // atomic operation refers to the same address in each thread, then each
10377 // thread after the first sees the value written by the previous thread as
10378 // its original value.
10379
10380 if (isAtomic(MI))
10381 return InstructionUniformity::NeverUniform;
10382
10383 // Loads from the private and flat address spaces are divergent, because
10384 // threads can execute the load instruction with the same inputs and get
10385 // different results.
10386 if (isFLAT(MI) && MI.mayLoad()) {
10387 if (MI.memoperands_empty())
10388 return InstructionUniformity::NeverUniform; // conservative assumption
10389
10390 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10391 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10392 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10393 })) {
10394 // At least one MMO in a non-global address space.
10395 return InstructionUniformity::NeverUniform;
10396 }
10397
10398 return InstructionUniformity::Default;
10399 }
10400
10401 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
10402 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10403
10404 // FIXME: It's conceptually broken to report this for an instruction, and not
10405 // a specific def operand. For inline asm in particular, there could be mixed
10406 // uniform and divergent results.
10407 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10408 const MachineOperand &SrcOp = MI.getOperand(I);
10409 if (!SrcOp.isReg())
10410 continue;
10411
10412 Register Reg = SrcOp.getReg();
10413 if (!Reg || !SrcOp.readsReg())
10414 continue;
10415
10416 // If RegBank is null, this is unassigned or an unallocatable special
10417 // register, which are all scalars.
10418 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10419 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10420 return InstructionUniformity::NeverUniform;
10421 }
10422
10423 // TODO: Uniformity check conditions above can be rearranged for more
10424 // readability.
10425
10426 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10427 // currently turned into no-op COPYs by SelectionDAG ISel and are
10428 // therefore no longer recognizable.
10429
10431}
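// Editorial summary (derived from the code above, not original commentary):
// V_READLANE_B32, V_READFIRSTLANE_B32 and SI_RESTORE_S32_FROM_VGPR are
// AlwaysUniform; atomics, private/flat loads, and any register operand read
// from a non-SGPR register bank make the instruction NeverUniform; everything
// else falls back to Default, i.e. uniformity is decided by the generic
// divergence analysis.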
10432
10433 unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
10434 switch (MF.getFunction().getCallingConv()) {
10435 case CallingConv::AMDGPU_PS:
10436 return 1;
10437 case CallingConv::AMDGPU_VS:
10438 return 2;
10439 case CallingConv::AMDGPU_GS:
10440 return 3;
10441 case CallingConv::AMDGPU_HS:
10442 case CallingConv::AMDGPU_LS:
10443 case CallingConv::AMDGPU_ES: {
10444 const Function &F = MF.getFunction();
10445 F.getContext().diagnose(DiagnosticInfoUnsupported(
10446 F, "ds_ordered_count unsupported for this calling conv"));
10447 [[fallthrough]];
10448 }
10449 case CallingConv::AMDGPU_CS:
10450 case CallingConv::AMDGPU_KERNEL:
10451 case CallingConv::C:
10452 case CallingConv::Fast:
10453 default:
10454 // Assume other calling conventions are various compute callable functions
10455 return 0;
10456 }
10457}
10458
10459 bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
10460 Register &SrcReg2, int64_t &CmpMask,
10461 int64_t &CmpValue) const {
10462 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10463 return false;
10464
10465 switch (MI.getOpcode()) {
10466 default:
10467 break;
10468 case AMDGPU::S_CMP_EQ_U32:
10469 case AMDGPU::S_CMP_EQ_I32:
10470 case AMDGPU::S_CMP_LG_U32:
10471 case AMDGPU::S_CMP_LG_I32:
10472 case AMDGPU::S_CMP_LT_U32:
10473 case AMDGPU::S_CMP_LT_I32:
10474 case AMDGPU::S_CMP_GT_U32:
10475 case AMDGPU::S_CMP_GT_I32:
10476 case AMDGPU::S_CMP_LE_U32:
10477 case AMDGPU::S_CMP_LE_I32:
10478 case AMDGPU::S_CMP_GE_U32:
10479 case AMDGPU::S_CMP_GE_I32:
10480 case AMDGPU::S_CMP_EQ_U64:
10481 case AMDGPU::S_CMP_LG_U64:
10482 SrcReg = MI.getOperand(0).getReg();
10483 if (MI.getOperand(1).isReg()) {
10484 if (MI.getOperand(1).getSubReg())
10485 return false;
10486 SrcReg2 = MI.getOperand(1).getReg();
10487 CmpValue = 0;
10488 } else if (MI.getOperand(1).isImm()) {
10489 SrcReg2 = Register();
10490 CmpValue = MI.getOperand(1).getImm();
10491 } else {
10492 return false;
10493 }
10494 CmpMask = ~0;
10495 return true;
10496 case AMDGPU::S_CMPK_EQ_U32:
10497 case AMDGPU::S_CMPK_EQ_I32:
10498 case AMDGPU::S_CMPK_LG_U32:
10499 case AMDGPU::S_CMPK_LG_I32:
10500 case AMDGPU::S_CMPK_LT_U32:
10501 case AMDGPU::S_CMPK_LT_I32:
10502 case AMDGPU::S_CMPK_GT_U32:
10503 case AMDGPU::S_CMPK_GT_I32:
10504 case AMDGPU::S_CMPK_LE_U32:
10505 case AMDGPU::S_CMPK_LE_I32:
10506 case AMDGPU::S_CMPK_GE_U32:
10507 case AMDGPU::S_CMPK_GE_I32:
10508 SrcReg = MI.getOperand(0).getReg();
10509 SrcReg2 = Register();
10510 CmpValue = MI.getOperand(1).getImm();
10511 CmpMask = ~0;
10512 return true;
10513 }
10514
10515 return false;
10516}
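// Editorial example (illustrative): for "S_CMP_LG_U32 %5, 12" the analysis
// returns SrcReg = %5, SrcReg2 = Register() (invalid), CmpValue = 12 and
// CmpMask = ~0; for a register-register compare both SrcReg and SrcReg2 are
// set and CmpValue is 0.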
10517
10518 bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
10519 Register SrcReg2, int64_t CmpMask,
10520 int64_t CmpValue,
10521 const MachineRegisterInfo *MRI) const {
10522 if (!SrcReg || SrcReg.isPhysical())
10523 return false;
10524
10525 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
10526 return false;
10527
10528 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
10529 this](int64_t ExpectedValue, unsigned SrcSize,
10530 bool IsReversible, bool IsSigned) -> bool {
10531 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10532 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10533 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10534 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10535 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
10536 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10537 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10538 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10539 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10540 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
10541 //
10542 // Signed ge/gt are not used for the sign bit.
10543 //
10544 // If result of the AND is unused except in the compare:
10545 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
10546 //
10547 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10548 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10549 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
10550 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10551 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10552 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
10553
10554 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
10555 if (!Def || Def->getParent() != CmpInstr.getParent())
10556 return false;
10557
10558 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
10559 Def->getOpcode() != AMDGPU::S_AND_B64)
10560 return false;
10561
10562 int64_t Mask;
10563 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
10564 if (MO->isImm())
10565 Mask = MO->getImm();
10566 else if (!getFoldableImm(MO, Mask))
10567 return false;
10568 Mask &= maxUIntN(SrcSize);
10569 return isPowerOf2_64(Mask);
10570 };
10571
10572 MachineOperand *SrcOp = &Def->getOperand(1);
10573 if (isMask(SrcOp))
10574 SrcOp = &Def->getOperand(2);
10575 else if (isMask(&Def->getOperand(2)))
10576 SrcOp = &Def->getOperand(1);
10577 else
10578 return false;
10579
10580 // A valid Mask is required to have a single bit set, hence a non-zero and
10581 // power-of-two value. This ensures the shift below is by fewer than 64 bits.
10582 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
10583 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
10584 if (IsSigned && BitNo == SrcSize - 1)
10585 return false;
10586
10587 ExpectedValue <<= BitNo;
10588
10589 bool IsReversedCC = false;
10590 if (CmpValue != ExpectedValue) {
10591 if (!IsReversible)
10592 return false;
10593 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
10594 if (!IsReversedCC)
10595 return false;
10596 }
10597
10598 Register DefReg = Def->getOperand(0).getReg();
10599 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
10600 return false;
10601
10602 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
10603 I != E; ++I) {
10604 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
10605 I->killsRegister(AMDGPU::SCC, &RI))
10606 return false;
10607 }
10608
10609 MachineOperand *SccDef =
10610 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
10611 SccDef->setIsDead(false);
10612 CmpInstr.eraseFromParent();
10613
10614 if (!MRI->use_nodbg_empty(DefReg)) {
10615 assert(!IsReversedCC);
10616 return true;
10617 }
10618
10619 // Replace AND with unused result with a S_BITCMP.
10620 MachineBasicBlock *MBB = Def->getParent();
10621
10622 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
10623 : AMDGPU::S_BITCMP1_B32
10624 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
10625 : AMDGPU::S_BITCMP1_B64;
10626
10627 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
10628 .add(*SrcOp)
10629 .addImm(BitNo);
10630 Def->eraseFromParent();
10631
10632 return true;
10633 };
10634
10635 switch (CmpInstr.getOpcode()) {
10636 default:
10637 break;
10638 case AMDGPU::S_CMP_EQ_U32:
10639 case AMDGPU::S_CMP_EQ_I32:
10640 case AMDGPU::S_CMPK_EQ_U32:
10641 case AMDGPU::S_CMPK_EQ_I32:
10642 return optimizeCmpAnd(1, 32, true, false);
10643 case AMDGPU::S_CMP_GE_U32:
10644 case AMDGPU::S_CMPK_GE_U32:
10645 return optimizeCmpAnd(1, 32, false, false);
10646 case AMDGPU::S_CMP_GE_I32:
10647 case AMDGPU::S_CMPK_GE_I32:
10648 return optimizeCmpAnd(1, 32, false, true);
10649 case AMDGPU::S_CMP_EQ_U64:
10650 return optimizeCmpAnd(1, 64, true, false);
10651 case AMDGPU::S_CMP_LG_U32:
10652 case AMDGPU::S_CMP_LG_I32:
10653 case AMDGPU::S_CMPK_LG_U32:
10654 case AMDGPU::S_CMPK_LG_I32:
10655 return optimizeCmpAnd(0, 32, true, false);
10656 case AMDGPU::S_CMP_GT_U32:
10657 case AMDGPU::S_CMPK_GT_U32:
10658 return optimizeCmpAnd(0, 32, false, false);
10659 case AMDGPU::S_CMP_GT_I32:
10660 case AMDGPU::S_CMPK_GT_I32:
10661 return optimizeCmpAnd(0, 32, false, true);
10662 case AMDGPU::S_CMP_LG_U64:
10663 return optimizeCmpAnd(0, 64, true, false);
10664 }
10665
10666 return false;
10667}
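// Editorial example (illustrative MIR, assuming SCC has no intervening defs):
//   %1:sreg_32 = S_AND_B32 %0:sreg_32, 4, implicit-def dead $scc
//   S_CMP_LG_U32 %1, 0, implicit-def $scc
// optimizeCompareInstr() deletes the S_CMP and marks the AND's SCC def live;
// if %1 then has no remaining uses, the AND itself is rewritten to
//   S_BITCMP1_B32 %0, 2, implicit-def $scc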
10668
10669 void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
10670 AMDGPU::OpName OpName) const {
10671 if (!ST.needsAlignedVGPRs())
10672 return;
10673
10674 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
10675 if (OpNo < 0)
10676 return;
10677 MachineOperand &Op = MI.getOperand(OpNo);
10678 if (getOpSize(MI, OpNo) > 4)
10679 return;
10680
10681 // Add implicit aligned super-reg to force alignment on the data operand.
10682 const DebugLoc &DL = MI.getDebugLoc();
10683 MachineBasicBlock *BB = MI.getParent();
10684 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
10685 Register DataReg = Op.getReg();
10686 bool IsAGPR = RI.isAGPR(MRI, DataReg);
10687 Register Undef = MRI.createVirtualRegister(
10688 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
10689 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
10690 Register NewVR =
10691 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
10692 : &AMDGPU::VReg_64_Align2RegClass);
10693 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
10694 .addReg(DataReg, 0, Op.getSubReg())
10695 .addImm(AMDGPU::sub0)
10696 .addReg(Undef)
10697 .addImm(AMDGPU::sub1);
10698 Op.setReg(NewVR);
10699 Op.setSubReg(AMDGPU::sub0);
10700 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
10701}
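// Editorial sketch (illustrative): on subtargets that require even-aligned
// VGPR tuples, a 32-bit data operand %v is rewritten roughly as
//   %u:vgpr_32 = IMPLICIT_DEF
//   %p:vreg_64_align2 = REG_SEQUENCE %v, %subreg.sub0, %u, %subreg.sub1
// after which the instruction reads %p.sub0 and carries %p as an extra
// implicit use, so the register allocator must place the pair at an even
// register boundary.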
10702
10703 bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
10704 if (isIGLP(*MI))
10705 return false;
10706
10707 return TargetInstrInfo::isGlobalMemoryObject(MI);
10708}
10709
10710 bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
10711 if (!isWMMA(MI) && !isSWMMAC(MI))
10712 return false;
10713
10714 if (AMDGPU::isGFX1250(ST))
10715 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
10716
10717 return true;
10718}
10719
10720 bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
10721 unsigned Opcode = MI.getOpcode();
10722
10723 if (AMDGPU::isGFX12Plus(ST))
10724 return isDOT(MI) || isXDLWMMA(MI);
10725
10726 if (!isMAI(MI) || isDGEMM(Opcode) ||
10727 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
10728 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
10729 return false;
10730
10731 if (!ST.hasGFX940Insts())
10732 return true;
10733
10734 return AMDGPU::getMAIIsGFX940XDL(Opcode);
10735}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MCInstrDesc &TID, unsigned RCID)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
#define LLVM_DEBUG(...)
Definition Debug.h:114
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1562
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition ArrayRef.h:150
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasAddNoCarry() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
bool mayStore() const
Return true if this instruction could possibly modify memory.
bool mayLoad() const
Return true if this instruction could possibly read memory.
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:92
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void enterBasicBlockEnd(MachineBasicBlock &MBB)
Start tracking liveness from the end of basic block MBB.
bool isRegUsed(Register Reg, bool includeReserved=true) const
Return if a specific register is currently used.
void setRegUsed(Register Reg, LaneBitmask LaneMask=LaneBitmask::getAll())
Tell the scavenger a register is used.
void backward()
Update internal register state and move MBB iterator backwards.
void enterBasicBlock(MachineBasicBlock &MBB)
Start tracking liveness from the begin of basic block MBB.
Register scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill=true)
Make a register of the specific register class available from the current position backwards to the p...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:102
constexpr bool isValid() const
Definition Register.h:107
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:74
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given.
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instructions opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool mayAccessScratchThroughFlat(const MachineInstr &MI) const
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
bool isSpill(uint16_t Opcode) const
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override
static bool isFLATGlobal(const MachineInstr &MI)
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of a s_trap 2 instructions for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI) const override
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
static bool isVOP3(const MCInstrDesc &Desc)
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is a instruction that moves/copies value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst to fix 16bit SALU to VALU lowering.
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
static bool isWWMRegSpillOpcode(uint16_t Opcode)
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
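Unlike the immediate accessor above, the returned pointer is null when the instruction has no operand with that name, so callers normally guard it; a sketch under the same assumed TII and MI:
  // Hypothetical: inspect src0 only if the opcode actually has one.
  if (const MachineOperand *Src0 =
          TII->getNamedOperand(MI, AMDGPU::OpName::src0))
    dbgs() << "src0: " << *Src0 << '\n';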
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were the OpIdx-th operand of MI.
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
const TargetRegisterClass * getRegClass(unsigned RCID) const
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
const TargetRegisterClass * getProperlyAlignedRC(const TargetRegisterClass *RC) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:281
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make the necessary checks and insert the copy to the PHI destinati...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make the necessary checks and insert the copy to the PHI destinati...
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target-specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:194
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:174
self_iterator getIterator()
Definition ilist_node.h:134
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU-specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST)
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:231
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:202
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:209
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:219
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:224
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:206
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:201
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:218
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:216
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:205
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:236
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:237
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:212
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:248
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:204
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:217
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:223
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:242
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:213
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:238
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:220
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:203
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:228
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:569
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:571
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:568
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:570
@ TI_CONSTDATA_START
Definition AMDGPU.h:567
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool isGFX1250(const MCSubtargetInfo &STI)
int getMCOpcode(uint16_t Opcode, unsigned Gen)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:61
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:72
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:310
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:262
@ Offset
Definition DWP.cpp:477
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1707
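A small sketch of the range form (the data is invented for illustration):
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  llvm::SmallVector<int, 4> Widths = {32, 64, 128};
  // True: every element is a multiple of 32.
  bool AllDWordMultiple =
      llvm::all_of(Widths, [](int W) { return W % 32 == 0; });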
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:307
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition MathExtras.h:216
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
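For example, a 16-bit signed immediate field accepts values in [-32768, 32767]; a minimal compile-time sketch:
  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::isInt<16>(32767), "fits a signed 16-bit field");
  static_assert(!llvm::isInt<16>(32768), "one past the signed 16-bit range");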
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2454
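A sketch of iterating a container together with element indices (the helper and its contents are invented for illustration):
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/Debug.h"
  void dumpNames() {  // hypothetical helper
    llvm::SmallVector<llvm::StringRef, 2> Names = {"vcc", "exec"};
    for (auto [Idx, Name] : llvm::enumerate(Names))
      llvm::dbgs() << Idx << ": " << Name << '\n';  // "0: vcc", "1: exec"
  }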
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:626
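Typical use is deleting instructions while walking a block; a sketch assuming a MachineBasicBlock &MBB in scope:
  // Hypothetical cleanup loop: the iterator is advanced before each body runs,
  // so erasing the current instruction is safe.
  for (MachineInstr &MI : llvm::make_early_inc_range(MBB))
    if (MI.isDebugInstr())
      MI.eraseFromParent();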
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:557
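For example, rounding a byte offset down to a 16-byte boundary:
  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::alignDown(37u, 16u) == 32u, "rounds toward zero");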
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition MathExtras.h:293
Op::Description Desc
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:157
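For example, the count of trailing zero bits gives the log2 of a power-of-two value:
  #include "llvm/ADT/bit.h"
  int AlignLog2 = llvm::countr_zero(16u);  // == 4, since 16 == 1 << 4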
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1714
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, or -1 if the value is zero.
Definition MathExtras.h:342
auto reverse(ContainerTy &&C)
Definition STLExtras.h:400
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64-bit value.
Definition MathExtras.h:164
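Together with Hi_32 above, this splits a 64-bit literal into its two 32-bit halves; a sketch:
  #include "llvm/Support/MathExtras.h"
  constexpr uint64_t Imm = 0x12345678ABCD0000ULL;
  static_assert(llvm::Hi_32(Imm) == 0x12345678u, "upper half");
  static_assert(llvm::Lo_32(Imm) == 0xABCD0000u, "lower half");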
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
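For example, the number of 32-bit registers needed to hold 65 bits:
  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::divideCeil(65u, 32u) == 3u, "ceil(65/32) == 3");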
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:257
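The dynamic-width counterpart of isInt<N> above; for example, a 16-bit branch-offset style range check:
  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::isIntN(16, -32768), "lowest representable 16-bit value");
  static_assert(!llvm::isIntN(16, 32768), "one past the 16-bit range");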
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:127
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1879
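A sketch of the membership test (data invented for illustration):
  #include "llvm/ADT/STLExtras.h"
  constexpr unsigned LegalWidths[] = {32, 64, 96, 128};
  bool Ok = llvm::is_contained(LegalWidths, 96u);  // true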
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:583
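For example, an all-ones 21-bit field decodes to -1 when sign-extended (using the template form SignExtend64<B>):
  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::SignExtend64<21>(0x1FFFFF) == -1, "21 set bits extend to -1");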
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:86
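For example, the mask covering a 12-bit offset field:
  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::maskTrailingOnes<uint32_t>(12) == 0xFFFu, "low 12 bits set");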
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
GenericCycleInfo< MachineSSAContext > MachineCycleInfo
MachineCycleInfo::CycleT MachineCycle
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:219
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility for storing a worklist of machine instructions.
Definition SIInstrInfo.h:56
MachineInstr * top() const
Definition SIInstrInfo.h:61
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:80
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.