1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
15#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
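// The TableGen-generated matcher implementation is pulled in below. The
// AMDGPUSubtarget macro temporarily renames the subtarget type so that the
// generated predicates bind against GCNSubtarget.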
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
 43 AMDGPUInstructionSelector::AMDGPUInstructionSelector(
 44     const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
 45     const AMDGPUTargetMachine &TM)
 46     : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
 47       STI(STI),
 48 #define GET_GLOBALISEL_PREDICATES_INIT
 49 #include "AMDGPUGenGlobalISel.inc"
 50 #undef GET_GLOBALISEL_PREDICATES_INIT
 51 #define GET_GLOBALISEL_TEMPORARIES_INIT
 52 #include "AMDGPUGenGlobalISel.inc"
 53 #undef GET_GLOBALISEL_TEMPORARIES_INIT
54{
55}
56
57const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
 61                                         CodeGenCoverage *CoverageInfo,
 62                                         ProfileSummaryInfo *PSI,
63 BlockFrequencyInfo *BFI) {
64 MRI = &MF.getRegInfo();
65 Subtarget = &MF.getSubtarget<GCNSubtarget>();
68}
69
70// Return the wave level SGPR base address if this is a wave address.
 71 static Register getWaveAddress(const MachineInstr *Def) {
 72   return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
74 : Register();
75}
76
77bool AMDGPUInstructionSelector::isVCC(Register Reg,
78 const MachineRegisterInfo &MRI) const {
79 // The verifier is oblivious to s1 being a valid value for wavesize registers.
80 if (Reg.isPhysical())
81 return false;
82
83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84 const TargetRegisterClass *RC =
85 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
86 if (RC) {
87 const LLT Ty = MRI.getType(Reg);
88 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
89 return false;
90 // G_TRUNC s1 result is never vcc.
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92 RC->hasSuperClassEq(TRI.getBoolRC());
93 }
94
95 const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
96 return RB->getID() == AMDGPU::VCCRegBankID;
97}
98
99bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
100 unsigned NewOpc) const {
101 MI.setDesc(TII.get(NewOpc));
102 MI.removeOperand(1); // Remove intrinsic ID.
103 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
104
105 MachineOperand &Dst = MI.getOperand(0);
106 MachineOperand &Src = MI.getOperand(1);
107
108 // TODO: This should be legalized to s32 if needed
109 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
110 return false;
111
112 const TargetRegisterClass *DstRC
113 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
114 const TargetRegisterClass *SrcRC
115 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
116 if (!DstRC || DstRC != SrcRC)
117 return false;
118
119 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
120 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
121}
122
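// Select a generic COPY. The interesting case is a copy into a VCC-bank (lane
// mask) destination: constant sources become an S_MOV of 0 or -1, and other
// scalar sources are masked to bit 0 and compared against zero so that only
// the low bit defines the lane mask. All other copies just have their operands
// constrained to register classes.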
123bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
124 const DebugLoc &DL = I.getDebugLoc();
125 MachineBasicBlock *BB = I.getParent();
126 I.setDesc(TII.get(TargetOpcode::COPY));
127
128 const MachineOperand &Src = I.getOperand(1);
129 MachineOperand &Dst = I.getOperand(0);
130 Register DstReg = Dst.getReg();
131 Register SrcReg = Src.getReg();
132
133 if (isVCC(DstReg, *MRI)) {
134 if (SrcReg == AMDGPU::SCC) {
135 const TargetRegisterClass *RC
136 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
137 if (!RC)
138 return true;
139 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
140 }
141
142 if (!isVCC(SrcReg, *MRI)) {
143 // TODO: Should probably leave the copy and let copyPhysReg expand it.
144 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
145 return false;
146
147 const TargetRegisterClass *SrcRC
148 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
149
150 std::optional<ValueAndVReg> ConstVal =
151 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
152 if (ConstVal) {
153 unsigned MovOpc =
154 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
155 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
156 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
157 } else {
158 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
159
160 // We can't trust the high bits at this point, so clear them.
161
162 // TODO: Skip masking high bits if def is known boolean.
163
164 if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
165 assert(Subtarget->useRealTrue16Insts());
166 const int64_t NoMods = 0;
167 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
168 .addImm(NoMods)
169 .addImm(1)
170 .addImm(NoMods)
171 .addReg(SrcReg)
172 .addImm(NoMods);
173 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
174 .addImm(NoMods)
175 .addImm(0)
176 .addImm(NoMods)
177 .addReg(MaskedReg)
178 .addImm(NoMods);
179 } else {
180 bool IsSGPR = TRI.isSGPRClass(SrcRC);
181 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
182 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
183 .addImm(1)
184 .addReg(SrcReg);
185 if (IsSGPR)
186 And.setOperandDead(3); // Dead scc
187
188 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
189 .addImm(0)
190 .addReg(MaskedReg);
191 }
192 }
193
194 if (!MRI->getRegClassOrNull(SrcReg))
195 MRI->setRegClass(SrcReg, SrcRC);
196 I.eraseFromParent();
197 return true;
198 }
199
200 const TargetRegisterClass *RC =
 201       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
 202   if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
203 return false;
204
205 return true;
206 }
207
208 for (const MachineOperand &MO : I.operands()) {
209 if (MO.getReg().isPhysical())
210 continue;
211
212 const TargetRegisterClass *RC =
 213         TRI.getConstrainedRegClassForOperand(MO, *MRI);
 214     if (!RC)
215 continue;
216 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
217 }
218 return true;
219}
220
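// Copy a VCC-bank lane mask into an SCC-style 32-bit scalar condition: compare
// the wave mask against zero and read SCC back out. Roughly, for wave64:
//   S_CMP_LG_U64 %mask, 0        ; SCC = (%mask != 0)
//   %dst:sreg_32 = COPY $scc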
221bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
222 const DebugLoc &DL = I.getDebugLoc();
223 MachineBasicBlock *BB = I.getParent();
224
225 unsigned CmpOpc =
226 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
227 MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc))
228 .addReg(I.getOperand(1).getReg())
229 .addImm(0);
230 if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
231 return false;
232
233 Register DstReg = I.getOperand(0).getReg();
234 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
235
236 I.eraseFromParent();
237 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
238}
239
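// Copy an SCC-style scalar bool into a VCC-bank lane mask. Constant inputs
// fold to 0 or EXEC; otherwise the bool (already zero-extended in an SGPR) is
// copied into SCC and a wave-wide mask is produced with an S_CSELECT between
// EXEC and 0.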
240bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
241 const DebugLoc &DL = I.getDebugLoc();
242 MachineBasicBlock *BB = I.getParent();
243
244 Register DstReg = I.getOperand(0).getReg();
245 Register SrcReg = I.getOperand(1).getReg();
246 std::optional<ValueAndVReg> Arg =
247 getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
248
249 if (Arg) {
250 const int64_t Value = Arg->Value.getZExtValue();
251 if (Value == 0) {
252 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
253 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
254 } else {
255 assert(Value == 1);
256 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
257 }
258 I.eraseFromParent();
259 return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
260 }
261
262 // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
263 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
264
265 unsigned SelectOpcode =
266 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
267 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
268 .addReg(TRI.getExec())
269 .addImm(0);
270
271 I.eraseFromParent();
272 return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
273}
274
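// A read of an arbitrary lane is satisfied by reading the first active lane,
// so this lowers directly to V_READFIRSTLANE_B32.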
275bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
276 Register DstReg = I.getOperand(0).getReg();
277 Register SrcReg = I.getOperand(1).getReg();
278
279 const DebugLoc &DL = I.getDebugLoc();
280 MachineBasicBlock *BB = I.getParent();
281
282 auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
283 .addReg(SrcReg);
284
285 I.eraseFromParent();
286 return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
287}
288
289bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
290 const Register DefReg = I.getOperand(0).getReg();
291 const LLT DefTy = MRI->getType(DefReg);
292
293 // S1 G_PHIs should not be selected in instruction-select, instead:
294 // - divergent S1 G_PHI should go through lane mask merging algorithm
295 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
296 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
297 if (DefTy == LLT::scalar(1))
298 return false;
299
300 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
301
302 const RegClassOrRegBank &RegClassOrBank =
303 MRI->getRegClassOrRegBank(DefReg);
304
305 const TargetRegisterClass *DefRC =
306 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
307 if (!DefRC) {
308 if (!DefTy.isValid()) {
309 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
310 return false;
311 }
312
313 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
314 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
315 if (!DefRC) {
316 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
317 return false;
318 }
319 }
320
321 // If inputs have register bank, assign corresponding reg class.
322 // Note: registers don't need to have the same reg bank.
323 for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
324 const Register SrcReg = I.getOperand(i).getReg();
325
326 const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
327 if (RB) {
328 const LLT SrcTy = MRI->getType(SrcReg);
329 const TargetRegisterClass *SrcRC =
330 TRI.getRegClassForTypeOnBank(SrcTy, *RB);
331 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
332 return false;
333 }
334 }
335
336 I.setDesc(TII.get(TargetOpcode::PHI));
337 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
338}
339
340 MachineOperand
341 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
342 const TargetRegisterClass &SubRC,
343 unsigned SubIdx) const {
344
345 MachineInstr *MI = MO.getParent();
 346   MachineBasicBlock *BB = MO.getParent()->getParent();
 347   Register DstReg = MRI->createVirtualRegister(&SubRC);
348
349 if (MO.isReg()) {
350 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
351 Register Reg = MO.getReg();
352 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
353 .addReg(Reg, 0, ComposedSubIdx);
354
355 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
356 MO.isKill(), MO.isDead(), MO.isUndef(),
357 MO.isEarlyClobber(), 0, MO.isDebug(),
358 MO.isInternalRead());
359 }
360
361 assert(MO.isImm());
362
363 APInt Imm(64, MO.getImm());
364
365 switch (SubIdx) {
366 default:
367 llvm_unreachable("do not know to split immediate with this sub index.");
368 case AMDGPU::sub0:
369 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
370 case AMDGPU::sub1:
371 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
372 }
373}
374
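// Map a generic bitwise opcode (G_AND/G_OR/G_XOR) to the corresponding 32- or
// 64-bit SALU instruction.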
375static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
376 switch (Opc) {
377 case AMDGPU::G_AND:
378 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
379 case AMDGPU::G_OR:
380 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
381 case AMDGPU::G_XOR:
382 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
383 default:
384 llvm_unreachable("not a bit op");
385 }
386}
387
388bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
389 Register DstReg = I.getOperand(0).getReg();
390 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
391
392 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
393 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
394 DstRB->getID() != AMDGPU::VCCRegBankID)
395 return false;
396
397 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
398 STI.isWave64());
399 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
400
401 // Dead implicit-def of scc
402 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
403 true, // isImp
404 false, // isKill
405 true)); // isDead
406 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
407}
408
409bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
410 MachineBasicBlock *BB = I.getParent();
412 Register DstReg = I.getOperand(0).getReg();
413 const DebugLoc &DL = I.getDebugLoc();
414 LLT Ty = MRI->getType(DstReg);
415 if (Ty.isVector())
416 return false;
417
418 unsigned Size = Ty.getSizeInBits();
419 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
420 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
421 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
422
423 if (Size == 32) {
424 if (IsSALU) {
425 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
 426       MachineInstr *Add =
 427           BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
428 .add(I.getOperand(1))
429 .add(I.getOperand(2))
430 .setOperandDead(3); // Dead scc
431 I.eraseFromParent();
432 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
433 }
434
435 if (STI.hasAddNoCarry()) {
436 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
437 I.setDesc(TII.get(Opc));
438 I.addOperand(*MF, MachineOperand::CreateImm(0));
439 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
440 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
441 }
442
443 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
444
445 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
 446     MachineInstr *Add
 447       = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
448 .addDef(UnusedCarry, RegState::Dead)
449 .add(I.getOperand(1))
450 .add(I.getOperand(2))
451 .addImm(0);
452 I.eraseFromParent();
453 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
454 }
455
456 assert(!Sub && "illegal sub should not reach here");
457
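  // 64-bit case: split into lo/hi halves and chain the carry. Sketch of the
  // SALU form emitted below:
  //   %lo  = S_ADD_U32  %a.sub0, %b.sub0      ; sets SCC
  //   %hi  = S_ADDC_U32 %a.sub1, %b.sub1      ; consumes SCC
  //   %dst = REG_SEQUENCE %lo, sub0, %hi, sub1
  // The VALU form uses V_ADD_CO_U32 / V_ADDC_U32 with an explicit carry
  // register instead of SCC.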
458 const TargetRegisterClass &RC
459 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
460 const TargetRegisterClass &HalfRC
461 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
462
463 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
464 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
465 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
466 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
467
468 Register DstLo = MRI->createVirtualRegister(&HalfRC);
469 Register DstHi = MRI->createVirtualRegister(&HalfRC);
470
471 if (IsSALU) {
472 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
473 .add(Lo1)
474 .add(Lo2);
475 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
476 .add(Hi1)
477 .add(Hi2)
478 .setOperandDead(3); // Dead scc
479 } else {
480 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
481 Register CarryReg = MRI->createVirtualRegister(CarryRC);
482 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
483 .addDef(CarryReg)
484 .add(Lo1)
485 .add(Lo2)
486 .addImm(0);
487 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
488 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
489 .add(Hi1)
490 .add(Hi2)
491 .addReg(CarryReg, RegState::Kill)
492 .addImm(0);
493
494 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
495 return false;
496 }
497
498 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
499 .addReg(DstLo)
500 .addImm(AMDGPU::sub0)
501 .addReg(DstHi)
502 .addImm(AMDGPU::sub1);
503
504
505 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
506 return false;
507
508 I.eraseFromParent();
509 return true;
510}
511
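// Select the carry/overflow add and sub operations. When the carry-out lives
// in the VCC bank this maps directly onto V_ADD_CO_U32 / V_ADDC_U32 (or the
// SUB forms); on the scalar path the carry is routed through SCC with
// S_ADD_U32 / S_ADDC_U32 and explicit copies to and from SCC.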
512bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
513 MachineInstr &I) const {
514 MachineBasicBlock *BB = I.getParent();
516 const DebugLoc &DL = I.getDebugLoc();
517 Register Dst0Reg = I.getOperand(0).getReg();
518 Register Dst1Reg = I.getOperand(1).getReg();
519 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
520 I.getOpcode() == AMDGPU::G_UADDE;
521 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
522 I.getOpcode() == AMDGPU::G_USUBE;
523
524 if (isVCC(Dst1Reg, *MRI)) {
525 unsigned NoCarryOpc =
526 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
527 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
528 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
529 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
530 I.addOperand(*MF, MachineOperand::CreateImm(0));
531 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
532 }
533
534 Register Src0Reg = I.getOperand(2).getReg();
535 Register Src1Reg = I.getOperand(3).getReg();
536
537 if (HasCarryIn) {
538 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
539 .addReg(I.getOperand(4).getReg());
540 }
541
542 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
543 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
544
545 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
546 .add(I.getOperand(2))
547 .add(I.getOperand(3));
548
549 if (MRI->use_nodbg_empty(Dst1Reg)) {
550 CarryInst.setOperandDead(3); // Dead scc
551 } else {
552 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
553 .addReg(AMDGPU::SCC);
554 if (!MRI->getRegClassOrNull(Dst1Reg))
555 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
556 }
557
558 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
559 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
560 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
561 return false;
562
563 if (HasCarryIn &&
564 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
565 AMDGPU::SReg_32RegClass, *MRI))
566 return false;
567
568 I.eraseFromParent();
569 return true;
570}
571
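// Select the 64x32 multiply-add pseudos. The opcode choice depends on the
// subtarget: the gfx11 variants are used on subtargets with the MAD
// intra-forwarding bug, and the no-carry (NC) variants are used when the
// subtarget has them and the carry-out has no uses (its operand is dropped).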
572bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
573 MachineInstr &I) const {
574 MachineBasicBlock *BB = I.getParent();
576 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
577 bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() &&
578 MRI->use_nodbg_empty(I.getOperand(1).getReg());
579
580 unsigned Opc;
581 if (Subtarget->hasMADIntraFwdBug())
582 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
583 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
584 else if (UseNoCarry)
585 Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
586 : AMDGPU::V_MAD_NC_I64_I32_e64;
587 else
588 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
589
590 if (UseNoCarry)
591 I.removeOperand(1);
592
593 I.setDesc(TII.get(Opc));
594 I.addOperand(*MF, MachineOperand::CreateImm(0));
595 I.addImplicitDefUseOperands(*MF);
596 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
597}
598
599// TODO: We should probably legalize these to only using 32-bit results.
600bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
601 MachineBasicBlock *BB = I.getParent();
602 Register DstReg = I.getOperand(0).getReg();
603 Register SrcReg = I.getOperand(1).getReg();
604 LLT DstTy = MRI->getType(DstReg);
605 LLT SrcTy = MRI->getType(SrcReg);
606 const unsigned SrcSize = SrcTy.getSizeInBits();
607 unsigned DstSize = DstTy.getSizeInBits();
608
609 // TODO: Should handle any multiple of 32 offset.
610 unsigned Offset = I.getOperand(2).getImm();
611 if (Offset % 32 != 0 || DstSize > 128)
612 return false;
613
614 // 16-bit operations really use 32-bit registers.
615 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
616 if (DstSize == 16)
617 DstSize = 32;
618
619 const TargetRegisterClass *DstRC =
620 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
621 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
622 return false;
623
624 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
625 const TargetRegisterClass *SrcRC =
626 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
627 if (!SrcRC)
628 return false;
 629   unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32,
 630                                              DstSize / 32);
631 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
632 if (!SrcRC)
633 return false;
634
635 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
636 *SrcRC, I.getOperand(1));
637 const DebugLoc &DL = I.getDebugLoc();
638 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
639 .addReg(SrcReg, 0, SubReg);
640
641 I.eraseFromParent();
642 return true;
643}
644
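// A merge of 32-bit or wider pieces is selected to a REG_SEQUENCE, using the
// subregister indices returned by getRegSplitParts for the destination class.
// Narrower sources fall back to the imported TableGen patterns.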
645bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
646 MachineBasicBlock *BB = MI.getParent();
647 Register DstReg = MI.getOperand(0).getReg();
648 LLT DstTy = MRI->getType(DstReg);
649 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
650
651 const unsigned SrcSize = SrcTy.getSizeInBits();
652 if (SrcSize < 32)
653 return selectImpl(MI, *CoverageInfo);
654
655 const DebugLoc &DL = MI.getDebugLoc();
656 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
657 const unsigned DstSize = DstTy.getSizeInBits();
658 const TargetRegisterClass *DstRC =
659 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
660 if (!DstRC)
661 return false;
662
663 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
 664   MachineInstrBuilder MIB =
 665       BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
666 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
667 MachineOperand &Src = MI.getOperand(I + 1);
668 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
669 MIB.addImm(SubRegs[I]);
670
671 const TargetRegisterClass *SrcRC
672 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
673 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
674 return false;
675 }
676
677 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
678 return false;
679
680 MI.eraseFromParent();
681 return true;
682}
683
684bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
685 MachineBasicBlock *BB = MI.getParent();
686 const int NumDst = MI.getNumOperands() - 1;
687
688 MachineOperand &Src = MI.getOperand(NumDst);
689
690 Register SrcReg = Src.getReg();
691 Register DstReg0 = MI.getOperand(0).getReg();
692 LLT DstTy = MRI->getType(DstReg0);
693 LLT SrcTy = MRI->getType(SrcReg);
694
695 const unsigned DstSize = DstTy.getSizeInBits();
696 const unsigned SrcSize = SrcTy.getSizeInBits();
697 const DebugLoc &DL = MI.getDebugLoc();
698 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
699
700 const TargetRegisterClass *SrcRC =
701 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
702 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
703 return false;
704
705 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
706 // source, and this relies on the fact that the same subregister indices are
707 // used for both.
708 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
709 for (int I = 0, E = NumDst; I != E; ++I) {
710 MachineOperand &Dst = MI.getOperand(I);
711 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
712 .addReg(SrcReg, 0, SubRegs[I]);
713
714 // Make sure the subregister index is valid for the source register.
715 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
716 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
717 return false;
718
719 const TargetRegisterClass *DstRC =
 720         TRI.getConstrainedRegClassForOperand(Dst, *MRI);
 721     if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
722 return false;
723 }
724
725 MI.eraseFromParent();
726 return true;
727}
728
729bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
730 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
731 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
732
733 Register Src0 = MI.getOperand(1).getReg();
734 Register Src1 = MI.getOperand(2).getReg();
735 LLT SrcTy = MRI->getType(Src0);
736 const unsigned SrcSize = SrcTy.getSizeInBits();
737
738 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
739 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
740 return selectG_MERGE_VALUES(MI);
741 }
742
743 // Selection logic below is for V2S16 only.
744 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
745 Register Dst = MI.getOperand(0).getReg();
746 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
747 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
748 SrcTy != LLT::scalar(32)))
749 return selectImpl(MI, *CoverageInfo);
750
751 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
752 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
753 return false;
754
755 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
756 DstBank->getID() == AMDGPU::VGPRRegBankID);
757 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
758
759 const DebugLoc &DL = MI.getDebugLoc();
760 MachineBasicBlock *BB = MI.getParent();
761
762 // First, before trying TableGen patterns, check if both sources are
763 // constants. In those cases, we can trivially compute the final constant
764 // and emit a simple move.
765 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
766 if (ConstSrc1) {
767 auto ConstSrc0 =
768 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
769 if (ConstSrc0) {
770 const int64_t K0 = ConstSrc0->Value.getSExtValue();
771 const int64_t K1 = ConstSrc1->Value.getSExtValue();
772 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
773 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
774 uint32_t Imm = Lo16 | (Hi16 << 16);
775
776 // VALU
777 if (IsVector) {
778 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
779 MI.eraseFromParent();
780 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
781 }
782
783 // SALU
784 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
785 MI.eraseFromParent();
786 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
787 }
788 }
789
790 // Now try TableGen patterns.
791 if (selectImpl(MI, *CoverageInfo))
792 return true;
793
794 // TODO: This should probably be a combine somewhere
795 // (build_vector $src0, undef) -> copy $src0
796 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
797 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
798 MI.setDesc(TII.get(AMDGPU::COPY));
799 MI.removeOperand(2);
800 const auto &RC =
801 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
802 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
803 RBI.constrainGenericRegister(Src0, RC, *MRI);
804 }
805
806 // TODO: Can be improved?
807 if (IsVector) {
808 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
809 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
810 .addImm(0xFFFF)
811 .addReg(Src0);
812 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
813 return false;
814
815 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
816 .addReg(Src1)
817 .addImm(16)
818 .addReg(TmpReg);
819 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
820 return false;
821
822 MI.eraseFromParent();
823 return true;
824 }
825
826 Register ShiftSrc0;
827 Register ShiftSrc1;
828
829 // With multiple uses of the shift, this will duplicate the shift and
830 // increase register pressure.
831 //
832 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
833 // => (S_PACK_HH_B32_B16 $src0, $src1)
834 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
835 // => (S_PACK_HL_B32_B16 $src0, $src1)
836 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
837 // => (S_PACK_LH_B32_B16 $src0, $src1)
838 // (build_vector $src0, $src1)
839 // => (S_PACK_LL_B32_B16 $src0, $src1)
840
841 bool Shift0 = mi_match(
842 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
843
844 bool Shift1 = mi_match(
845 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
846
847 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
848 if (Shift0 && Shift1) {
849 Opc = AMDGPU::S_PACK_HH_B32_B16;
850 MI.getOperand(1).setReg(ShiftSrc0);
851 MI.getOperand(2).setReg(ShiftSrc1);
852 } else if (Shift1) {
853 Opc = AMDGPU::S_PACK_LH_B32_B16;
854 MI.getOperand(2).setReg(ShiftSrc1);
855 } else if (Shift0) {
856 auto ConstSrc1 =
857 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
858 if (ConstSrc1 && ConstSrc1->Value == 0) {
859 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
860 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
861 .addReg(ShiftSrc0)
862 .addImm(16)
863 .setOperandDead(3); // Dead scc
864
865 MI.eraseFromParent();
866 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
867 }
868 if (STI.hasSPackHL()) {
869 Opc = AMDGPU::S_PACK_HL_B32_B16;
870 MI.getOperand(1).setReg(ShiftSrc0);
871 }
872 }
873
874 MI.setDesc(TII.get(Opc));
875 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
876}
877
878bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
879 const MachineOperand &MO = I.getOperand(0);
880
881 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
882 // regbank check here is to know why getConstrainedRegClassForOperand failed.
 883   const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
 884   if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
885 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
886 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
887 return true;
888 }
889
890 return false;
891}
892
893bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
894 MachineBasicBlock *BB = I.getParent();
895
896 Register DstReg = I.getOperand(0).getReg();
897 Register Src0Reg = I.getOperand(1).getReg();
898 Register Src1Reg = I.getOperand(2).getReg();
899 LLT Src1Ty = MRI->getType(Src1Reg);
900
901 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
902 unsigned InsSize = Src1Ty.getSizeInBits();
903
904 int64_t Offset = I.getOperand(3).getImm();
905
906 // FIXME: These cases should have been illegal and unnecessary to check here.
907 if (Offset % 32 != 0 || InsSize % 32 != 0)
908 return false;
909
910 // Currently not handled by getSubRegFromChannel.
911 if (InsSize > 128)
912 return false;
913
914 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
915 if (SubReg == AMDGPU::NoSubRegister)
916 return false;
917
918 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
919 const TargetRegisterClass *DstRC =
920 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
921 if (!DstRC)
922 return false;
923
924 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
925 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
926 const TargetRegisterClass *Src0RC =
927 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
928 const TargetRegisterClass *Src1RC =
929 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
930
931 // Deal with weird cases where the class only partially supports the subreg
932 // index.
933 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
934 if (!Src0RC || !Src1RC)
935 return false;
936
937 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
938 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
939 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
940 return false;
941
942 const DebugLoc &DL = I.getDebugLoc();
943 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
944 .addReg(Src0Reg)
945 .addReg(Src1Reg)
946 .addImm(SubReg);
947
948 I.eraseFromParent();
949 return true;
950}
951
952bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
953 Register DstReg = MI.getOperand(0).getReg();
954 Register SrcReg = MI.getOperand(1).getReg();
955 Register OffsetReg = MI.getOperand(2).getReg();
956 Register WidthReg = MI.getOperand(3).getReg();
957
958 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
959 "scalar BFX instructions are expanded in regbankselect");
960 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
961 "64-bit vector BFX instructions are expanded in regbankselect");
962
963 const DebugLoc &DL = MI.getDebugLoc();
964 MachineBasicBlock *MBB = MI.getParent();
965
966 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
967 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
968 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
969 .addReg(SrcReg)
970 .addReg(OffsetReg)
971 .addReg(WidthReg);
972 MI.eraseFromParent();
973 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
974}
975
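// Manual selection of llvm.amdgcn.interp.p1.f16 for subtargets with 16 LDS
// banks; other configurations go through the imported patterns. The f16
// interpolation is expanded into a V_INTERP_MOV_F32 plus V_INTERP_P1LV_F16
// pair with an explicit copy of the index value into M0.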
976bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
977 if (STI.getLDSBankCount() != 16)
978 return selectImpl(MI, *CoverageInfo);
979
980 Register Dst = MI.getOperand(0).getReg();
981 Register Src0 = MI.getOperand(2).getReg();
982 Register M0Val = MI.getOperand(6).getReg();
983 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
984 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
985 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
986 return false;
987
988 // This requires 2 instructions. It is possible to write a pattern to support
989 // this, but the generated isel emitter doesn't correctly deal with multiple
990 // output instructions using the same physical register input. The copy to m0
991 // is incorrectly placed before the second instruction.
992 //
993 // TODO: Match source modifiers.
994
995 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
996 const DebugLoc &DL = MI.getDebugLoc();
997 MachineBasicBlock *MBB = MI.getParent();
998
999 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1000 .addReg(M0Val);
1001 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
1002 .addImm(2)
1003 .addImm(MI.getOperand(4).getImm()) // $attr
1004 .addImm(MI.getOperand(3).getImm()); // $attrchan
1005
1006 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
1007 .addImm(0) // $src0_modifiers
1008 .addReg(Src0) // $src0
1009 .addImm(MI.getOperand(4).getImm()) // $attr
1010 .addImm(MI.getOperand(3).getImm()) // $attrchan
1011 .addImm(0) // $src2_modifiers
1012 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
1013 .addImm(MI.getOperand(5).getImm()) // $high
1014 .addImm(0) // $clamp
1015 .addImm(0); // $omod
1016
1017 MI.eraseFromParent();
1018 return true;
1019}
1020
1021// Writelane is special in that it can use SGPR and M0 (which would normally
1022// count as using the constant bus twice - but in this case it is allowed since
1023// the lane selector doesn't count as a use of the constant bus). However, it is
1024// still required to abide by the 1 SGPR rule. Fix this up if we might have
1025// multiple SGPRs.
1026bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1027 // With a constant bus limit of at least 2, there's no issue.
1028 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
1029 return selectImpl(MI, *CoverageInfo);
1030
1031 MachineBasicBlock *MBB = MI.getParent();
1032 const DebugLoc &DL = MI.getDebugLoc();
1033 Register VDst = MI.getOperand(0).getReg();
1034 Register Val = MI.getOperand(2).getReg();
1035 Register LaneSelect = MI.getOperand(3).getReg();
1036 Register VDstIn = MI.getOperand(4).getReg();
1037
1038 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1039
1040 std::optional<ValueAndVReg> ConstSelect =
1041 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
1042 if (ConstSelect) {
1043 // The selector has to be an inline immediate, so we can use whatever for
1044 // the other operands.
1045 MIB.addReg(Val);
1046 MIB.addImm(ConstSelect->Value.getSExtValue() &
1047 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
1048 } else {
1049 std::optional<ValueAndVReg> ConstVal =
1050         getIConstantVRegValWithLookThrough(Val, *MRI);
1051
1052 // If the value written is an inline immediate, we can get away without a
1053 // copy to m0.
1054 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
1055 STI.hasInv2PiInlineImm())) {
1056 MIB.addImm(ConstVal->Value.getSExtValue());
1057 MIB.addReg(LaneSelect);
1058 } else {
1059 MIB.addReg(Val);
1060
1061 // If the lane selector was originally in a VGPR and copied with
1062 // readfirstlane, there's a hazard to read the same SGPR from the
1063 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1064 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1065
1066 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1067 .addReg(LaneSelect);
1068 MIB.addReg(AMDGPU::M0);
1069 }
1070 }
1071
1072 MIB.addReg(VDstIn);
1073
1074 MI.eraseFromParent();
1075 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1076}
1077
1078// We need to handle this here because tablegen doesn't support matching
1079// instructions with multiple outputs.
1080bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1081 Register Dst0 = MI.getOperand(0).getReg();
1082 Register Dst1 = MI.getOperand(1).getReg();
1083
1084 LLT Ty = MRI->getType(Dst0);
1085 unsigned Opc;
1086 if (Ty == LLT::scalar(32))
1087 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1088 else if (Ty == LLT::scalar(64))
1089 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1090 else
1091 return false;
1092
1093 // TODO: Match source modifiers.
1094
1095 const DebugLoc &DL = MI.getDebugLoc();
1096 MachineBasicBlock *MBB = MI.getParent();
1097
1098 Register Numer = MI.getOperand(3).getReg();
1099 Register Denom = MI.getOperand(4).getReg();
1100 unsigned ChooseDenom = MI.getOperand(5).getImm();
1101
1102 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1103
1104 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1105 .addDef(Dst1)
1106 .addImm(0) // $src0_modifiers
1107 .addUse(Src0) // $src0
1108 .addImm(0) // $src1_modifiers
1109 .addUse(Denom) // $src1
1110 .addImm(0) // $src2_modifiers
1111 .addUse(Numer) // $src2
1112 .addImm(0) // $clamp
1113 .addImm(0); // $omod
1114
1115 MI.eraseFromParent();
1116 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1117}
1118
1119bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1120 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1121 switch (IntrinsicID) {
1122 case Intrinsic::amdgcn_if_break: {
1123 MachineBasicBlock *BB = I.getParent();
1124
1125 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1126 // SelectionDAG uses for wave32 vs wave64.
1127 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1128 .add(I.getOperand(0))
1129 .add(I.getOperand(2))
1130 .add(I.getOperand(3));
1131
1132 Register DstReg = I.getOperand(0).getReg();
1133 Register Src0Reg = I.getOperand(2).getReg();
1134 Register Src1Reg = I.getOperand(3).getReg();
1135
1136 I.eraseFromParent();
1137
1138 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1139 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1140
1141 return true;
1142 }
1143 case Intrinsic::amdgcn_interp_p1_f16:
1144 return selectInterpP1F16(I);
1145 case Intrinsic::amdgcn_wqm:
1146 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1147 case Intrinsic::amdgcn_softwqm:
1148 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1149 case Intrinsic::amdgcn_strict_wwm:
1150 case Intrinsic::amdgcn_wwm:
1151 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1152 case Intrinsic::amdgcn_strict_wqm:
1153 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1154 case Intrinsic::amdgcn_writelane:
1155 return selectWritelane(I);
1156 case Intrinsic::amdgcn_div_scale:
1157 return selectDivScale(I);
1158 case Intrinsic::amdgcn_icmp:
1159 case Intrinsic::amdgcn_fcmp:
1160 if (selectImpl(I, *CoverageInfo))
1161 return true;
1162 return selectIntrinsicCmp(I);
1163 case Intrinsic::amdgcn_ballot:
1164 return selectBallot(I);
1165 case Intrinsic::amdgcn_reloc_constant:
1166 return selectRelocConstant(I);
1167 case Intrinsic::amdgcn_groupstaticsize:
1168 return selectGroupStaticSize(I);
1169 case Intrinsic::returnaddress:
1170 return selectReturnAddress(I);
1171 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1172 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1173 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1174 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1175 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1176 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1177 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1178 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1179 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1180 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1181 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1182 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1183 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1184 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1185 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1186 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1187 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1188 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1189 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1190 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1191 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1192 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1193 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1194 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1195 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1196 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1197 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1198 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1199 return selectSMFMACIntrin(I);
1200 case Intrinsic::amdgcn_permlane16_swap:
1201 case Intrinsic::amdgcn_permlane32_swap:
1202 return selectPermlaneSwapIntrin(I, IntrinsicID);
1203 default:
1204 return selectImpl(I, *CoverageInfo);
1205 }
1206}
1207
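// Return the VALU (V_CMP) opcode for the given predicate and operand size, or
// -1 if no suitable compare exists (for example, a 16-bit compare on a
// subtarget without 16-bit instructions). For 16-bit compares this also picks
// the true16 or fake16 encoding based on the subtarget.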
1208 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1209                           const GCNSubtarget &ST) {
1210 if (Size != 16 && Size != 32 && Size != 64)
1211 return -1;
1212
1213 if (Size == 16 && !ST.has16BitInsts())
1214 return -1;
1215
1216 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1217 unsigned FakeS16Opc, unsigned S32Opc,
1218 unsigned S64Opc) {
1219 if (Size == 16)
1220 return ST.hasTrue16BitInsts()
1221 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1222 : S16Opc;
1223 if (Size == 32)
1224 return S32Opc;
1225 return S64Opc;
1226 };
1227
1228 switch (P) {
1229 default:
1230 llvm_unreachable("Unknown condition code!");
1231 case CmpInst::ICMP_NE:
1232 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1233 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1234 AMDGPU::V_CMP_NE_U64_e64);
1235 case CmpInst::ICMP_EQ:
1236 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1237 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1238 AMDGPU::V_CMP_EQ_U64_e64);
1239 case CmpInst::ICMP_SGT:
1240 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1241 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1242 AMDGPU::V_CMP_GT_I64_e64);
1243 case CmpInst::ICMP_SGE:
1244 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1245 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1246 AMDGPU::V_CMP_GE_I64_e64);
1247 case CmpInst::ICMP_SLT:
1248 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1249 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1250 AMDGPU::V_CMP_LT_I64_e64);
1251 case CmpInst::ICMP_SLE:
1252 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1253 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1254 AMDGPU::V_CMP_LE_I64_e64);
1255 case CmpInst::ICMP_UGT:
1256 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1257 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1258 AMDGPU::V_CMP_GT_U64_e64);
1259 case CmpInst::ICMP_UGE:
1260 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1261 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1262 AMDGPU::V_CMP_GE_U64_e64);
1263 case CmpInst::ICMP_ULT:
1264 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1265 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1266 AMDGPU::V_CMP_LT_U64_e64);
1267 case CmpInst::ICMP_ULE:
1268 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1269 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1270 AMDGPU::V_CMP_LE_U64_e64);
1271
1272 case CmpInst::FCMP_OEQ:
1273 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1274 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1275 AMDGPU::V_CMP_EQ_F64_e64);
1276 case CmpInst::FCMP_OGT:
1277 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1278 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1279 AMDGPU::V_CMP_GT_F64_e64);
1280 case CmpInst::FCMP_OGE:
1281 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1282 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1283 AMDGPU::V_CMP_GE_F64_e64);
1284 case CmpInst::FCMP_OLT:
1285 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1286 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1287 AMDGPU::V_CMP_LT_F64_e64);
1288 case CmpInst::FCMP_OLE:
1289 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1290 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1291 AMDGPU::V_CMP_LE_F64_e64);
1292 case CmpInst::FCMP_ONE:
1293 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1294 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1295 AMDGPU::V_CMP_NEQ_F64_e64);
1296 case CmpInst::FCMP_ORD:
1297 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1298 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1299 AMDGPU::V_CMP_O_F64_e64);
1300 case CmpInst::FCMP_UNO:
1301 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1302 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1303 AMDGPU::V_CMP_U_F64_e64);
1304 case CmpInst::FCMP_UEQ:
1305 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1306 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1307 AMDGPU::V_CMP_NLG_F64_e64);
1308 case CmpInst::FCMP_UGT:
1309 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1310 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1311 AMDGPU::V_CMP_NLE_F64_e64);
1312 case CmpInst::FCMP_UGE:
1313 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1314 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1315 AMDGPU::V_CMP_NLT_F64_e64);
1316 case CmpInst::FCMP_ULT:
1317 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1318 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1319 AMDGPU::V_CMP_NGE_F64_e64);
1320 case CmpInst::FCMP_ULE:
1321 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1322 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1323 AMDGPU::V_CMP_NGT_F64_e64);
1324 case CmpInst::FCMP_UNE:
1325 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1326 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1327 AMDGPU::V_CMP_NEQ_F64_e64);
1328 case CmpInst::FCMP_TRUE:
1329 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1330 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1331 AMDGPU::V_CMP_TRU_F64_e64);
1332   case CmpInst::FCMP_FALSE:
1333     return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1334 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1335 AMDGPU::V_CMP_F_F64_e64);
1336 }
1337}
1338
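// Scalar (S_CMP) counterpart of the above: 64-bit compares only support
// equality and inequality (and need scalar 64-bit compares), and 16-bit
// compares require the SALU float instructions. Unsupported combinations
// return -1 so the caller can fall back to a VALU compare.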
1339int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1340 unsigned Size) const {
1341 if (Size == 64) {
1342 if (!STI.hasScalarCompareEq64())
1343 return -1;
1344
1345 switch (P) {
1346 case CmpInst::ICMP_NE:
1347 return AMDGPU::S_CMP_LG_U64;
1348 case CmpInst::ICMP_EQ:
1349 return AMDGPU::S_CMP_EQ_U64;
1350 default:
1351 return -1;
1352 }
1353 }
1354
1355 if (Size == 32) {
1356 switch (P) {
1357 case CmpInst::ICMP_NE:
1358 return AMDGPU::S_CMP_LG_U32;
1359 case CmpInst::ICMP_EQ:
1360 return AMDGPU::S_CMP_EQ_U32;
1361 case CmpInst::ICMP_SGT:
1362 return AMDGPU::S_CMP_GT_I32;
1363 case CmpInst::ICMP_SGE:
1364 return AMDGPU::S_CMP_GE_I32;
1365 case CmpInst::ICMP_SLT:
1366 return AMDGPU::S_CMP_LT_I32;
1367 case CmpInst::ICMP_SLE:
1368 return AMDGPU::S_CMP_LE_I32;
1369 case CmpInst::ICMP_UGT:
1370 return AMDGPU::S_CMP_GT_U32;
1371 case CmpInst::ICMP_UGE:
1372 return AMDGPU::S_CMP_GE_U32;
1373 case CmpInst::ICMP_ULT:
1374 return AMDGPU::S_CMP_LT_U32;
1375 case CmpInst::ICMP_ULE:
1376 return AMDGPU::S_CMP_LE_U32;
1377 case CmpInst::FCMP_OEQ:
1378 return AMDGPU::S_CMP_EQ_F32;
1379 case CmpInst::FCMP_OGT:
1380 return AMDGPU::S_CMP_GT_F32;
1381 case CmpInst::FCMP_OGE:
1382 return AMDGPU::S_CMP_GE_F32;
1383 case CmpInst::FCMP_OLT:
1384 return AMDGPU::S_CMP_LT_F32;
1385 case CmpInst::FCMP_OLE:
1386 return AMDGPU::S_CMP_LE_F32;
1387 case CmpInst::FCMP_ONE:
1388 return AMDGPU::S_CMP_LG_F32;
1389 case CmpInst::FCMP_ORD:
1390 return AMDGPU::S_CMP_O_F32;
1391 case CmpInst::FCMP_UNO:
1392 return AMDGPU::S_CMP_U_F32;
1393 case CmpInst::FCMP_UEQ:
1394 return AMDGPU::S_CMP_NLG_F32;
1395 case CmpInst::FCMP_UGT:
1396 return AMDGPU::S_CMP_NLE_F32;
1397 case CmpInst::FCMP_UGE:
1398 return AMDGPU::S_CMP_NLT_F32;
1399 case CmpInst::FCMP_ULT:
1400 return AMDGPU::S_CMP_NGE_F32;
1401 case CmpInst::FCMP_ULE:
1402 return AMDGPU::S_CMP_NGT_F32;
1403 case CmpInst::FCMP_UNE:
1404 return AMDGPU::S_CMP_NEQ_F32;
1405 default:
1406 llvm_unreachable("Unknown condition code!");
1407 }
1408 }
1409
1410 if (Size == 16) {
1411 if (!STI.hasSALUFloatInsts())
1412 return -1;
1413
1414 switch (P) {
1415 case CmpInst::FCMP_OEQ:
1416 return AMDGPU::S_CMP_EQ_F16;
1417 case CmpInst::FCMP_OGT:
1418 return AMDGPU::S_CMP_GT_F16;
1419 case CmpInst::FCMP_OGE:
1420 return AMDGPU::S_CMP_GE_F16;
1421 case CmpInst::FCMP_OLT:
1422 return AMDGPU::S_CMP_LT_F16;
1423 case CmpInst::FCMP_OLE:
1424 return AMDGPU::S_CMP_LE_F16;
1425 case CmpInst::FCMP_ONE:
1426 return AMDGPU::S_CMP_LG_F16;
1427 case CmpInst::FCMP_ORD:
1428 return AMDGPU::S_CMP_O_F16;
1429 case CmpInst::FCMP_UNO:
1430 return AMDGPU::S_CMP_U_F16;
1431 case CmpInst::FCMP_UEQ:
1432 return AMDGPU::S_CMP_NLG_F16;
1433 case CmpInst::FCMP_UGT:
1434 return AMDGPU::S_CMP_NLE_F16;
1435 case CmpInst::FCMP_UGE:
1436 return AMDGPU::S_CMP_NLT_F16;
1437 case CmpInst::FCMP_ULT:
1438 return AMDGPU::S_CMP_NGE_F16;
1439 case CmpInst::FCMP_ULE:
1440 return AMDGPU::S_CMP_NGT_F16;
1441 case CmpInst::FCMP_UNE:
1442 return AMDGPU::S_CMP_NEQ_F16;
1443 default:
1444 llvm_unreachable("Unknown condition code!");
1445 }
1446 }
1447
1448 return -1;
1449}
1450
1451bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1452
1453 MachineBasicBlock *BB = I.getParent();
1454 const DebugLoc &DL = I.getDebugLoc();
1455
1456 Register SrcReg = I.getOperand(2).getReg();
1457 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1458
1459 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1460
1461 Register CCReg = I.getOperand(0).getReg();
1462 if (!isVCC(CCReg, *MRI)) {
1463 int Opcode = getS_CMPOpcode(Pred, Size);
1464 if (Opcode == -1)
1465 return false;
1466 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1467 .add(I.getOperand(2))
1468 .add(I.getOperand(3));
1469 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1470 .addReg(AMDGPU::SCC);
1471 bool Ret =
1472 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1473 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1474 I.eraseFromParent();
1475 return Ret;
1476 }
1477
1478 if (I.getOpcode() == AMDGPU::G_FCMP)
1479 return false;
1480
1481 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1482 if (Opcode == -1)
1483 return false;
1484
1485   MachineInstrBuilder ICmp;
1486   // t16 instructions
1487 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) {
1488 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1489 .addImm(0)
1490 .add(I.getOperand(2))
1491 .addImm(0)
1492 .add(I.getOperand(3))
1493 .addImm(0); // op_sel
1494 } else {
1495 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1496 .add(I.getOperand(2))
1497 .add(I.getOperand(3));
1498 }
1499
1500   RBI.constrainGenericRegister(CCReg,
1501                                *TRI.getBoolRC(), *MRI);
1502 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1503 I.eraseFromParent();
1504 return Ret;
1505}
1506
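// Select llvm.amdgcn.icmp / llvm.amdgcn.fcmp, which return a full wave-sized
// lane mask in SGPRs rather than a VCC-bank value. Predicates that are neither
// integer nor FP produce an IMPLICIT_DEF; otherwise a VALU V_CMP is emitted
// with folded source modifiers and the result is constrained to the bool
// register class.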
1507bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1508 Register Dst = I.getOperand(0).getReg();
1509 if (isVCC(Dst, *MRI))
1510 return false;
1511
1512 LLT DstTy = MRI->getType(Dst);
1513 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1514 return false;
1515
1516 MachineBasicBlock *BB = I.getParent();
1517 const DebugLoc &DL = I.getDebugLoc();
1518 Register SrcReg = I.getOperand(2).getReg();
1519 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1520
1521 // i1 inputs are not supported in GlobalISel.
1522 if (Size == 1)
1523 return false;
1524
1525 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1526 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1527 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1528 I.eraseFromParent();
1529 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1530 }
1531
1532 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1533 if (Opcode == -1)
1534 return false;
1535
1536 MachineInstrBuilder SelectedMI;
1537 MachineOperand &LHS = I.getOperand(2);
1538 MachineOperand &RHS = I.getOperand(3);
1539 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1540 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1541 Register Src0Reg =
1542 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1543 Register Src1Reg =
1544 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1545 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1546 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1547 SelectedMI.addImm(Src0Mods);
1548 SelectedMI.addReg(Src0Reg);
1549 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1550 SelectedMI.addImm(Src1Mods);
1551 SelectedMI.addReg(Src1Reg);
1552 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1553 SelectedMI.addImm(0); // clamp
1554 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1555 SelectedMI.addImm(0); // op_sel
1556
1557 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1558 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1559 return false;
1560
1561 I.eraseFromParent();
1562 return true;
1563}
1564
1565// Ballot has to zero the bits in the input lane-mask that are zero in the
1566// current exec; this is done as an AND with exec. For inputs that are results
1567// of an instruction that implicitly uses the same exec (for example, compares
1568// in the same basic block, or an SCC-to-VCC copy), a plain copy is used.
1569 static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
1570                                     MachineBasicBlock *MBB) {
1571   MachineInstr *MI = MRI.getVRegDef(Reg);
1572 if (MI->getParent() != MBB)
1573 return false;
1574
1575 // Lane mask generated by SCC to VCC copy.
1576 if (MI->getOpcode() == AMDGPU::COPY) {
1577 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1578 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1579 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1580 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1581 return true;
1582 }
1583
1584 // Lane mask generated using compare with same exec.
1585 if (isa<GAnyCmp>(MI))
1586 return true;
1587
1588 Register LHS, RHS;
1589 // Look through AND.
1590 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1591 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1592          isLaneMaskFromSameBlock(RHS, MRI, MBB);
1593
1594 return false;
1595}
1596
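// Select llvm.amdgcn.ballot. Constant inputs fold to 0 or EXEC, lane masks
// already produced under the same exec in this block are copied directly, and
// anything else is ANDed with EXEC. An i64 ballot in wave32 mode is
// zero-extended to 64 bits with a REG_SEQUENCE.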
1597bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1598 MachineBasicBlock *BB = I.getParent();
1599 const DebugLoc &DL = I.getDebugLoc();
1600 Register DstReg = I.getOperand(0).getReg();
1601 Register SrcReg = I.getOperand(2).getReg();
1602 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1603 const unsigned WaveSize = STI.getWavefrontSize();
1604
1605 // In the common case, the return type matches the wave size.
1606 // However we also support emitting i64 ballots in wave32 mode.
1607 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1608 return false;
1609
1610 std::optional<ValueAndVReg> Arg =
1611 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1612
1613 Register Dst = DstReg;
1614 // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1615 if (BallotSize != WaveSize) {
1616 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1617 }
1618
1619 if (Arg) {
1620 const int64_t Value = Arg->Value.getZExtValue();
1621 if (Value == 0) {
1622 // Dst = S_MOV 0
1623 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1624 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1625 } else {
1626 // Dst = COPY EXEC
1627 assert(Value == 1);
1628 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1629 }
1630 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1631 return false;
1632 } else {
1633 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1634 // Dst = COPY SrcReg
1635 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1636 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1637 return false;
1638 } else {
1639 // Dst = S_AND SrcReg, EXEC
1640 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1641 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1642 .addReg(SrcReg)
1643 .addReg(TRI.getExec())
1644 .setOperandDead(3); // Dead scc
1645 if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
1646 return false;
1647 }
1648 }
1649
1650 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1651 if (BallotSize != WaveSize) {
1652 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1653 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1654 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1655 .addReg(Dst)
1656 .addImm(AMDGPU::sub0)
1657 .addReg(HiReg)
1658 .addImm(AMDGPU::sub1);
1659 }
1660
1661 I.eraseFromParent();
1662 return true;
1663}
1664
1665bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1666 Register DstReg = I.getOperand(0).getReg();
1667 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1668 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1669 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1670 return false;
1671
1672 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1673
1674 Module *M = MF->getFunction().getParent();
1675 const MDNode *Metadata = I.getOperand(2).getMetadata();
1676 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1677 auto *RelocSymbol = cast<GlobalVariable>(
1678 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1679
1680 MachineBasicBlock *BB = I.getParent();
1681 BuildMI(*BB, &I, I.getDebugLoc(),
1682 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1683 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1684
1685 I.eraseFromParent();
1686 return true;
1687}
1688
1689bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1690 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1691
1692 Register DstReg = I.getOperand(0).getReg();
1693 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1694 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1695 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1696
1697 MachineBasicBlock *MBB = I.getParent();
1698 const DebugLoc &DL = I.getDebugLoc();
1699
1700 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1701
1702 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1703 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1704 MIB.addImm(MFI->getLDSSize());
1705 } else {
1706 Module *M = MF->getFunction().getParent();
1707 const GlobalValue *GV =
1708 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1709 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1710 }
1711
1712 I.eraseFromParent();
1713 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1714}
1715
1716bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1717 MachineBasicBlock *MBB = I.getParent();
1718 MachineFunction &MF = *MBB->getParent();
1719 const DebugLoc &DL = I.getDebugLoc();
1720
1721 MachineOperand &Dst = I.getOperand(0);
1722 Register DstReg = Dst.getReg();
1723 unsigned Depth = I.getOperand(2).getImm();
1724
1725 const TargetRegisterClass *RC
1726 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1727 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1728 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1729 return false;
1730
1731 // Check for kernel and shader functions
1732 if (Depth != 0 ||
1733 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1734 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1735 .addImm(0);
1736 I.eraseFromParent();
1737 return true;
1738 }
1739
1740 MachineFrameInfo &MFI = MF.getFrameInfo();
1741 // There is a call to @llvm.returnaddress in this function
1742 MFI.setReturnAddressIsTaken(true);
1743
1744 // Get the return address reg and mark it as an implicit live-in
1745 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1746 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1747 AMDGPU::SReg_64RegClass, DL);
1748 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1749 .addReg(LiveIn);
1750 I.eraseFromParent();
1751 return true;
1752}
1753
1754bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1755 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1756 // SelectionDAG uses for wave32 vs wave64.
1757 MachineBasicBlock *BB = MI.getParent();
1758 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1759 .add(MI.getOperand(1));
1760
1761 Register Reg = MI.getOperand(1).getReg();
1762 MI.eraseFromParent();
1763
1764 if (!MRI->getRegClassOrNull(Reg))
1765 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1766 return true;
1767}
1768
1769bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1770 MachineInstr &MI, Intrinsic::ID IntrID) const {
1771 MachineBasicBlock *MBB = MI.getParent();
1772 MachineFunction *MF = MBB->getParent();
1773 const DebugLoc &DL = MI.getDebugLoc();
1774
1775 unsigned IndexOperand = MI.getOperand(7).getImm();
1776 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1777 bool WaveDone = MI.getOperand(9).getImm() != 0;
1778
1779 if (WaveDone && !WaveRelease) {
1780 // TODO: Move this to IR verifier
1781 const Function &Fn = MF->getFunction();
1782 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1783 Fn, "ds_ordered_count: wave_done requires wave_release", DL));
1784 }
1785
1786 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1787 IndexOperand &= ~0x3f;
1788 unsigned CountDw = 0;
1789
1790 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1791 CountDw = (IndexOperand >> 24) & 0xf;
1792 IndexOperand &= ~(0xf << 24);
1793
1794 if (CountDw < 1 || CountDw > 4) {
1795 const Function &Fn = MF->getFunction();
1796 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1797 Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
1798 CountDw = 1;
1799 }
1800 }
1801
1802 if (IndexOperand) {
1803 const Function &Fn = MF->getFunction();
1804 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1805 Fn, "ds_ordered_count: bad index operand", DL));
1806 }
1807
1808 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1809 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1810
1811 unsigned Offset0 = OrderedCountIndex << 2;
1812 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1813
1814 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1815 Offset1 |= (CountDw - 1) << 6;
1816
1817 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1818 Offset1 |= ShaderType << 2;
1819
1820 unsigned Offset = Offset0 | (Offset1 << 8);
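 // Summary of the packing above: Offset[7:0] holds Offset0 (the ordered count
 // index scaled by 4), and Offset[15:8] holds Offset1, where bit 0 is
 // wave_release, bit 1 is wave_done, bits [3:2] are the shader type (pre-GFX11
 // only), bit 4 is the instruction (0 = ordered_add, 1 = ordered_swap), and
 // bits [7:6] are the dword count minus 1 (GFX10+ only).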
1821
1822 Register M0Val = MI.getOperand(2).getReg();
1823 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1824 .addReg(M0Val);
1825
1826 Register DstReg = MI.getOperand(0).getReg();
1827 Register ValReg = MI.getOperand(3).getReg();
1828 MachineInstrBuilder DS =
1829 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1830 .addReg(ValReg)
1831 .addImm(Offset)
1832 .cloneMemRefs(MI);
1833
1834 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1835 return false;
1836
1837 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1838 MI.eraseFromParent();
1839 return Ret;
1840}
1841
1842static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1843 switch (IntrID) {
1844 case Intrinsic::amdgcn_ds_gws_init:
1845 return AMDGPU::DS_GWS_INIT;
1846 case Intrinsic::amdgcn_ds_gws_barrier:
1847 return AMDGPU::DS_GWS_BARRIER;
1848 case Intrinsic::amdgcn_ds_gws_sema_v:
1849 return AMDGPU::DS_GWS_SEMA_V;
1850 case Intrinsic::amdgcn_ds_gws_sema_br:
1851 return AMDGPU::DS_GWS_SEMA_BR;
1852 case Intrinsic::amdgcn_ds_gws_sema_p:
1853 return AMDGPU::DS_GWS_SEMA_P;
1854 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1855 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1856 default:
1857 llvm_unreachable("not a gws intrinsic");
1858 }
1859}
1860
1861bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1862 Intrinsic::ID IID) const {
1863 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1864 !STI.hasGWSSemaReleaseAll()))
1865 return false;
1866
1867 // intrinsic ID, vsrc, offset
1868 const bool HasVSrc = MI.getNumOperands() == 3;
1869 assert(HasVSrc || MI.getNumOperands() == 2);
1870
1871 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1872 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1873 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1874 return false;
1875
1876 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1877 unsigned ImmOffset;
1878
1879 MachineBasicBlock *MBB = MI.getParent();
1880 const DebugLoc &DL = MI.getDebugLoc();
1881
1882 MachineInstr *Readfirstlane = nullptr;
1883
1884 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1885 // incoming offset, in case there's an add of a constant. We'll have to put it
1886 // back later.
1887 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1888 Readfirstlane = OffsetDef;
1889 BaseOffset = OffsetDef->getOperand(1).getReg();
1890 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1891 }
1892
1893 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1894 // If we have a constant offset, try to use the 0 in m0 as the base.
1895 // TODO: Look into changing the default m0 initialization value. If the
1896 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1897 // the immediate offset.
1898
1899 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1900 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1901 .addImm(0);
1902 } else {
1903 std::tie(BaseOffset, ImmOffset) =
1904 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, VT);
1905
1906 if (Readfirstlane) {
1907 // We have the constant offset now, so put the readfirstlane back on the
1908 // variable component.
1909 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1910 return false;
1911
1912 Readfirstlane->getOperand(1).setReg(BaseOffset);
1913 BaseOffset = Readfirstlane->getOperand(0).getReg();
1914 } else {
1915 if (!RBI.constrainGenericRegister(BaseOffset,
1916 AMDGPU::SReg_32RegClass, *MRI))
1917 return false;
1918 }
1919
1920 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1921 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1922 .addReg(BaseOffset)
1923 .addImm(16)
1924 .setOperandDead(3); // Dead scc
1925
1926 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1927 .addReg(M0Base);
1928 }
1929
1930 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1931 // offset field) % 64. Some versions of the programming guide omit the m0
1932 // part, or claim it's from offset 0.
1933 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1934
1935 if (HasVSrc) {
1936 Register VSrc = MI.getOperand(1).getReg();
1937 MIB.addReg(VSrc);
1938
1939 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1940 return false;
1941 }
1942
1943 MIB.addImm(ImmOffset)
1944 .cloneMemRefs(MI);
1945
1946 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1947
1948 MI.eraseFromParent();
1949 return true;
1950}
1951
1952bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1953 bool IsAppend) const {
1954 Register PtrBase = MI.getOperand(2).getReg();
1955 LLT PtrTy = MRI->getType(PtrBase);
1956 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1957
1958 unsigned Offset;
1959 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1960
1961 // TODO: Should this try to look through readfirstlane like GWS?
1962 if (!isDSOffsetLegal(PtrBase, Offset)) {
1963 PtrBase = MI.getOperand(2).getReg();
1964 Offset = 0;
1965 }
1966
1967 MachineBasicBlock *MBB = MI.getParent();
1968 const DebugLoc &DL = MI.getDebugLoc();
1969 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1970
1971 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1972 .addReg(PtrBase);
1973 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1974 return false;
1975
1976 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1977 .addImm(Offset)
1978 .addImm(IsGDS ? -1 : 0)
1979 .cloneMemRefs(MI);
1980 MI.eraseFromParent();
1981 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1982}
1983
1984bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
1985 MachineFunction *MF = MI.getParent()->getParent();
1986 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
1987
1988 MFInfo->setInitWholeWave();
1989 return selectImpl(MI, *CoverageInfo);
1990}
1991
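// Note on the field layout decoded below: TexFailCtrl carries TFE in bit 0 and
// LWE in bit 1. IsTexFail is set if any bit at all is set, and the helper
// returns false (rejecting selection) when bits other than those two remain.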
1992static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1993 bool &IsTexFail) {
1994 if (TexFailCtrl)
1995 IsTexFail = true;
1996
1997 TFE = TexFailCtrl & 0x1;
1998 TexFailCtrl &= ~(uint64_t)0x1;
1999 LWE = TexFailCtrl & 0x2;
2000 TexFailCtrl &= ~(uint64_t)0x2;
2001
2002 return TexFailCtrl == 0;
2003}
2004
2005bool AMDGPUInstructionSelector::selectImageIntrinsic(
2006 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2007 MachineBasicBlock *MBB = MI.getParent();
2008 const DebugLoc &DL = MI.getDebugLoc();
2009
2010 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2011 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
2012
2013 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
2014 unsigned IntrOpcode = Intr->BaseOpcode;
2015 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2016 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2017 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2018
2019 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2020
2021 Register VDataIn, VDataOut;
2022 LLT VDataTy;
2023 int NumVDataDwords = -1;
2024 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2025 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2026
2027 bool Unorm;
2028 if (!BaseOpcode->Sampler)
2029 Unorm = true;
2030 else
2031 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
2032
2033 bool TFE;
2034 bool LWE;
2035 bool IsTexFail = false;
2036 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2037 TFE, LWE, IsTexFail))
2038 return false;
2039
2040 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
2041 const bool IsA16 = (Flags & 1) != 0;
2042 const bool IsG16 = (Flags & 2) != 0;
2043
2044 // A16 implies 16 bit gradients if subtarget doesn't support G16
2045 if (IsA16 && !STI.hasG16() && !IsG16)
2046 return false;
2047
2048 unsigned DMask = 0;
2049 unsigned DMaskLanes = 0;
2050
2051 if (BaseOpcode->Atomic) {
2052 VDataOut = MI.getOperand(0).getReg();
2053 VDataIn = MI.getOperand(2).getReg();
2054 LLT Ty = MRI->getType(VDataIn);
2055
2056 // Be careful to allow atomic swap on 16-bit element vectors.
2057 const bool Is64Bit = BaseOpcode->AtomicX2 ?
2058 Ty.getSizeInBits() == 128 :
2059 Ty.getSizeInBits() == 64;
2060
2061 if (BaseOpcode->AtomicX2) {
2062 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2063
2064 DMask = Is64Bit ? 0xf : 0x3;
2065 NumVDataDwords = Is64Bit ? 4 : 2;
2066 } else {
2067 DMask = Is64Bit ? 0x3 : 0x1;
2068 NumVDataDwords = Is64Bit ? 2 : 1;
2069 }
2070 } else {
2071 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
2072 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
2073
2074 if (BaseOpcode->Store) {
2075 VDataIn = MI.getOperand(1).getReg();
2076 VDataTy = MRI->getType(VDataIn);
2077 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2078 } else if (BaseOpcode->NoReturn) {
2079 NumVDataDwords = 0;
2080 } else {
2081 VDataOut = MI.getOperand(0).getReg();
2082 VDataTy = MRI->getType(VDataOut);
2083 NumVDataDwords = DMaskLanes;
2084
2085 if (IsD16 && !STI.hasUnpackedD16VMem())
2086 NumVDataDwords = (DMaskLanes + 1) / 2;
2087 }
2088 }
2089
2090 // Set G16 opcode
2091 if (Subtarget->hasG16() && IsG16) {
2092 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2093 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
2094 assert(G16MappingInfo);
2095 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2096 }
2097
2098 // TODO: Check this in verifier.
2099 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2100
2101 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2102 if (BaseOpcode->Atomic)
2103 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
2104 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2105 AMDGPU::CPol::VOLATILE))
2106 return false;
2107
2108 int NumVAddrRegs = 0;
2109 int NumVAddrDwords = 0;
2110 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2111 // Skip the $noregs and 0s inserted during legalization.
2112 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2113 if (!AddrOp.isReg())
2114 continue; // XXX - Break?
2115
2116 Register Addr = AddrOp.getReg();
2117 if (!Addr)
2118 break;
2119
2120 ++NumVAddrRegs;
2121 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2122 }
2123
2124 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2125 // NSA, these should have been packed into a single value in the first
2126 // address register
2127 const bool UseNSA =
2128 NumVAddrRegs != 1 &&
2129 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2130 : NumVAddrDwords == NumVAddrRegs);
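 // The check above encodes this: with partial NSA support the trailing
 // addresses may be packed into the last address register, so the dword count
 // can exceed the register count; full NSA requires exactly one dword per
 // address register.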
2131 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2132 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2133 return false;
2134 }
2135
2136 if (IsTexFail)
2137 ++NumVDataDwords;
2138
2139 int Opcode = -1;
2140 if (IsGFX12Plus) {
2141 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2142 NumVDataDwords, NumVAddrDwords);
2143 } else if (IsGFX11Plus) {
2144 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2145 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2146 : AMDGPU::MIMGEncGfx11Default,
2147 NumVDataDwords, NumVAddrDwords);
2148 } else if (IsGFX10Plus) {
2149 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2150 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2151 : AMDGPU::MIMGEncGfx10Default,
2152 NumVDataDwords, NumVAddrDwords);
2153 } else {
2154 if (Subtarget->hasGFX90AInsts()) {
2155 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2156 NumVDataDwords, NumVAddrDwords);
2157 if (Opcode == -1) {
2158 LLVM_DEBUG(
2159 dbgs()
2160 << "requested image instruction is not supported on this GPU\n");
2161 return false;
2162 }
2163 }
2164 if (Opcode == -1 &&
2165 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2166 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2167 NumVDataDwords, NumVAddrDwords);
2168 if (Opcode == -1)
2169 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2170 NumVDataDwords, NumVAddrDwords);
2171 }
2172 if (Opcode == -1)
2173 return false;
2174
2175 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2176 .cloneMemRefs(MI);
2177
2178 if (VDataOut) {
2179 if (BaseOpcode->AtomicX2) {
2180 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2181
2182 Register TmpReg = MRI->createVirtualRegister(
2183 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2184 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2185
2186 MIB.addDef(TmpReg);
2187 if (!MRI->use_empty(VDataOut)) {
2188 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2189 .addReg(TmpReg, RegState::Kill, SubReg);
2190 }
2191
2192 } else {
2193 MIB.addDef(VDataOut); // vdata output
2194 }
2195 }
2196
2197 if (VDataIn)
2198 MIB.addReg(VDataIn); // vdata input
2199
2200 for (int I = 0; I != NumVAddrRegs; ++I) {
2201 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2202 if (SrcOp.isReg()) {
2203 assert(SrcOp.getReg() != 0);
2204 MIB.addReg(SrcOp.getReg());
2205 }
2206 }
2207
2208 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2209 if (BaseOpcode->Sampler)
2210 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2211
2212 MIB.addImm(DMask); // dmask
2213
2214 if (IsGFX10Plus)
2215 MIB.addImm(DimInfo->Encoding);
2216 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2217 MIB.addImm(Unorm);
2218
2219 MIB.addImm(CPol);
2220 MIB.addImm(IsA16 && // a16 or r128
2221 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2222 if (IsGFX10Plus)
2223 MIB.addImm(IsA16 ? -1 : 0);
2224
2225 if (!Subtarget->hasGFX90AInsts()) {
2226 MIB.addImm(TFE); // tfe
2227 } else if (TFE) {
2228 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2229 return false;
2230 }
2231
2232 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2233 MIB.addImm(LWE); // lwe
2234 if (!IsGFX10Plus)
2235 MIB.addImm(DimInfo->DA ? -1 : 0);
2236 if (BaseOpcode->HasD16)
2237 MIB.addImm(IsD16 ? -1 : 0);
2238
2239 MI.eraseFromParent();
2240 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2241 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2242 return true;
2243}
2244
2245// We need to handle this here because tablegen doesn't support matching
2246// instructions with multiple outputs.
2247bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2248 MachineInstr &MI) const {
2249 Register Dst0 = MI.getOperand(0).getReg();
2250 Register Dst1 = MI.getOperand(1).getReg();
2251
2252 const DebugLoc &DL = MI.getDebugLoc();
2253 MachineBasicBlock *MBB = MI.getParent();
2254
2255 Register Addr = MI.getOperand(3).getReg();
2256 Register Data0 = MI.getOperand(4).getReg();
2257 Register Data1 = MI.getOperand(5).getReg();
2258 unsigned Offset = MI.getOperand(6).getImm();
2259
2260 unsigned Opc;
2261 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2262 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2263 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2264 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2265 break;
2266 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2267 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2268 break;
2269 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2270 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2271 break;
2272 }
2273
2274 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
2275 .addDef(Dst1)
2276 .addUse(Addr)
2277 .addUse(Data0)
2278 .addUse(Data1)
2279 .addImm(Offset)
2280 .cloneMemRefs(MI);
2281
2282 MI.eraseFromParent();
2283 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2284}
2285
2286bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2287 MachineInstr &I) const {
2288 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2289 switch (IntrinsicID) {
2290 case Intrinsic::amdgcn_end_cf:
2291 return selectEndCfIntrinsic(I);
2292 case Intrinsic::amdgcn_ds_ordered_add:
2293 case Intrinsic::amdgcn_ds_ordered_swap:
2294 return selectDSOrderedIntrinsic(I, IntrinsicID);
2295 case Intrinsic::amdgcn_ds_gws_init:
2296 case Intrinsic::amdgcn_ds_gws_barrier:
2297 case Intrinsic::amdgcn_ds_gws_sema_v:
2298 case Intrinsic::amdgcn_ds_gws_sema_br:
2299 case Intrinsic::amdgcn_ds_gws_sema_p:
2300 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2301 return selectDSGWSIntrinsic(I, IntrinsicID);
2302 case Intrinsic::amdgcn_ds_append:
2303 return selectDSAppendConsume(I, true);
2304 case Intrinsic::amdgcn_ds_consume:
2305 return selectDSAppendConsume(I, false);
2306 case Intrinsic::amdgcn_init_whole_wave:
2307 return selectInitWholeWave(I);
2308 case Intrinsic::amdgcn_raw_buffer_load_lds:
2309 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2310 case Intrinsic::amdgcn_struct_buffer_load_lds:
2311 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2312 return selectBufferLoadLds(I);
2313 // Until we can store both the address space of the global and the LDS
2314 // arguments by having two MachineMemOperands on an intrinsic, we just trust
2315 // that the argument is a global pointer (buffer pointers have been handled by
2316 // an LLVM IR-level lowering).
2317 case Intrinsic::amdgcn_load_to_lds:
2318 case Intrinsic::amdgcn_global_load_lds:
2319 return selectGlobalLoadLds(I);
2320 case Intrinsic::amdgcn_exp_compr:
2321 if (!STI.hasCompressedExport()) {
2322 Function &F = I.getMF()->getFunction();
2323 F.getContext().diagnose(
2324 DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2325 I.getDebugLoc(), DS_Error));
2326 return false;
2327 }
2328 break;
2329 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2330 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2331 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2332 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2333 return selectDSBvhStackIntrinsic(I);
2334 case Intrinsic::amdgcn_s_barrier_init:
2335 case Intrinsic::amdgcn_s_barrier_signal_var:
2336 return selectNamedBarrierInit(I, IntrinsicID);
2337 case Intrinsic::amdgcn_s_barrier_join:
2338 case Intrinsic::amdgcn_s_get_named_barrier_state:
2339 return selectNamedBarrierInst(I, IntrinsicID);
2340 case Intrinsic::amdgcn_s_get_barrier_state:
2341 return selectSGetBarrierState(I, IntrinsicID);
2342 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2343 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2344 }
2345 return selectImpl(I, *CoverageInfo);
2346}
2347
2348bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2349 if (selectImpl(I, *CoverageInfo))
2350 return true;
2351
2352 MachineBasicBlock *BB = I.getParent();
2353 const DebugLoc &DL = I.getDebugLoc();
2354
2355 Register DstReg = I.getOperand(0).getReg();
2356 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2357 assert(Size <= 32 || Size == 64);
2358 const MachineOperand &CCOp = I.getOperand(1);
2359 Register CCReg = CCOp.getReg();
2360 if (!isVCC(CCReg, *MRI)) {
2361 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2362 AMDGPU::S_CSELECT_B32;
2363 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2364 .addReg(CCReg);
2365
2366 // The generic constrainSelectedInstRegOperands doesn't work for the scc
2367 // register bank, because it does not cover the register class that we use to
2368 // represent it. So we need to set the register class manually here.
2369 if (!MRI->getRegClassOrNull(CCReg))
2370 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2371 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2372 .add(I.getOperand(2))
2373 .add(I.getOperand(3));
2374
2375 bool Ret = false;
2376 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2377 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2378 I.eraseFromParent();
2379 return Ret;
2380 }
2381
2382 // Wide VGPR select should have been split in RegBankSelect.
2383 if (Size > 32)
2384 return false;
2385
2386 MachineInstr *Select =
2387 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2388 .addImm(0)
2389 .add(I.getOperand(3))
2390 .addImm(0)
2391 .add(I.getOperand(2))
2392 .add(I.getOperand(1));
2393
2394 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2395 I.eraseFromParent();
2396 return Ret;
2397}
2398
2399bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2400 Register DstReg = I.getOperand(0).getReg();
2401 Register SrcReg = I.getOperand(1).getReg();
2402 const LLT DstTy = MRI->getType(DstReg);
2403 const LLT SrcTy = MRI->getType(SrcReg);
2404 const LLT S1 = LLT::scalar(1);
2405
2406 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2407 const RegisterBank *DstRB;
2408 if (DstTy == S1) {
2409 // This is a special case. We don't treat s1 for legalization artifacts as
2410 // vcc booleans.
2411 DstRB = SrcRB;
2412 } else {
2413 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2414 if (SrcRB != DstRB)
2415 return false;
2416 }
2417
2418 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2419
2420 unsigned DstSize = DstTy.getSizeInBits();
2421 unsigned SrcSize = SrcTy.getSizeInBits();
2422
2423 const TargetRegisterClass *SrcRC =
2424 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2425 const TargetRegisterClass *DstRC =
2426 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2427 if (!SrcRC || !DstRC)
2428 return false;
2429
2430 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2431 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2432 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2433 return false;
2434 }
2435
2436 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2437 assert(STI.useRealTrue16Insts());
2438 const DebugLoc &DL = I.getDebugLoc();
2439 MachineBasicBlock *MBB = I.getParent();
2440 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2441 .addReg(SrcReg, 0, AMDGPU::lo16);
2442 I.eraseFromParent();
2443 return true;
2444 }
2445
2446 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2447 MachineBasicBlock *MBB = I.getParent();
2448 const DebugLoc &DL = I.getDebugLoc();
2449
2450 Register LoReg = MRI->createVirtualRegister(DstRC);
2451 Register HiReg = MRI->createVirtualRegister(DstRC);
2452 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2453 .addReg(SrcReg, 0, AMDGPU::sub0);
2454 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2455 .addReg(SrcReg, 0, AMDGPU::sub1);
2456
2457 if (IsVALU && STI.hasSDWA()) {
2458 // Write the low 16-bits of the high element into the high 16-bits of the
2459 // low element.
2460 MachineInstr *MovSDWA =
2461 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2462 .addImm(0) // $src0_modifiers
2463 .addReg(HiReg) // $src0
2464 .addImm(0) // $clamp
2465 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2466 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2467 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2468 .addReg(LoReg, RegState::Implicit);
2469 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2470 } else {
2471 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2472 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2473 Register ImmReg = MRI->createVirtualRegister(DstRC);
2474 if (IsVALU) {
2475 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2476 .addImm(16)
2477 .addReg(HiReg);
2478 } else {
2479 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2480 .addReg(HiReg)
2481 .addImm(16)
2482 .setOperandDead(3); // Dead scc
2483 }
2484
2485 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2486 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2487 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2488
2489 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2490 .addImm(0xffff);
2491 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2492 .addReg(LoReg)
2493 .addReg(ImmReg);
2494 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2495 .addReg(TmpReg0)
2496 .addReg(TmpReg1);
2497
2498 if (!IsVALU) {
2499 And.setOperandDead(3); // Dead scc
2500 Or.setOperandDead(3); // Dead scc
2501 }
2502 }
2503
2504 I.eraseFromParent();
2505 return true;
2506 }
2507
2508 if (!DstTy.isScalar())
2509 return false;
2510
2511 if (SrcSize > 32) {
2512 unsigned SubRegIdx = DstSize < 32
2513 ? static_cast<unsigned>(AMDGPU::sub0)
2514 : TRI.getSubRegFromChannel(0, DstSize / 32);
2515 if (SubRegIdx == AMDGPU::NoSubRegister)
2516 return false;
2517
2518 // Deal with weird cases where the class only partially supports the subreg
2519 // index.
2520 const TargetRegisterClass *SrcWithSubRC
2521 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2522 if (!SrcWithSubRC)
2523 return false;
2524
2525 if (SrcWithSubRC != SrcRC) {
2526 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2527 return false;
2528 }
2529
2530 I.getOperand(1).setSubReg(SubRegIdx);
2531 }
2532
2533 I.setDesc(TII.get(TargetOpcode::COPY));
2534 return true;
2535}
2536
2537/// \returns true if a bitmask for \p Size bits will be an inline immediate.
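/// For example, Size == 4 gives Mask == 0xf (15), which fits the inline
/// immediate range [-16, 64], while Size == 16 gives 0xffff (65535), which
/// does not.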
2538static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2539 Mask = maskTrailingOnes<unsigned>(Size);
2540 int SignedMask = static_cast<int>(Mask);
2541 return SignedMask >= -16 && SignedMask <= 64;
2542}
2543
2544// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2545const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2546 Register Reg, const MachineRegisterInfo &MRI,
2547 const TargetRegisterInfo &TRI) const {
2548 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2549 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2550 return RB;
2551
2552 // Ignore the type, since we don't use vcc in artifacts.
2553 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2554 return &RBI.getRegBankFromRegClass(*RC, LLT());
2555 return nullptr;
2556}
2557
2558bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2559 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2560 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2561 const DebugLoc &DL = I.getDebugLoc();
2562 MachineBasicBlock &MBB = *I.getParent();
2563 const Register DstReg = I.getOperand(0).getReg();
2564 const Register SrcReg = I.getOperand(1).getReg();
2565
2566 const LLT DstTy = MRI->getType(DstReg);
2567 const LLT SrcTy = MRI->getType(SrcReg);
2568 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2569 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2570 const unsigned DstSize = DstTy.getSizeInBits();
2571 if (!DstTy.isScalar())
2572 return false;
2573
2574 // Artifact casts should never use vcc.
2575 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2576
2577 // FIXME: This should probably be illegal and split earlier.
2578 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2579 if (DstSize <= 32)
2580 return selectCOPY(I);
2581
2582 const TargetRegisterClass *SrcRC =
2583 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2584 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2585 const TargetRegisterClass *DstRC =
2586 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2587
2588 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2589 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2590 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2591 .addReg(SrcReg)
2592 .addImm(AMDGPU::sub0)
2593 .addReg(UndefReg)
2594 .addImm(AMDGPU::sub1);
2595 I.eraseFromParent();
2596
2597 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2598 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2599 }
2600
2601 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2602 // 64-bit should have been split up in RegBankSelect
2603
2604 // Try to use an and with a mask if it will save code size.
2605 unsigned Mask;
2606 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2607 MachineInstr *ExtI =
2608 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2609 .addImm(Mask)
2610 .addReg(SrcReg);
2611 I.eraseFromParent();
2612 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2613 }
2614
2615 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2616 MachineInstr *ExtI =
2617 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2618 .addReg(SrcReg)
2619 .addImm(0) // Offset
2620 .addImm(SrcSize); // Width
2621 I.eraseFromParent();
2622 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2623 }
2624
2625 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2626 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2627 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2628 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2629 return false;
2630
2631 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2632 const unsigned SextOpc = SrcSize == 8 ?
2633 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2634 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2635 .addReg(SrcReg);
2636 I.eraseFromParent();
2637 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2638 }
2639
2640 // Using a single 32-bit SALU to calculate the high half is smaller than
2641 // S_BFE with a literal constant operand.
2642 if (DstSize > 32 && SrcSize == 32) {
2643 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2644 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2645 if (Signed) {
2646 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2647 .addReg(SrcReg, 0, SubReg)
2648 .addImm(31)
2649 .setOperandDead(3); // Dead scc
2650 } else {
2651 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2652 .addImm(0);
2653 }
2654 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2655 .addReg(SrcReg, 0, SubReg)
2656 .addImm(AMDGPU::sub0)
2657 .addReg(HiReg)
2658 .addImm(AMDGPU::sub1);
2659 I.eraseFromParent();
2660 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2661 *MRI);
2662 }
2663
2664 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2665 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2666
2667 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
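 // Hence the 'SrcSize << 16' immediates built below request offset 0 with a
 // width of SrcSize bits.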
2668 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2669 // We need a 64-bit register source, but the high bits don't matter.
2670 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2671 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2672 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2673
2674 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2675 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2676 .addReg(SrcReg, 0, SubReg)
2677 .addImm(AMDGPU::sub0)
2678 .addReg(UndefReg)
2679 .addImm(AMDGPU::sub1);
2680
2681 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2682 .addReg(ExtReg)
2683 .addImm(SrcSize << 16);
2684
2685 I.eraseFromParent();
2686 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2687 }
2688
2689 unsigned Mask;
2690 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2691 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2692 .addReg(SrcReg)
2693 .addImm(Mask)
2694 .setOperandDead(3); // Dead scc
2695 } else {
2696 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2697 .addReg(SrcReg)
2698 .addImm(SrcSize << 16);
2699 }
2700
2701 I.eraseFromParent();
2702 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2703 }
2704
2705 return false;
2706}
2707
2708static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2709 return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
2710}
2711
2712static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2713 Register BitcastSrc;
2714 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2715 Reg = BitcastSrc;
2716 return Reg;
2717}
2718
2719static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2720 Register &Out) {
2721 Register Trunc;
2722 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2723 return false;
2724
2725 Register LShlSrc;
2726 Register Cst;
2727 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2728 Cst = stripCopy(Cst, MRI);
2729 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2730 Out = stripBitCast(LShlSrc, MRI);
2731 return true;
2732 }
2733 }
2734
2735 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2736 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2737 return false;
2738
2739 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2740 LLT::fixed_vector(2, 16));
2741
2742 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2743 assert(Mask.size() == 2);
2744
2745 if (Mask[0] == 1 && Mask[1] <= 1) {
2746 Out = Shuffle->getOperand(0).getReg();
2747 return true;
2748 }
2749
2750 return false;
2751}
2752
2753bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2754 if (!Subtarget->hasSALUFloatInsts())
2755 return false;
2756
2757 Register Dst = I.getOperand(0).getReg();
2758 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2759 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2760 return false;
2761
2762 Register Src = I.getOperand(1).getReg();
2763
2764 if (MRI->getType(Dst) == LLT::scalar(32) &&
2765 MRI->getType(Src) == LLT::scalar(16)) {
2766 if (isExtractHiElt(*MRI, Src, Src)) {
2767 MachineBasicBlock *BB = I.getParent();
2768 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2769 .addUse(Src);
2770 I.eraseFromParent();
2771 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2772 }
2773 }
2774
2775 return false;
2776}
2777
2778bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2779 // Only manually handle the f64 SGPR case.
2780 //
2781 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2782 // the bit ops theoretically have a second result due to the implicit def of
2783 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2784 // that is easy by disabling the check. The result works, but uses a
2785 // nonsensical sreg32orlds_and_sreg_1 regclass.
2786 //
2787 // The DAG emitter is more problematic, and incorrectly adds both results of
2788 // the S_XOR_B32 to the variadic REG_SEQUENCE operands.
2789
2790 Register Dst = MI.getOperand(0).getReg();
2791 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2792 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2793 MRI->getType(Dst) != LLT::scalar(64))
2794 return false;
2795
2796 Register Src = MI.getOperand(1).getReg();
2797 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2798 if (Fabs)
2799 Src = Fabs->getOperand(1).getReg();
2800
2801 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2802 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2803 return false;
2804
2805 MachineBasicBlock *BB = MI.getParent();
2806 const DebugLoc &DL = MI.getDebugLoc();
2807 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2808 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2809 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2810 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2811
2812 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2813 .addReg(Src, 0, AMDGPU::sub0);
2814 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2815 .addReg(Src, 0, AMDGPU::sub1);
2816 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2817 .addImm(0x80000000);
2818
2819 // Set or toggle sign bit.
2820 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2821 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2822 .addReg(HiReg)
2823 .addReg(ConstReg)
2824 .setOperandDead(3); // Dead scc
2825 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2826 .addReg(LoReg)
2827 .addImm(AMDGPU::sub0)
2828 .addReg(OpReg)
2829 .addImm(AMDGPU::sub1);
2830 MI.eraseFromParent();
2831 return true;
2832}
2833
2834// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2835bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2836 Register Dst = MI.getOperand(0).getReg();
2837 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2838 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2839 MRI->getType(Dst) != LLT::scalar(64))
2840 return false;
2841
2842 Register Src = MI.getOperand(1).getReg();
2843 MachineBasicBlock *BB = MI.getParent();
2844 const DebugLoc &DL = MI.getDebugLoc();
2845 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2846 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2847 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2848 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2849
2850 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2851 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2852 return false;
2853
2854 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2855 .addReg(Src, 0, AMDGPU::sub0);
2856 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2857 .addReg(Src, 0, AMDGPU::sub1);
2858 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2859 .addImm(0x7fffffff);
2860
2861 // Clear sign bit.
2862 // TODO: Should this use S_BITSET0_*?
2863 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2864 .addReg(HiReg)
2865 .addReg(ConstReg)
2866 .setOperandDead(3); // Dead scc
2867 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2868 .addReg(LoReg)
2869 .addImm(AMDGPU::sub0)
2870 .addReg(OpReg)
2871 .addImm(AMDGPU::sub1);
2872
2873 MI.eraseFromParent();
2874 return true;
2875}
2876
2877static bool isConstant(const MachineInstr &MI) {
2878 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2879}
2880
2881void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2882 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2883
2884 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2885 const MachineInstr *PtrMI =
2886 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2887
2888 assert(PtrMI);
2889
2890 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2891 return;
2892
2893 GEPInfo GEPInfo;
2894
2895 for (unsigned i = 1; i != 3; ++i) {
2896 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2897 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2898 assert(OpDef);
2899 if (i == 2 && isConstant(*OpDef)) {
2900 // TODO: Could handle constant base + variable offset, but a combine
2901 // probably should have commuted it.
2902 assert(GEPInfo.Imm == 0);
2903 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2904 continue;
2905 }
2906 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2907 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2908 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2909 else
2910 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2911 }
2912
2913 AddrInfo.push_back(GEPInfo);
2914 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2915}
2916
2917bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2918 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2919}
2920
2921bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2922 if (!MI.hasOneMemOperand())
2923 return false;
2924
2925 const MachineMemOperand *MMO = *MI.memoperands_begin();
2926 const Value *Ptr = MMO->getValue();
2927
2928 // UndefValue means this is a load of a kernel input. These are uniform.
2929 // Sometimes LDS instructions have constant pointers.
2930 // If Ptr is null, then that means this mem operand contains a
2931 // PseudoSourceValue like GOT.
2932 if (!Ptr || isa<UndefValue, Argument, Constant, GlobalValue>(Ptr))
2933 return true;
2934
2935 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2936 return true;
2937
2938 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2939 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2940 AMDGPU::SGPRRegBankID;
2941
2942 const Instruction *I = dyn_cast<Instruction>(Ptr);
2943 return I && I->getMetadata("amdgpu.uniform");
2944}
2945
2946bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2947 for (const GEPInfo &GEPInfo : AddrInfo) {
2948 if (!GEPInfo.VgprParts.empty())
2949 return true;
2950 }
2951 return false;
2952}
2953
2954void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2955 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2956 unsigned AS = PtrTy.getAddressSpace();
2957 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2958 STI.ldsRequiresM0Init()) {
2959 MachineBasicBlock *BB = I.getParent();
2960
2961 // If DS instructions require M0 initialization, insert it before selecting.
2962 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2963 .addImm(-1);
2964 }
2965}
2966
2967bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2968 MachineInstr &I) const {
2969 initM0(I);
2970 return selectImpl(I, *CoverageInfo);
2971}
2972
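// Returns true if Reg is (possibly through copies and bitwise ops) the result
// of a VALU comparison or an amdgcn.class intrinsic, i.e. a lane mask that
// already respects exec, so selectG_BRCOND can skip the explicit AND with exec.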
2973static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2974 if (Reg.isPhysical())
2975 return false;
2976
2977 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2978 const unsigned Opcode = MI.getOpcode();
2979
2980 if (Opcode == AMDGPU::COPY)
2981 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2982
2983 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2984 Opcode == AMDGPU::G_XOR)
2985 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2986 isVCmpResult(MI.getOperand(2).getReg(), MRI);
2987
2988 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
2989 return GI->is(Intrinsic::amdgcn_class);
2990
2991 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2992}
2993
2994bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2995 MachineBasicBlock *BB = I.getParent();
2996 MachineOperand &CondOp = I.getOperand(0);
2997 Register CondReg = CondOp.getReg();
2998 const DebugLoc &DL = I.getDebugLoc();
2999
3000 unsigned BrOpcode;
3001 Register CondPhysReg;
3002 const TargetRegisterClass *ConstrainRC;
3003
3004 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
3005 // whether the branch is uniform when selecting the instruction. In
3006 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
3007 // RegBankSelect knows what it's doing if the branch condition is scc, even
3008 // though it currently does not.
3009 if (!isVCC(CondReg, *MRI)) {
3010 if (MRI->getType(CondReg) != LLT::scalar(32))
3011 return false;
3012
3013 CondPhysReg = AMDGPU::SCC;
3014 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3015 ConstrainRC = &AMDGPU::SReg_32RegClass;
3016 } else {
3017 // FIXME: Should scc->vcc copies be ANDed with exec?
3018
3019 // Unless the value of CondReg is a result of a V_CMP* instruction then we
3020 // need to insert an and with exec.
3021 if (!isVCmpResult(CondReg, *MRI)) {
3022 const bool Is64 = STI.isWave64();
3023 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3024 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3025
3026 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
3027 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
3028 .addReg(CondReg)
3029 .addReg(Exec)
3030 .setOperandDead(3); // Dead scc
3031 CondReg = TmpReg;
3032 }
3033
3034 CondPhysReg = TRI.getVCC();
3035 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3036 ConstrainRC = TRI.getBoolRC();
3037 }
3038
3039 if (!MRI->getRegClassOrNull(CondReg))
3040 MRI->setRegClass(CondReg, ConstrainRC);
3041
3042 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
3043 .addReg(CondReg);
3044 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
3045 .addMBB(I.getOperand(1).getMBB());
3046
3047 I.eraseFromParent();
3048 return true;
3049}
3050
3051bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3052 MachineInstr &I) const {
3053 Register DstReg = I.getOperand(0).getReg();
3054 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3055 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3056 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3057 if (IsVGPR)
3058 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3059
3060 return RBI.constrainGenericRegister(
3061 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3062}
3063
3064bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3065 Register DstReg = I.getOperand(0).getReg();
3066 Register SrcReg = I.getOperand(1).getReg();
3067 Register MaskReg = I.getOperand(2).getReg();
3068 LLT Ty = MRI->getType(DstReg);
3069 LLT MaskTy = MRI->getType(MaskReg);
3070 MachineBasicBlock *BB = I.getParent();
3071 const DebugLoc &DL = I.getDebugLoc();
3072
3073 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3074 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3075 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
3076 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3077 if (DstRB != SrcRB) // Should only happen for hand written MIR.
3078 return false;
3079
3080 // Try to avoid emitting a bit operation when we only need to touch half of
3081 // the 64-bit pointer.
3082 APInt MaskOnes = VT->getKnownOnes(MaskReg).zext(64);
3083 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
3084 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
3085
3086 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3087 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
3088
3089 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3090 !CanCopyLow32 && !CanCopyHi32) {
3091 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
3092 .addReg(SrcReg)
3093 .addReg(MaskReg)
3094 .setOperandDead(3); // Dead scc
3095 I.eraseFromParent();
3096 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3097 }
3098
3099 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3100 const TargetRegisterClass &RegRC
3101 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3102
3103 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3104 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3105 const TargetRegisterClass *MaskRC =
3106 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3107
3108 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3109 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3110 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3111 return false;
3112
3113 if (Ty.getSizeInBits() == 32) {
3114 assert(MaskTy.getSizeInBits() == 32 &&
3115 "ptrmask should have been narrowed during legalize");
3116
3117 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3118 .addReg(SrcReg)
3119 .addReg(MaskReg);
3120
3121 if (!IsVGPR)
3122 NewOp.setOperandDead(3); // Dead scc
3123 I.eraseFromParent();
3124 return true;
3125 }
3126
3127 Register HiReg = MRI->createVirtualRegister(&RegRC);
3128 Register LoReg = MRI->createVirtualRegister(&RegRC);
3129
3130 // Extract the subregisters from the source pointer.
3131 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3132 .addReg(SrcReg, 0, AMDGPU::sub0);
3133 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3134 .addReg(SrcReg, 0, AMDGPU::sub1);
3135
3136 Register MaskedLo, MaskedHi;
3137
3138 if (CanCopyLow32) {
3139 // If all the bits in the low half are 1, we only need a copy for it.
3140 MaskedLo = LoReg;
3141 } else {
3142 // Extract the mask subregister and apply the and.
3143 Register MaskLo = MRI->createVirtualRegister(&RegRC);
3144 MaskedLo = MRI->createVirtualRegister(&RegRC);
3145
3146 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3147 .addReg(MaskReg, 0, AMDGPU::sub0);
3148 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3149 .addReg(LoReg)
3150 .addReg(MaskLo);
3151 }
3152
3153 if (CanCopyHi32) {
3154 // If all the bits in the high half are 1, we only need a copy for it.
3155 MaskedHi = HiReg;
3156 } else {
3157 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3158 MaskedHi = MRI->createVirtualRegister(&RegRC);
3159
3160 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3161 .addReg(MaskReg, 0, AMDGPU::sub1);
3162 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3163 .addReg(HiReg)
3164 .addReg(MaskHi);
3165 }
3166
3167 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3168 .addReg(MaskedLo)
3169 .addImm(AMDGPU::sub0)
3170 .addReg(MaskedHi)
3171 .addImm(AMDGPU::sub1);
3172 I.eraseFromParent();
3173 return true;
3174}
3175
3176/// Return the register to use for the index value, and the subregister to use
3177/// for the indirectly accessed register.
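/// For example, indexing 32-bit elements of a 128-bit register with an index
/// known to be 'base + 2' yields the base index register paired with
/// AMDGPU::sub2; constant offsets past the end of the register fall back to
/// the original index register and the first subregister.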
3178static std::pair<Register, unsigned>
3179computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3180 const TargetRegisterClass *SuperRC, Register IdxReg,
3181 unsigned EltSize, GISelValueTracking &ValueTracking) {
3182 Register IdxBaseReg;
3183 int Offset;
3184
3185 std::tie(IdxBaseReg, Offset) =
3186 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &ValueTracking);
3187 if (IdxBaseReg == AMDGPU::NoRegister) {
3188 // This will happen if the index is a known constant. This should ordinarily
3189 // be legalized out, but handle it as a register just in case.
3190 assert(Offset == 0);
3191 IdxBaseReg = IdxReg;
3192 }
3193
3194 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3195
3196 // Skip out of bounds offsets, or else we would end up using an undefined
3197 // register.
3198 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3199 return std::pair(IdxReg, SubRegs[0]);
3200 return std::pair(IdxBaseReg, SubRegs[Offset]);
3201}
3202
3203bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3204 MachineInstr &MI) const {
3205 Register DstReg = MI.getOperand(0).getReg();
3206 Register SrcReg = MI.getOperand(1).getReg();
3207 Register IdxReg = MI.getOperand(2).getReg();
3208
3209 LLT DstTy = MRI->getType(DstReg);
3210 LLT SrcTy = MRI->getType(SrcReg);
3211
3212 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3213 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3214 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3215
3216 // The index must be scalar. If it wasn't, RegBankSelect should have moved this
3217 // into a waterfall loop.
3218 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3219 return false;
3220
3221 const TargetRegisterClass *SrcRC =
3222 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3223 const TargetRegisterClass *DstRC =
3224 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3225 if (!SrcRC || !DstRC)
3226 return false;
3227 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3228 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3229 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3230 return false;
3231
3232 MachineBasicBlock *BB = MI.getParent();
3233 const DebugLoc &DL = MI.getDebugLoc();
3234 const bool Is64 = DstTy.getSizeInBits() == 64;
3235
3236 unsigned SubReg;
3237 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3238 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *VT);
3239
3240 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3241 if (DstTy.getSizeInBits() != 32 && !Is64)
3242 return false;
3243
3244 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3245 .addReg(IdxReg);
3246
3247 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3248 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3249 .addReg(SrcReg, 0, SubReg)
3250 .addReg(SrcReg, RegState::Implicit);
3251 MI.eraseFromParent();
3252 return true;
3253 }
3254
3255 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3256 return false;
3257
3258 if (!STI.useVGPRIndexMode()) {
3259 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3260 .addReg(IdxReg);
3261 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3262 .addReg(SrcReg, 0, SubReg)
3263 .addReg(SrcReg, RegState::Implicit);
3264 MI.eraseFromParent();
3265 return true;
3266 }
3267
3268 const MCInstrDesc &GPRIDXDesc =
3269 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3270 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3271 .addReg(SrcReg)
3272 .addReg(IdxReg)
3273 .addImm(SubReg);
3274
3275 MI.eraseFromParent();
3276 return true;
3277}
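// Illustrative expansion (editorial sketch, not from the original source):
// a 32-bit extract with an SGPR source and index is selected above as
//   $m0 = COPY %idx
//   %dst = S_MOVRELS_B32 %src.subN, implicit %src
// where subN encodes the constant part of the index folded by
// computeIndirectRegIndex and M0 supplies the dynamic part.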
3278
3279// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3280bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3281 MachineInstr &MI) const {
3282 Register DstReg = MI.getOperand(0).getReg();
3283 Register VecReg = MI.getOperand(1).getReg();
3284 Register ValReg = MI.getOperand(2).getReg();
3285 Register IdxReg = MI.getOperand(3).getReg();
3286
3287 LLT VecTy = MRI->getType(DstReg);
3288 LLT ValTy = MRI->getType(ValReg);
3289 unsigned VecSize = VecTy.getSizeInBits();
3290 unsigned ValSize = ValTy.getSizeInBits();
3291
3292 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3293 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3294 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3295
3296 assert(VecTy.getElementType() == ValTy);
3297
3298 // The index must be scalar. If it wasn't, RegBankSelect should have moved this
3299 // into a waterfall loop.
3300 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3301 return false;
3302
3303 const TargetRegisterClass *VecRC =
3304 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3305 const TargetRegisterClass *ValRC =
3306 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3307
3308 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3309 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3310 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3311 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3312 return false;
3313
3314 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3315 return false;
3316
3317 unsigned SubReg;
3318 std::tie(IdxReg, SubReg) =
3319 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *VT);
3320
3321 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3322 STI.useVGPRIndexMode();
3323
3324 MachineBasicBlock *BB = MI.getParent();
3325 const DebugLoc &DL = MI.getDebugLoc();
3326
3327 if (!IndexMode) {
3328 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3329 .addReg(IdxReg);
3330
3331 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3332 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3333 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3334 .addReg(VecReg)
3335 .addReg(ValReg)
3336 .addImm(SubReg);
3337 MI.eraseFromParent();
3338 return true;
3339 }
3340
3341 const MCInstrDesc &GPRIDXDesc =
3342 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3343 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3344 .addReg(VecReg)
3345 .addReg(ValReg)
3346 .addReg(IdxReg)
3347 .addImm(SubReg);
3348
3349 MI.eraseFromParent();
3350 return true;
3351}
3352
3353bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3354 if (!Subtarget->hasVMemToLDSLoad())
3355 return false;
3356 unsigned Opc;
3357 unsigned Size = MI.getOperand(3).getImm();
3358
3359 // The struct intrinsic variants add one additional operand over raw.
3360 const bool HasVIndex = MI.getNumOperands() == 9;
3361 Register VIndex;
3362 int OpOffset = 0;
3363 if (HasVIndex) {
3364 VIndex = MI.getOperand(4).getReg();
3365 OpOffset = 1;
3366 }
3367
3368 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3369 std::optional<ValueAndVReg> MaybeVOffset =
3370 getIConstantVRegValWithLookThrough(VOffset, *MRI);
3371 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3372
3373 switch (Size) {
3374 default:
3375 return false;
3376 case 1:
3377 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3378 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3379 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3380 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3381 break;
3382 case 2:
3383 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3384 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3385 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3386 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3387 break;
3388 case 4:
3389 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3390 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3391 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3392 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3393 break;
3394 case 12:
3395 if (!Subtarget->hasLDSLoadB96_B128())
3396 return false;
3397
3398 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3399 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3400 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3401 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3402 break;
3403 case 16:
3404 if (!Subtarget->hasLDSLoadB96_B128())
3405 return false;
3406
3407 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3408 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3409 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3410 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3411 break;
3412 }
3413
3414 MachineBasicBlock *MBB = MI.getParent();
3415 const DebugLoc &DL = MI.getDebugLoc();
3416 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3417 .add(MI.getOperand(2));
3418
3419 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3420
3421 if (HasVIndex && HasVOffset) {
3422 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3423 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3424 .addReg(VIndex)
3425 .addImm(AMDGPU::sub0)
3426 .addReg(VOffset)
3427 .addImm(AMDGPU::sub1);
3428
3429 MIB.addReg(IdxReg);
3430 } else if (HasVIndex) {
3431 MIB.addReg(VIndex);
3432 } else if (HasVOffset) {
3433 MIB.addReg(VOffset);
3434 }
3435
3436 MIB.add(MI.getOperand(1)); // rsrc
3437 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3438 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3439 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3440 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3441 MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3442 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3443 MIB.addImm(
3444 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3445 ? 1
3446 : 0); // swz
3447
3448 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3449 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3450 LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3451 MachinePointerInfo StorePtrI = LoadPtrI;
3452 StorePtrI.V = nullptr;
3453 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3454
3455 auto F = LoadMMO->getFlags() &
3456 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3457 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3458 Size, LoadMMO->getBaseAlign());
3459
3460 MachineMemOperand *StoreMMO =
3461 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3462 sizeof(int32_t), LoadMMO->getBaseAlign());
3463
3464 MIB.setMemRefs({LoadMMO, StoreMMO});
3465
3466 MI.eraseFromParent();
3467 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3468}
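// Illustrative example (editorial): a struct buffer-load-to-LDS with
// Size == 4, a vindex and a non-zero voffset selects
// BUFFER_LOAD_DWORD_LDS_BOTHEN; vindex and voffset are packed into a 64-bit
// REG_SEQUENCE address operand, and M0 carries the LDS destination base taken
// from operand 2.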
3469
3470/// Match a zero extend from a 32-bit value to 64-bits.
3471Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
3472 Register ZExtSrc;
3473 if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
3474 return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3475
3476 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3477 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3478 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3479 return Register();
3480
3481 assert(Def->getNumOperands() == 3 &&
3482 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3483 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
3484 return Def->getOperand(1).getReg();
3485 }
3486
3487 return Register();
3488}
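// Example of the legalized form matched above (editorial):
//   %zero:_(s32) = G_CONSTANT i32 0
//   %reg:_(s64) = G_MERGE_VALUES %x:_(s32), %zero:_(s32)
// matchZeroExtendFromS32(%reg) returns %x.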
3489
3490/// Match a sign extend from a 32-bit value to 64-bits.
3491Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
3492 Register SExtSrc;
3493 if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
3494 return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
3495
3496 // Match legalized form %sext = G_MERGE_VALUES (s32 %x), (G_ASHR %x, 31)
3497 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3498 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3499 return Register();
3500
3501 assert(Def->getNumOperands() == 3 &&
3502 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3503 if (mi_match(Def->getOperand(2).getReg(), *MRI,
3504 m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
3505 m_SpecificICst(31))))
3506 return Def->getOperand(1).getReg();
3507
3508 if (VT->signBitIsZero(Reg))
3509 return matchZeroExtendFromS32(Reg);
3510
3511 return Register();
3512}
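// Example of the legalized form matched above (editorial):
//   %c31:_(s32) = G_CONSTANT i32 31
//   %hi:_(s32) = G_ASHR %x:_(s32), %c31:_(s32)
//   %reg:_(s64) = G_MERGE_VALUES %x:_(s32), %hi:_(s32)
// returns %x; if value tracking proves the sign bit of %reg is zero, the
// zero-extend matcher above is reused instead.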
3513
3514/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
3515/// is 32-bit.
3516Register
3517AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
3518 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3519 : matchZeroExtendFromS32(Reg);
3520}
3521
3522/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
3523/// is 32-bit.
3524Register
3525AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
3526 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3527 : matchSignExtendFromS32(Reg);
3528}
3529
3530Register
3531AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
3532 bool IsSigned) const {
3533 if (IsSigned)
3534 return matchSignExtendFromS32OrS32(Reg);
3535
3536 return matchZeroExtendFromS32OrS32(Reg);
3537}
3538
3539Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
3540 Register AnyExtSrc;
3541 if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
3542 return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? AnyExtSrc : Register();
3543
3544 // Match legalized form %anyext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
3545 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3546 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3547 return Register();
3548
3549 assert(Def->getNumOperands() == 3 &&
3550 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3551
3552 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef()))
3553 return Def->getOperand(1).getReg();
3554
3555 return Register();
3556}
3557
3558bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3559 if (!Subtarget->hasVMemToLDSLoad())
3560 return false;
3561
3562 unsigned Opc;
3563 unsigned Size = MI.getOperand(3).getImm();
3564
3565 switch (Size) {
3566 default:
3567 return false;
3568 case 1:
3569 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3570 break;
3571 case 2:
3572 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3573 break;
3574 case 4:
3575 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3576 break;
3577 case 12:
3578 if (!Subtarget->hasLDSLoadB96_B128())
3579 return false;
3580 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3581 break;
3582 case 16:
3583 if (!Subtarget->hasLDSLoadB96_B128())
3584 return false;
3585 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3586 break;
3587 }
3588
3589 MachineBasicBlock *MBB = MI.getParent();
3590 const DebugLoc &DL = MI.getDebugLoc();
3591 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3592 .add(MI.getOperand(2));
3593
3594 Register Addr = MI.getOperand(1).getReg();
3595 Register VOffset;
3596 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3597 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3598 if (!isSGPR(Addr)) {
3599 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3600 if (isSGPR(AddrDef->Reg)) {
3601 Addr = AddrDef->Reg;
3602 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3603 Register SAddr =
3604 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3605 if (isSGPR(SAddr)) {
3606 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3607 if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
3608 Addr = SAddr;
3609 VOffset = Off;
3610 }
3611 }
3612 }
3613 }
3614
3615 if (isSGPR(Addr)) {
3616 Opc = AMDGPU::getGlobalSaddrOp(Opc);
3617 if (!VOffset) {
3618 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3619 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3620 .addImm(0);
3621 }
3622 }
3623
3624 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3625 .addReg(Addr);
3626
3627 if (isSGPR(Addr))
3628 MIB.addReg(VOffset);
3629
3630 MIB.add(MI.getOperand(4)) // offset
3631 .add(MI.getOperand(5)); // cpol
3632
3633 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3634 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3635 LoadPtrI.Offset = MI.getOperand(4).getImm();
3636 MachinePointerInfo StorePtrI = LoadPtrI;
3637 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
3638 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3639 auto F = LoadMMO->getFlags() &
3640 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3641 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3642 Size, LoadMMO->getBaseAlign());
3643 MachineMemOperand *StoreMMO =
3644 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3645 sizeof(int32_t), Align(4));
3646
3647 MIB.setMemRefs({LoadMMO, StoreMMO});
3648
3649 MI.eraseFromParent();
3650 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3651}
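// Illustrative example (editorial): for
//   %off64:_(s64) = G_ZEXT %voff:_(s32)
//   %addr:_(p1) = G_PTR_ADD %sgpr_base, %off64
// the code above keeps %sgpr_base as the scalar address and %voff as the VGPR
// offset; a pure SGPR address with no offset instead gets a zero V_MOV_B32
// voffset so the scalar-address form still has a vector operand.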
3652
3653bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3654 MachineInstr &MI) const {
3655 unsigned OpcodeOpIdx =
3656 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3657 MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
3658 MI.removeOperand(OpcodeOpIdx);
3659 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3660 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3661}
3662
3663// FIXME: This should be removed and let the patterns select. We just need the
3664// AGPR/VGPR combination versions.
3665bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3666 unsigned Opc;
3667 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3668 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3669 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3670 break;
3671 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3672 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3673 break;
3674 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3675 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3676 break;
3677 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3678 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3679 break;
3680 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3681 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3682 break;
3683 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3684 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3685 break;
3686 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3687 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3688 break;
3689 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3690 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3691 break;
3692 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3693 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3694 break;
3695 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3696 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3697 break;
3698 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3699 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3700 break;
3701 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3702 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3703 break;
3704 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3705 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3706 break;
3707 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3708 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3709 break;
3710 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3711 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3712 break;
3713 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3714 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3715 break;
3716 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3717 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3718 break;
3719 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3720 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3721 break;
3722 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3723 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3724 break;
3725 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3726 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3727 break;
3728 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3729 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3730 break;
3731 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3732 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3733 break;
3734 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3735 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3736 break;
3737 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3738 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3739 break;
3740 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
3741 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
3742 break;
3743 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
3744 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
3745 break;
3746 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
3747 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
3748 break;
3749 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
3750 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
3751 break;
3752 default:
3753 llvm_unreachable("unhandled smfmac intrinsic");
3754 }
3755
3756 auto VDst_In = MI.getOperand(4);
3757
3758 MI.setDesc(TII.get(Opc));
3759 MI.removeOperand(4); // VDst_In
3760 MI.removeOperand(1); // Intrinsic ID
3761 MI.addOperand(VDst_In); // Readd VDst_In to the end
3762 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3763 return true;
3764}
3765
3766bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3767 MachineInstr &MI, Intrinsic::ID IntrID) const {
3768 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3769 !Subtarget->hasPermlane16Swap())
3770 return false;
3771 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3772 !Subtarget->hasPermlane32Swap())
3773 return false;
3774
3775 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3776 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3777 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3778
3779 MI.removeOperand(2);
3780 MI.setDesc(TII.get(Opcode));
3781 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3782
3783 MachineOperand &FI = MI.getOperand(4);
3784 FI.setImm(FI.getImm() ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0);
3785
3786 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3787}
3788
3789bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3790 Register DstReg = MI.getOperand(0).getReg();
3791 Register SrcReg = MI.getOperand(1).getReg();
3792 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3793 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3794 MachineBasicBlock *MBB = MI.getParent();
3795 const DebugLoc &DL = MI.getDebugLoc();
3796
3797 if (IsVALU) {
3798 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3799 .addImm(Subtarget->getWavefrontSizeLog2())
3800 .addReg(SrcReg);
3801 } else {
3802 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3803 .addReg(SrcReg)
3804 .addImm(Subtarget->getWavefrontSizeLog2())
3805 .setOperandDead(3); // Dead scc
3806 }
3807
3808 const TargetRegisterClass &RC =
3809 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3810 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3811 return false;
3812
3813 MI.eraseFromParent();
3814 return true;
3815}
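// Illustrative expansion (editorial): on a wave64 subtarget an SGPR
// destination becomes
//   %dst = S_LSHR_B32 %src, 6
// and a VGPR destination becomes V_LSHRREV_B32 6, %src, i.e. the source is
// shifted right by the wavefront-size log2 (6 for wave64).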
3816
3817// Match a BITOP3 operation and return the number of matched instructions plus
3818// the truth table.
3819static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
3820 SmallVectorImpl<Register> &Src,
3821 const MachineRegisterInfo &MRI) {
3822 unsigned NumOpcodes = 0;
3823 uint8_t LHSBits, RHSBits;
3824
3825 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
3826 // Define truth table given Src0, Src1, Src2 bits permutations:
3827 // 0 0 0
3828 // 0 0 1
3829 // 0 1 0
3830 // 0 1 1
3831 // 1 0 0
3832 // 1 0 1
3833 // 1 1 0
3834 // 1 1 1
3835 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
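// Worked example (editorial note): with Src = {a, b, c} each mask above is
// that operand's column of the 3-input truth table, so a tree like
// (a & b) | c evaluates to (0xf0 & 0xcc) | 0xaa = 0xea, which is the truth
// table value ultimately returned by BitOp3_Op.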
3836
3837 if (mi_match(Op, MRI, m_AllOnesInt())) {
3838 Bits = 0xff;
3839 return true;
3840 }
3841 if (mi_match(Op, MRI, m_ZeroInt())) {
3842 Bits = 0;
3843 return true;
3844 }
3845
3846 for (unsigned I = 0; I < Src.size(); ++I) {
3847 // Try to find existing reused operand
3848 if (Src[I] == Op) {
3849 Bits = SrcBits[I];
3850 return true;
3851 }
3852 // Try to replace parent operator
3853 if (Src[I] == R) {
3854 Bits = SrcBits[I];
3855 Src[I] = Op;
3856 return true;
3857 }
3858 }
3859
3860 if (Src.size() == 3) {
3861 // No room left for operands. Try one last time, there can be a 'not' of
3862 // one of our source operands. In this case we can compute the bits
3863 // without growing Src vector.
3864 Register LHS;
3865 if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
3866 LHS = getSrcRegIgnoringCopies(LHS, MRI);
3867 for (unsigned I = 0; I < Src.size(); ++I) {
3868 if (Src[I] == LHS) {
3869 Bits = ~SrcBits[I];
3870 return true;
3871 }
3872 }
3873 }
3874
3875 return false;
3876 }
3877
3878 Bits = SrcBits[Src.size()];
3879 Src.push_back(Op);
3880 return true;
3881 };
3882
3883 MachineInstr *MI = MRI.getVRegDef(R);
3884 switch (MI->getOpcode()) {
3885 case TargetOpcode::G_AND:
3886 case TargetOpcode::G_OR:
3887 case TargetOpcode::G_XOR: {
3888 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
3889 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
3890
3891 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
3892 if (!getOperandBits(LHS, LHSBits) ||
3893 !getOperandBits(RHS, RHSBits)) {
3894 Src = Backup;
3895 return std::make_pair(0, 0);
3896 }
3897
3898 // Recursion is naturally limited by the size of the operand vector.
3899 auto Op = BitOp3_Op(LHS, Src, MRI);
3900 if (Op.first) {
3901 NumOpcodes += Op.first;
3902 LHSBits = Op.second;
3903 }
3904
3905 Op = BitOp3_Op(RHS, Src, MRI);
3906 if (Op.first) {
3907 NumOpcodes += Op.first;
3908 RHSBits = Op.second;
3909 }
3910 break;
3911 }
3912 default:
3913 return std::make_pair(0, 0);
3914 }
3915
3916 uint8_t TTbl;
3917 switch (MI->getOpcode()) {
3918 case TargetOpcode::G_AND:
3919 TTbl = LHSBits & RHSBits;
3920 break;
3921 case TargetOpcode::G_OR:
3922 TTbl = LHSBits | RHSBits;
3923 break;
3924 case TargetOpcode::G_XOR:
3925 TTbl = LHSBits ^ RHSBits;
3926 break;
3927 default:
3928 break;
3929 }
3930
3931 return std::make_pair(NumOpcodes + 1, TTbl);
3932}
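// Editorial example: for %r = G_XOR (G_AND %a, %b), %c the walk above returns
// {2, 0x6a} with Src = {%a, %b, %c}: two matched opcodes and the table
// (0xf0 & 0xcc) ^ 0xaa = 0x6a.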
3933
3934bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
3935 if (!Subtarget->hasBitOp3Insts())
3936 return false;
3937
3938 Register DstReg = MI.getOperand(0).getReg();
3939 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3940 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3941 if (!IsVALU)
3942 return false;
3943
3945 uint8_t TTbl;
3946 unsigned NumOpcodes;
3947
3948 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
3949
3950 // The Src.empty() case can happen if all operands are all zeros or all ones.
3951 // Normally this should have been optimized out before reaching this point.
3952 if (NumOpcodes < 2 || Src.empty())
3953 return false;
3954
3955 const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
3956 if (NumOpcodes == 2 && IsB32) {
3957 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
3958 // asm more readable. This cannot be modeled with AddedComplexity because
3959 // the selector does not know how many operations we matched.
3960 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
3961 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
3962 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
3963 return false;
3964 } else if (NumOpcodes < 4) {
3965 // For a uniform case the threshold should be higher to account for moves
3966 // between VGPRs and SGPRs. It needs one operand in a VGPR, the other two can
3967 // be in SGPRs, with a readfirstlane after.
3968 return false;
3969 }
3970
3971 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
3972 if (!IsB32 && STI.hasTrue16BitInsts())
3973 Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
3974 : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
3975 unsigned CBL = STI.getConstantBusLimit(Opc);
3976 MachineBasicBlock *MBB = MI.getParent();
3977 const DebugLoc &DL = MI.getDebugLoc();
3978
3979 for (unsigned I = 0; I < Src.size(); ++I) {
3980 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
3981 if (RB->getID() != AMDGPU::SGPRRegBankID)
3982 continue;
3983 if (CBL > 0) {
3984 --CBL;
3985 continue;
3986 }
3987 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3988 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
3989 .addReg(Src[I]);
3990 Src[I] = NewReg;
3991 }
3992
3993 // Last operand can be ignored, turning a ternary operation into a binary.
3994 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
3995 // 'c' with 'a' here without changing the answer. In some pathological
3996 // cases it should be possible to get an operation with a single operand
3997 // too if the optimizer did not catch it.
3998 while (Src.size() < 3)
3999 Src.push_back(Src[0]);
4000
4001 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
4002 if (!IsB32)
4003 MIB.addImm(0); // src_mod0
4004 MIB.addReg(Src[0]);
4005 if (!IsB32)
4006 MIB.addImm(0); // src_mod1
4007 MIB.addReg(Src[1]);
4008 if (!IsB32)
4009 MIB.addImm(0); // src_mod2
4010 MIB.addReg(Src[2])
4011 .addImm(TTbl);
4012 if (!IsB32)
4013 MIB.addImm(0); // op_sel
4014
4015 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
4016 MI.eraseFromParent();
4017
4018 return true;
4019}
4020
4021bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
4022 Register SrcReg = MI.getOperand(0).getReg();
4023 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
4024 return false;
4025
4026 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
4027 Register SP =
4028 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
4029 Register WaveAddr = getWaveAddress(DefMI);
4030 MachineBasicBlock *MBB = MI.getParent();
4031 const DebugLoc &DL = MI.getDebugLoc();
4032
4033 if (!WaveAddr) {
4034 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4035 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
4036 .addReg(SrcReg)
4037 .addImm(Subtarget->getWavefrontSizeLog2())
4038 .setOperandDead(3); // Dead scc
4039 }
4040
4041 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
4042 .addReg(WaveAddr);
4043
4044 MI.eraseFromParent();
4045 return true;
4046}
4047
4048bool AMDGPUInstructionSelector::select(MachineInstr &I) {
4050 if (!I.isPreISelOpcode()) {
4051 if (I.isCopy())
4052 return selectCOPY(I);
4053 return true;
4054 }
4055
4056 switch (I.getOpcode()) {
4057 case TargetOpcode::G_AND:
4058 case TargetOpcode::G_OR:
4059 case TargetOpcode::G_XOR:
4060 if (selectBITOP3(I))
4061 return true;
4062 if (selectImpl(I, *CoverageInfo))
4063 return true;
4064 return selectG_AND_OR_XOR(I);
4065 case TargetOpcode::G_ADD:
4066 case TargetOpcode::G_SUB:
4067 case TargetOpcode::G_PTR_ADD:
4068 if (selectImpl(I, *CoverageInfo))
4069 return true;
4070 return selectG_ADD_SUB(I);
4071 case TargetOpcode::G_UADDO:
4072 case TargetOpcode::G_USUBO:
4073 case TargetOpcode::G_UADDE:
4074 case TargetOpcode::G_USUBE:
4075 return selectG_UADDO_USUBO_UADDE_USUBE(I);
4076 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4077 case AMDGPU::G_AMDGPU_MAD_I64_I32:
4078 return selectG_AMDGPU_MAD_64_32(I);
4079 case TargetOpcode::G_INTTOPTR:
4080 case TargetOpcode::G_BITCAST:
4081 case TargetOpcode::G_PTRTOINT:
4082 case TargetOpcode::G_FREEZE:
4083 return selectCOPY(I);
4084 case TargetOpcode::G_FNEG:
4085 if (selectImpl(I, *CoverageInfo))
4086 return true;
4087 return selectG_FNEG(I);
4088 case TargetOpcode::G_FABS:
4089 if (selectImpl(I, *CoverageInfo))
4090 return true;
4091 return selectG_FABS(I);
4092 case TargetOpcode::G_EXTRACT:
4093 return selectG_EXTRACT(I);
4094 case TargetOpcode::G_MERGE_VALUES:
4095 case TargetOpcode::G_CONCAT_VECTORS:
4096 return selectG_MERGE_VALUES(I);
4097 case TargetOpcode::G_UNMERGE_VALUES:
4098 return selectG_UNMERGE_VALUES(I);
4099 case TargetOpcode::G_BUILD_VECTOR:
4100 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4101 return selectG_BUILD_VECTOR(I);
4102 case TargetOpcode::G_IMPLICIT_DEF:
4103 return selectG_IMPLICIT_DEF(I);
4104 case TargetOpcode::G_INSERT:
4105 return selectG_INSERT(I);
4106 case TargetOpcode::G_INTRINSIC:
4107 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4108 return selectG_INTRINSIC(I);
4109 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4110 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4111 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4112 case TargetOpcode::G_ICMP:
4113 case TargetOpcode::G_FCMP:
4114 if (selectG_ICMP_or_FCMP(I))
4115 return true;
4116 return selectImpl(I, *CoverageInfo);
4117 case TargetOpcode::G_LOAD:
4118 case TargetOpcode::G_ZEXTLOAD:
4119 case TargetOpcode::G_SEXTLOAD:
4120 case TargetOpcode::G_STORE:
4121 case TargetOpcode::G_ATOMIC_CMPXCHG:
4122 case TargetOpcode::G_ATOMICRMW_XCHG:
4123 case TargetOpcode::G_ATOMICRMW_ADD:
4124 case TargetOpcode::G_ATOMICRMW_SUB:
4125 case TargetOpcode::G_ATOMICRMW_AND:
4126 case TargetOpcode::G_ATOMICRMW_OR:
4127 case TargetOpcode::G_ATOMICRMW_XOR:
4128 case TargetOpcode::G_ATOMICRMW_MIN:
4129 case TargetOpcode::G_ATOMICRMW_MAX:
4130 case TargetOpcode::G_ATOMICRMW_UMIN:
4131 case TargetOpcode::G_ATOMICRMW_UMAX:
4132 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4133 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4134 case TargetOpcode::G_ATOMICRMW_FADD:
4135 case TargetOpcode::G_ATOMICRMW_FMIN:
4136 case TargetOpcode::G_ATOMICRMW_FMAX:
4137 return selectG_LOAD_STORE_ATOMICRMW(I);
4138 case TargetOpcode::G_SELECT:
4139 return selectG_SELECT(I);
4140 case TargetOpcode::G_TRUNC:
4141 return selectG_TRUNC(I);
4142 case TargetOpcode::G_SEXT:
4143 case TargetOpcode::G_ZEXT:
4144 case TargetOpcode::G_ANYEXT:
4145 case TargetOpcode::G_SEXT_INREG:
4146 // This is a workaround. For extension from type i1, `selectImpl()` uses
4147 // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY, as type
4148 // i1 can only be held in an SGPR class.
4149 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
4150 selectImpl(I, *CoverageInfo))
4151 return true;
4152 return selectG_SZA_EXT(I);
4153 case TargetOpcode::G_FPEXT:
4154 if (selectG_FPEXT(I))
4155 return true;
4156 return selectImpl(I, *CoverageInfo);
4157 case TargetOpcode::G_BRCOND:
4158 return selectG_BRCOND(I);
4159 case TargetOpcode::G_GLOBAL_VALUE:
4160 return selectG_GLOBAL_VALUE(I);
4161 case TargetOpcode::G_PTRMASK:
4162 return selectG_PTRMASK(I);
4163 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4164 return selectG_EXTRACT_VECTOR_ELT(I);
4165 case TargetOpcode::G_INSERT_VECTOR_ELT:
4166 return selectG_INSERT_VECTOR_ELT(I);
4167 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4168 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4169 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4170 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4171 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4172 const AMDGPU::ImageDimIntrinsicInfo *Intr =
4173 AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
4174 assert(Intr && "not an image intrinsic with image pseudo");
4175 return selectImageIntrinsic(I, Intr);
4176 }
4177 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4178 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4179 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4180 return selectBVHIntersectRayIntrinsic(I);
4181 case AMDGPU::G_SBFX:
4182 case AMDGPU::G_UBFX:
4183 return selectG_SBFX_UBFX(I);
4184 case AMDGPU::G_SI_CALL:
4185 I.setDesc(TII.get(AMDGPU::SI_CALL));
4186 return true;
4187 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4188 return selectWaveAddress(I);
4189 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
4190 I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
4191 return true;
4192 }
4193 case AMDGPU::G_STACKRESTORE:
4194 return selectStackRestore(I);
4195 case AMDGPU::G_PHI:
4196 return selectPHI(I);
4197 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4198 return selectCOPY_SCC_VCC(I);
4199 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4200 return selectCOPY_VCC_SCC(I);
4201 case AMDGPU::G_AMDGPU_READANYLANE:
4202 return selectReadAnyLane(I);
4203 case TargetOpcode::G_CONSTANT:
4204 case TargetOpcode::G_FCONSTANT:
4205 default:
4206 return selectImpl(I, *CoverageInfo);
4207 }
4208 return false;
4209}
4210
4211InstructionSelector::ComplexRendererFns
4212AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4213 return {{
4214 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4215 }};
4216
4217}
4218
4219std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4220 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4221 unsigned Mods = 0;
4222 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4223
4224 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4225 Src = MI->getOperand(1).getReg();
4226 Mods |= SISrcMods::NEG;
4227 MI = getDefIgnoringCopies(Src, *MRI);
4228 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4229 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4230 // denormal mode, but we're implicitly canonicalizing in a source operand.
4231 const ConstantFP *LHS =
4232 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4233 if (LHS && LHS->isZero()) {
4234 Mods |= SISrcMods::NEG;
4235 Src = MI->getOperand(2).getReg();
4236 }
4237 }
4238
4239 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4240 Src = MI->getOperand(1).getReg();
4241 Mods |= SISrcMods::ABS;
4242 }
4243
4244 if (OpSel)
4245 Mods |= SISrcMods::OP_SEL_0;
4246
4247 return std::pair(Src, Mods);
4248}
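// Illustrative example (editorial): for %s = G_FNEG (G_FABS %x) this returns
// {%x, SISrcMods::NEG | SISrcMods::ABS} when abs is allowed; the callers below
// emit the returned mods as the src_mods immediate next to the register.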
4249
4250Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4251 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4252 bool ForceVGPR) const {
4253 if ((Mods != 0 || ForceVGPR) &&
4254 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4255
4256 // If we looked through copies to find source modifiers on an SGPR operand,
4257 // we now have an SGPR register source. To avoid potentially violating the
4258 // constant bus restriction, we need to insert a copy to a VGPR.
4259 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4260 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4261 TII.get(AMDGPU::COPY), VGPRSrc)
4262 .addReg(Src);
4263 Src = VGPRSrc;
4264 }
4265
4266 return Src;
4267}
4268
4269///
4270/// This will select either an SGPR or VGPR operand and will save us from
4271/// having to write an extra tablegen pattern.
4272InstructionSelector::ComplexRendererFns
4273AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4274 return {{
4275 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4276 }};
4277}
4278
4279InstructionSelector::ComplexRendererFns
4280AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4281 Register Src;
4282 unsigned Mods;
4283 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4284
4285 return {{
4286 [=](MachineInstrBuilder &MIB) {
4287 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4288 },
4289 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4290 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4291 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4292 }};
4293}
4294
4295InstructionSelector::ComplexRendererFns
4296AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4297 Register Src;
4298 unsigned Mods;
4299 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4300 /*IsCanonicalizing=*/true,
4301 /*AllowAbs=*/false);
4302
4303 return {{
4304 [=](MachineInstrBuilder &MIB) {
4305 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4306 },
4307 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4308 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4309 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4310 }};
4311}
4312
4313InstructionSelector::ComplexRendererFns
4314AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4315 return {{
4316 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4317 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4318 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4319 }};
4320}
4321
4322InstructionSelector::ComplexRendererFns
4323AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4324 Register Src;
4325 unsigned Mods;
4326 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4327
4328 return {{
4329 [=](MachineInstrBuilder &MIB) {
4330 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4331 },
4332 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4333 }};
4334}
4335
4336InstructionSelector::ComplexRendererFns
4337AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4338 MachineOperand &Root) const {
4339 Register Src;
4340 unsigned Mods;
4341 std::tie(Src, Mods) =
4342 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4343
4344 return {{
4345 [=](MachineInstrBuilder &MIB) {
4346 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4347 },
4348 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4349 }};
4350}
4351
4352InstructionSelector::ComplexRendererFns
4353AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4354 Register Src;
4355 unsigned Mods;
4356 std::tie(Src, Mods) =
4357 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4358 /*AllowAbs=*/false);
4359
4360 return {{
4361 [=](MachineInstrBuilder &MIB) {
4362 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4363 },
4364 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4365 }};
4366}
4367
4368InstructionSelector::ComplexRendererFns
4369AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4370 Register Reg = Root.getReg();
4371 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4372 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4373 return {};
4374 return {{
4375 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4376 }};
4377}
4378
4379enum class SrcStatus {
4380 IS_SAME,
4381 IS_UPPER_HALF,
4382 IS_LOWER_HALF,
4383 IS_UPPER_HALF_NEG,
4384 // This means current op = [op_upper, op_lower] and src = -op_lower.
4385 IS_LOWER_HALF_NEG,
4386 IS_HI_NEG,
4387 // This means current op = [op_upper, op_lower] and src = [op_upper,
4388 // -op_lower].
4389 IS_LO_NEG,
4390 IS_BOTH_NEG,
4391 INVALID,
4392 NEG_START = IS_UPPER_HALF_NEG,
4393 NEG_END = IS_BOTH_NEG,
4394 HALF_START = IS_UPPER_HALF,
4395 HALF_END = IS_LOWER_HALF_NEG
4396};
4397/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`
4398static bool isTruncHalf(const MachineInstr *MI,
4399 const MachineRegisterInfo &MRI) {
4400 if (MI->getOpcode() != AMDGPU::G_TRUNC)
4401 return false;
4402
4403 unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
4404 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4405 return DstSize * 2 == SrcSize;
4406}
4407
4408/// Test if the MI is logic shift right with half bits,
4409/// such as `%reg0:2n = G_LSHR %reg1:2n, CONST(n)`
4410static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4411 if (MI->getOpcode() != AMDGPU::G_LSHR)
4412 return false;
4413
4414 Register ShiftSrc;
4415 std::optional<ValueAndVReg> ShiftAmt;
4416 if (mi_match(MI->getOperand(0).getReg(), MRI,
4417 m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4418 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4419 unsigned Shift = ShiftAmt->Value.getZExtValue();
4420 return Shift * 2 == SrcSize;
4421 }
4422 return false;
4423}
4424
4425/// Test if the MI is shift left with half bits,
4426/// such as `%reg0:2n = G_SHL %reg1:2n, CONST(n)`
4427static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4428 if (MI->getOpcode() != AMDGPU::G_SHL)
4429 return false;
4430
4431 Register ShiftSrc;
4432 std::optional<ValueAndVReg> ShiftAmt;
4433 if (mi_match(MI->getOperand(0).getReg(), MRI,
4434 m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4435 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4436 unsigned Shift = ShiftAmt->Value.getZExtValue();
4437 return Shift * 2 == SrcSize;
4438 }
4439 return false;
4440}
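// Editorial examples for the helpers above (n == 16, 2n == 32):
//   %t:_(s16) = G_TRUNC %v:_(s32)     -> isTruncHalf is true
//   %h:_(s32) = G_LSHR %v:_(s32), 16  -> isLshrHalf is true
//   %l:_(s32) = G_SHL %v:_(s32), 16   -> isShlHalf is true
// i.e. each helper recognizes one way of isolating a 16-bit half of a 32-bit
// value.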
4441
4442/// Test function, if the MI is `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`
4443static bool isUnmergeHalf(const MachineInstr *MI,
4444 const MachineRegisterInfo &MRI) {
4445 if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4446 return false;
4447 return MI->getNumOperands() == 3 && MI->getOperand(0).isDef() &&
4448 MI->getOperand(1).isDef() && !MI->getOperand(2).isDef();
4449}
4450
4451enum class TypeClass { VECTOR_OF_TWO, SCALAR, NONE_OF_LISTED };
4452
4453static TypeClass isVectorOfTwoOrScalar(Register Reg,
4454 const MachineRegisterInfo &MRI) {
4455 LLT OpTy = MRI.getType(Reg);
4456 if (OpTy.isScalar())
4457 return TypeClass::SCALAR;
4458 if (OpTy.isVector() && OpTy.getNumElements() == 2)
4459 return TypeClass::VECTOR_OF_TWO;
4460 return TypeClass::NONE_OF_LISTED;
4461}
4462
4463static SrcStatus getNegStatus(Register Reg, SrcStatus S,
4464 const MachineRegisterInfo &MRI) {
4465 TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI);
4466 if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4467 return SrcStatus::INVALID;
4468
4469 switch (S) {
4470 case SrcStatus::IS_SAME:
4471 if (NegType == TypeClass::VECTOR_OF_TWO) {
4472 // Vector of 2:
4473 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4474 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4475 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4476 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4477 return SrcStatus::IS_BOTH_NEG;
4478 }
4479 if (NegType == TypeClass::SCALAR) {
4480 // Scalar:
4481 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4482 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4483 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4484 // [SrcHi, SrcLo] = [-OpHi, OpLo]
4485 return SrcStatus::IS_HI_NEG;
4486 }
4487 break;
4488 case SrcStatus::IS_HI_NEG:
4489 if (NegType == TypeClass::VECTOR_OF_TWO) {
4490 // Vector of 2:
4491 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4492 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4493 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4494 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
4495 return SrcStatus::IS_LO_NEG;
4496 }
4497 if (NegType == TypeClass::SCALAR) {
4498 // Scalar:
4499 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4500 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4501 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4502 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
4503 return SrcStatus::IS_SAME;
4504 }
4505 break;
4506 case SrcStatus::IS_LO_NEG:
4507 if (NegType == TypeClass::VECTOR_OF_TWO) {
4508 // Vector of 2:
4509 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4510 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4511 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4512 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
4513 return SrcStatus::IS_HI_NEG;
4514 }
4515 if (NegType == TypeClass::SCALAR) {
4516 // Scalar:
4517 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4518 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4519 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4520 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4521 return SrcStatus::IS_BOTH_NEG;
4522 }
4523 break;
4524 case SrcStatus::IS_BOTH_NEG:
4525 if (NegType == TypeClass::VECTOR_OF_TWO) {
4526 // Vector of 2:
4527 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4528 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4529 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4530 // [SrcHi, SrcLo] = [OpHi, OpLo]
4531 return SrcStatus::IS_SAME;
4532 }
4533 if (NegType == TypeClass::SCALAR) {
4534 // Scalar:
4535 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4536 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4537 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4538 // [SrcHi, SrcLo] = [OpHi, -OpLo]
4539 return SrcStatus::IS_LO_NEG;
4540 }
4541 break;
4542 case SrcStatus::IS_UPPER_HALF:
4543 // Vector of 2:
4544 // Src = CurrUpper
4545 // Curr = [CurrUpper, CurrLower]
4546 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4547 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4548 // Src = -OpUpper
4549 //
4550 // Scalar:
4551 // Src = CurrUpper
4552 // Curr = [CurrUpper, CurrLower]
4553 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4554 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4555 // Src = -OpUpper
4556 return SrcStatus::IS_UPPER_HALF_NEG;
4557 case SrcStatus::IS_LOWER_HALF:
4558 if (NegType == TypeClass::VECTOR_OF_TWO) {
4559 // Vector of 2:
4560 // Src = CurrLower
4561 // Curr = [CurrUpper, CurrLower]
4562 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4563 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4564 // Src = -OpLower
4565 return SrcStatus::IS_LOWER_HALF_NEG;
4566 }
4567 if (NegType == TypeClass::SCALAR) {
4568 // Scalar:
4569 // Src = CurrLower
4570 // Curr = [CurrUpper, CurrLower]
4571 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4572 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4573 // Src = OpLower
4574 return SrcStatus::IS_LOWER_HALF;
4575 }
4576 break;
4577 case SrcStatus::IS_UPPER_HALF_NEG:
4578 // Vector of 2:
4579 // Src = -CurrUpper
4580 // Curr = [CurrUpper, CurrLower]
4581 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4582 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4583 // Src = -(-OpUpper) = OpUpper
4584 //
4585 // Scalar:
4586 // Src = -CurrUpper
4587 // Curr = [CurrUpper, CurrLower]
4588 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4589 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4590 // Src = -(-OpUpper) = OpUpper
4591 return SrcStatus::IS_UPPER_HALF;
4592 case SrcStatus::IS_LOWER_HALF_NEG:
4593 if (NegType == TypeClass::VECTOR_OF_TWO) {
4594 // Vector of 2:
4595 // Src = -CurrLower
4596 // Curr = [CurrUpper, CurrLower]
4597 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4598 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4599 // Src = -(-OpLower) = OpLower
4600 return SrcStatus::IS_LOWER_HALF;
4601 }
4602 if (NegType == TypeClass::SCALAR) {
4603 // Scalar:
4604 // Src = -CurrLower
4605 // Curr = [CurrUpper, CurrLower]
4606 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4607 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4608 // Src = -OpLower
4609 return SrcStatus::IS_LOWER_HALF_NEG;
4610 }
4611 break;
4612 default:
4613 break;
4614 }
4615 llvm_unreachable("unexpected SrcStatus & NegType combination");
4616}
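// Editorial example: if the walk has already classified the value as
// IS_HI_NEG and then steps through an fneg of a <2 x s16> operand, both
// halves flip again and the combined state becomes IS_LO_NEG, per the
// IS_HI_NEG / VECTOR_OF_TWO case above.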
4617
4618static std::optional<std::pair<Register, SrcStatus>>
4619calcNextStatus(std::pair<Register, SrcStatus> Curr,
4620 const MachineRegisterInfo &MRI) {
4621 const MachineInstr *MI = MRI.getVRegDef(Curr.first);
4622
4623 unsigned Opc = MI->getOpcode();
4624
4625 // Handle general Opc cases.
4626 switch (Opc) {
4627 case AMDGPU::G_BITCAST:
4628 return std::optional<std::pair<Register, SrcStatus>>(
4629 {MI->getOperand(1).getReg(), Curr.second});
4630 case AMDGPU::COPY:
4631 if (MI->getOperand(1).getReg().isPhysical())
4632 return std::nullopt;
4633 return std::optional<std::pair<Register, SrcStatus>>(
4634 {MI->getOperand(1).getReg(), Curr.second});
4635 case AMDGPU::G_FNEG: {
4636 SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI);
4637 if (Stat == SrcStatus::INVALID)
4638 return std::nullopt;
4639 return std::optional<std::pair<Register, SrcStatus>>(
4640 {MI->getOperand(1).getReg(), Stat});
4641 }
4642 default:
4643 break;
4644 }
4645
4646 // Calc next Stat from current Stat.
4647 switch (Curr.second) {
4648 case SrcStatus::IS_SAME:
4649 if (isTruncHalf(MI, MRI))
4650 return std::optional<std::pair<Register, SrcStatus>>(
4651 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4652 else if (isUnmergeHalf(MI, MRI)) {
4653 if (Curr.first == MI->getOperand(0).getReg())
4654 return std::optional<std::pair<Register, SrcStatus>>(
4655 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF});
4656 return std::optional<std::pair<Register, SrcStatus>>(
4657 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF});
4658 }
4659 break;
4660 case SrcStatus::IS_HI_NEG:
4661 if (isTruncHalf(MI, MRI)) {
4662 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4663 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
4664 // = [OpLowerHi, OpLowerLo]
4665 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4666 // = [-OpLowerHi, OpLowerLo]
4667 // = -OpLower
4668 return std::optional<std::pair<Register, SrcStatus>>(
4669 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4670 }
4671 if (isUnmergeHalf(MI, MRI)) {
4672 if (Curr.first == MI->getOperand(0).getReg())
4673 return std::optional<std::pair<Register, SrcStatus>>(
4674 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4675 return std::optional<std::pair<Register, SrcStatus>>(
4676 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4677 }
4678 break;
4679 case SrcStatus::IS_UPPER_HALF:
4680 if (isShlHalf(MI, MRI))
4681 return std::optional<std::pair<Register, SrcStatus>>(
4682 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4683 break;
4684 case SrcStatus::IS_LOWER_HALF:
4685 if (isLshrHalf(MI, MRI))
4686 return std::optional<std::pair<Register, SrcStatus>>(
4687 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF});
4688 break;
4689 case SrcStatus::IS_UPPER_HALF_NEG:
4690 if (isShlHalf(MI, MRI))
4691 return std::optional<std::pair<Register, SrcStatus>>(
4692 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4693 break;
4694 case SrcStatus::IS_LOWER_HALF_NEG:
4695 if (isLshrHalf(MI, MRI))
4696 return std::optional<std::pair<Register, SrcStatus>>(
4697 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4698 break;
4699 default:
4700 break;
4701 }
4702 return std::nullopt;
4703}
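// Editorial example of a walk driven by calcNextStatus:
//   %h:_(s32) = G_LSHR %v:_(s32), 16
//   %t:_(s16) = G_TRUNC %h
// Starting from %t as IS_SAME, the G_TRUNC step yields (%h, IS_LOWER_HALF)
// and the G_LSHR step then yields (%v, IS_UPPER_HALF), which is what the
// op_sel handling below consumes.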
4704
4705/// This is used to control the valid statuses that the current MI supports. For
4706/// example, a non-floating-point intrinsic such as @llvm.amdgcn.sdot2 does not
4707/// support the NEG bit on VOP3P.
4708/// The class can be further extended to recognize support for the SEL, NEG, and
4709/// ABS bits for different MIs on different architectures.
4710class SearchOptions {
4711private:
4712 bool HasNeg = false;
4713 // Assume all complex patterns of VOP3P have opsel.
4714 bool HasOpsel = true;
4715
4716public:
4717 SearchOptions(Register Reg, const MachineRegisterInfo &MRI) {
4718 const MachineInstr *MI = MRI.getVRegDef(Reg);
4719 unsigned Opc = MI->getOpcode();
4720
4721 if (Opc < TargetOpcode::GENERIC_OP_END) {
4722 // Keep same for generic op.
4723 HasNeg = true;
4724 } else if (Opc == TargetOpcode::G_INTRINSIC) {
4725 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
4726 // Only floating-point intrinsics have neg & neg_hi bits.
4727 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
4728 HasNeg = true;
4729 }
4730 }
4731 bool checkOptions(SrcStatus Stat) const {
4732 if (!HasNeg &&
4733 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
4734 return false;
4735 }
4736 if (!HasOpsel &&
4737 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
4738 return false;
4739 }
4740 return true;
4741 }
4742};
4743
4744static SmallVector<std::pair<Register, SrcStatus>>
4745getSrcStats(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
4746 int MaxDepth = 3) {
4747 int Depth = 0;
4748 auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI);
4749 SmallVector<std::pair<Register, SrcStatus>> Statlist;
4750
4751 while (Depth <= MaxDepth && Curr.has_value()) {
4752 Depth++;
4753 if (SO.checkOptions(Curr.value().second))
4754 Statlist.push_back(Curr.value());
4755 Curr = calcNextStatus(Curr.value(), MRI);
4756 }
4757
4758 return Statlist;
4759}
4760
4761static std::pair<Register, SrcStatus>
4762getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
4763 int MaxDepth = 3) {
4764 int Depth = 0;
4765 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
4766 auto Curr = calcNextStatus(LastSameOrNeg, MRI);
4767
4768 while (Depth <= MaxDepth && Curr.has_value()) {
4769 Depth++;
4770 SrcStatus Stat = Curr.value().second;
4771 if (SO.checkOptions(Stat)) {
4772 if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
4773 Stat == SrcStatus::IS_LO_NEG || Stat == SrcStatus::IS_BOTH_NEG)
4774 LastSameOrNeg = Curr.value();
4775 }
4776 Curr = calcNextStatus(Curr.value(), MRI);
4777 }
4778
4779 return LastSameOrNeg;
4780}
4781
4782static bool isSameBitWidth(Register Reg1, Register Reg2,
4783 const MachineRegisterInfo &MRI) {
4784 unsigned Width1 = MRI.getType(Reg1).getSizeInBits();
4785 unsigned Width2 = MRI.getType(Reg2).getSizeInBits();
4786 return Width1 == Width2;
4787}
4788
4789static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
4790 // SrcStatus::IS_LOWER_HALF remains 0 (no modifier bits set).
4791 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
4792 Mods ^= SISrcMods::NEG_HI;
4793 Mods |= SISrcMods::OP_SEL_1;
4794 } else if (HiStat == SrcStatus::IS_UPPER_HALF)
4795 Mods |= SISrcMods::OP_SEL_1;
4796 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
4797 Mods ^= SISrcMods::NEG_HI;
4798 else if (HiStat == SrcStatus::IS_HI_NEG)
4799 Mods ^= SISrcMods::NEG_HI;
4800
4801 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
4802 Mods ^= SISrcMods::NEG;
4803 Mods |= SISrcMods::OP_SEL_0;
4804 } else if (LoStat == SrcStatus::IS_UPPER_HALF)
4805 Mods |= SISrcMods::OP_SEL_0;
4806 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
4807 Mods |= SISrcMods::NEG;
4808 else if (LoStat == SrcStatus::IS_HI_NEG)
4809 Mods ^= SISrcMods::NEG;
4810
4811 return Mods;
4812}
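// Editorial example: HiStat == IS_UPPER_HALF with LoStat == IS_LOWER_HALF
// describes a source already in the natural [hi, lo] layout, so only
// OP_SEL_1 is added for the high half; a negated half additionally toggles
// the corresponding NEG or NEG_HI bit.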
4813
4814static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
4815 Register RootReg, const SIInstrInfo &TII,
4816 const MachineRegisterInfo &MRI) {
4817 auto IsHalfState = [](SrcStatus S) {
4818 return S == SrcStatus::IS_UPPER_HALF || S == SrcStatus::IS_UPPER_HALF_NEG ||
4819 S == SrcStatus::IS_LOWER_HALF || S == SrcStatus::IS_LOWER_HALF_NEG;
4820 };
4821 return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) &&
4822 IsHalfState(HiStat);
4823}
4824
4825std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
4826 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
4827 unsigned Mods = 0;
4828 // No modifiers if the Root type is not of the form <2 x Type>.
4829 if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
4830 Mods |= SISrcMods::OP_SEL_1;
4831 return {RootReg, Mods};
4832 }
4833
4834 SearchOptions SO(RootReg, MRI);
4835
4836 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO);
4837
4838 if (Stat.second == SrcStatus::IS_BOTH_NEG)
4839 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
4840 else if (Stat.second == SrcStatus::IS_HI_NEG)
4841 Mods ^= SISrcMods::NEG_HI;
4842 else if (Stat.second == SrcStatus::IS_LO_NEG)
4843 Mods ^= SISrcMods::NEG;
4844
4845 MachineInstr *MI = MRI.getVRegDef(Stat.first);
4846
4847 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
4848 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
4849 Mods |= SISrcMods::OP_SEL_1;
4850 return {Stat.first, Mods};
4851 }
4852
4853 SmallVector<std::pair<Register, SrcStatus>> StatlistHi =
4854 getSrcStats(MI->getOperand(2).getReg(), MRI, SO);
4855
4856 if (StatlistHi.empty()) {
4857 Mods |= SISrcMods::OP_SEL_1;
4858 return {Stat.first, Mods};
4859 }
4860
4861 SmallVector<std::pair<Register, SrcStatus>> StatlistLo =
4862 getSrcStats(MI->getOperand(1).getReg(), MRI, SO);
4863
4864 if (StatlistLo.empty()) {
4865 Mods |= SISrcMods::OP_SEL_1;
4866 return {Stat.first, Mods};
4867 }
4868
4869 for (int I = StatlistHi.size() - 1; I >= 0; I--) {
4870 for (int J = StatlistLo.size() - 1; J >= 0; J--) {
4871 if (StatlistHi[I].first == StatlistLo[J].first &&
4872 isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
4873 StatlistHi[I].first, RootReg, TII, MRI))
4874 return {StatlistHi[I].first,
4875 updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
4876 }
4877 }
4878 // Packed instructions do not have abs modifiers.
4879 Mods |= SISrcMods::OP_SEL_1;
4880
4881 return {Stat.first, Mods};
4882}
4883
4885
4886static bool checkRB(Register Reg, unsigned int RBNo,
4887 const AMDGPURegisterBankInfo &RBI,
4888 const MachineRegisterInfo &MRI,
4889 const TargetRegisterInfo &TRI) {
4890 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
4891 return RB->getID() == RBNo;
4892}
4893
4894// This function is used to get the correct register bank for returned reg.
4895// Assume:
4896// 1. VOP3P is always legal for VGPR.
4897// 2. RootOp's regbank is legal.
4898// Thus
4899// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
4900// 2. If RootOp is VGPR, then NewOp must be VGPR.
4901static Register getLegalRegBank(Register NewReg, Register RootReg,
4902 const AMDGPURegisterBankInfo &RBI,
4903 MachineRegisterInfo &MRI,
4904 const TargetRegisterInfo &TRI,
4905 const SIInstrInfo &TII) {
4906 // RootOp can only be VGPR or SGPR (some hand-written cases such as
4907 // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
4908 if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
4909 checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
4910 return NewReg;
4911
4912 MachineInstr *MI = MRI.getVRegDef(RootReg);
4913 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(1).getReg()) {
4914 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
4915 return RootReg;
4916 }
4917
4918 MachineBasicBlock *BB = MI->getParent();
4919 Register DstReg = MRI.cloneVirtualRegister(RootReg);
4920
4922 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
4923 .addReg(NewReg);
4924
4925 // Only accept VGPR.
4926 return MIB->getOperand(0).getReg();
4927}
4928
4930AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
4931 bool IsDOT) const {
4932 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
4933 Register Reg;
4934 unsigned Mods;
4935 std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT);
4936
4937 Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII);
4938 return {{
4939 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4940 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4941 }};
4942}
4943
4945AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
4946
4947 return selectVOP3PRetHelper(Root);
4948}
4949
4951AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
4952
4953 return selectVOP3PRetHelper(Root, true);
4954}
4955
4957AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
4958 MachineOperand &Root) const {
4959 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4960 "expected i1 value");
4961 unsigned Mods = SISrcMods::OP_SEL_1;
4962 if (Root.getImm() != 0)
4963 Mods |= SISrcMods::OP_SEL_0;
4964
4965 return {{
4966 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4967 }};
4968}
4969
4970static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
4971 MachineInstr *InsertPt,
4972 MachineRegisterInfo &MRI) {
4973 const TargetRegisterClass *DstRegClass;
4974 switch (Elts.size()) {
4975 case 8:
4976 DstRegClass = &AMDGPU::VReg_256RegClass;
4977 break;
4978 case 4:
4979 DstRegClass = &AMDGPU::VReg_128RegClass;
4980 break;
4981 case 2:
4982 DstRegClass = &AMDGPU::VReg_64RegClass;
4983 break;
4984 default:
4985 llvm_unreachable("unhandled Reg sequence size");
4986 }
4987
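// Glue the elements into one wide virtual register; each element occupies the
// next 32-bit channel of the REG_SEQUENCE result.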
4988 MachineIRBuilder B(*InsertPt);
4989 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
4990 .addDef(MRI.createVirtualRegister(DstRegClass));
4991 for (unsigned i = 0; i < Elts.size(); ++i) {
4992 MIB.addReg(Elts[i]);
4993 MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
4994 }
4995 return MIB->getOperand(0).getReg();
4996}
4997
4998static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
4999 SmallVectorImpl<Register> &Elts, Register &Src,
5000 MachineInstr *InsertPt,
5001 MachineRegisterInfo &MRI) {
5002 if (ModOpcode == TargetOpcode::G_FNEG) {
5003 Mods |= SISrcMods::NEG;
5004 // Check if all elements also have abs modifier
5005 SmallVector<Register, 8> NegAbsElts;
5006 for (auto El : Elts) {
5007 Register FabsSrc;
5008 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
5009 break;
5010 NegAbsElts.push_back(FabsSrc);
5011 }
5012 if (Elts.size() != NegAbsElts.size()) {
5013 // Neg
5014 Src = buildRegSequence(Elts, InsertPt, MRI);
5015 } else {
5016 // Neg and Abs
5017 Mods |= SISrcMods::NEG_HI;
5018 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
5019 }
5020 } else {
5021 assert(ModOpcode == TargetOpcode::G_FABS);
5022 // Abs
5023 Mods |= SISrcMods::NEG_HI;
5024 Src = buildRegSequence(Elts, InsertPt, MRI);
5025 }
5026}
5027
5029AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
5030 Register Src = Root.getReg();
5031 unsigned Mods = SISrcMods::OP_SEL_1;
5032 SmallVector<Register, 8> EltsF32;
5033
5034 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
5035 assert(BV->getNumSources() > 0);
5036 // Based on first element decide which mod we match, neg or abs
5037 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
5038 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5039 ? AMDGPU::G_FNEG
5040 : AMDGPU::G_FABS;
5041 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5042 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
5043 if (ElF32->getOpcode() != ModOpcode)
5044 break;
5045 EltsF32.push_back(ElF32->getOperand(1).getReg());
5046 }
5047
5048 // All elements had ModOpcode modifier
5049 if (BV->getNumSources() == EltsF32.size()) {
5050 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
5051 *MRI);
5052 }
5053 }
5054
5055 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5056 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5057}
5058
5060AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
5061 Register Src = Root.getReg();
5062 unsigned Mods = SISrcMods::OP_SEL_1;
5063 SmallVector<Register, 8> EltsV2F16;
5064
5065 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5066 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5067 Register FNegSrc;
5068 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
5069 break;
5070 EltsV2F16.push_back(FNegSrc);
5071 }
5072
5073 // All elements had ModOpcode modifier
5074 if (CV->getNumSources() == EltsV2F16.size()) {
5075 Mods |= SISrcMods::NEG;
5076 Mods |= SISrcMods::NEG_HI;
5077 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
5078 }
5079 }
5080
5081 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5082 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5083}
5084
5086AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
5087 Register Src = Root.getReg();
5088 unsigned Mods = SISrcMods::OP_SEL_1;
5089 SmallVector<Register, 8> EltsV2F16;
5090
5091 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5092 assert(CV->getNumSources() > 0);
5093 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
5094 // Based on first element decide which mod we match, neg or abs
5095 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
5096 ? AMDGPU::G_FNEG
5097 : AMDGPU::G_FABS;
5098
5099 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5100 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
5101 if (ElV2F16->getOpcode() != ModOpcode)
5102 break;
5103 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
5104 }
5105
5106 // All elements had ModOpcode modifier
5107 if (CV->getNumSources() == EltsV2F16.size()) {
5108 MachineIRBuilder B(*Root.getParent());
5109 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
5110 *MRI);
5111 }
5112 }
5113
5114 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5115 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5116}
5117
5119AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
5120 std::optional<FPValueAndVReg> FPValReg;
5121 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
5122 if (TII.isInlineConstant(FPValReg->Value)) {
5123 return {{[=](MachineInstrBuilder &MIB) {
5124 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
5125 }}};
5126 }
5127 // Non-inlineable splat floats should not fall-through for integer immediate
5128 // checks.
5129 return {};
5130 }
5131
5132 APInt ICst;
5133 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
5134 if (TII.isInlineConstant(ICst)) {
5135 return {
5136 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
5137 }
5138 }
5139
5140 return {};
5141}
5142
5144AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
5145 Register Src =
5146 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5147 unsigned Key = 0;
5148
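// If the index operand is a 32-bit value shifted right by a whole number of
// bytes, drop the shift and encode the byte position in index_key instead.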
5149 Register ShiftSrc;
5150 std::optional<ValueAndVReg> ShiftAmt;
5151 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5152 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5153 ShiftAmt->Value.getZExtValue() % 8 == 0) {
5154 Key = ShiftAmt->Value.getZExtValue() / 8;
5155 Src = ShiftSrc;
5156 }
5157
5158 return {{
5159 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5160 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5161 }};
5162}
5163
5165AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
5166
5167 Register Src =
5168 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5169 unsigned Key = 0;
5170
5171 Register ShiftSrc;
5172 std::optional<ValueAndVReg> ShiftAmt;
5173 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5174 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5175 ShiftAmt->Value.getZExtValue() == 16) {
5176 Src = ShiftSrc;
5177 Key = 1;
5178 }
5179
5180 return {{
5181 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5182 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5183 }};
5184}
5185
5187AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
5188 Register Src =
5189 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5190 unsigned Key = 0;
5191
5192 Register S32 = matchZeroExtendFromS32(Src);
5193 if (!S32)
5194 S32 = matchAnyExtendFromS32(Src);
5195
5196 if (S32) {
5197 const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI);
5198 if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
5199 assert(Def->getNumOperands() == 3);
5200 Register DstReg1 = Def->getOperand(1).getReg();
5201 if (mi_match(S32, *MRI,
5202 m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) {
5203 Src = Def->getOperand(2).getReg();
5204 Key = 1;
5205 }
5206 }
5207 }
5208
5209 return {{
5210 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5211 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5212 }};
5213}
5214
5216AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5217 Register Src;
5218 unsigned Mods;
5219 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5220
5221 // FIXME: Handle op_sel
5222 return {{
5223 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5224 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5225 }};
5226}
5227
5228// FIXME-TRUE16 remove when fake16 is removed
5230AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5231 Register Src;
5232 unsigned Mods;
5233 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5234 /*IsCanonicalizing=*/true,
5235 /*AllowAbs=*/false,
5236 /*OpSel=*/false);
5237
5238 return {{
5239 [=](MachineInstrBuilder &MIB) {
5240 MIB.addReg(
5241 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5242 },
5243 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5244 }};
5245}
5246
5248AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5249 Register Src;
5250 unsigned Mods;
5251 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5252 /*IsCanonicalizing=*/true,
5253 /*AllowAbs=*/false,
5254 /*OpSel=*/true);
5255
5256 return {{
5257 [=](MachineInstrBuilder &MIB) {
5258 MIB.addReg(
5259 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5260 },
5261 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5262 }};
5263}
5264
5265// Given \p Offset and load specified by the \p Root operand check if \p Offset
5266 // is a multiple of the load byte size. If it is, update \p Offset to a
5267// pre-scaled value and return true.
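// e.g. for a 4-byte load an offset of (%idx << 2) is replaced by %idx, and the
// callers below then request hardware scaling by setting the SCAL cpol bit.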
5268bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
5269 Register &Offset,
5270 bool IsSigned) const {
5271 if (!Subtarget->hasScaleOffset())
5272 return false;
5273
5274 const MachineInstr &MI = *Root.getParent();
5275 MachineMemOperand *MMO = *MI.memoperands_begin();
5276
5277 if (!MMO->getSize().hasValue())
5278 return false;
5279
5280 uint64_t Size = MMO->getSize().getValue();
5281
5282 Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
5283 if (!OffsetReg)
5284 OffsetReg = Offset;
5285
5286 if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
5287 OffsetReg = Def->Reg;
5288
5289 Register Op0;
5290 MachineInstr *Mul;
5291 bool ScaleOffset =
5292 (isPowerOf2_64(Size) &&
5293 mi_match(OffsetReg, *MRI,
5294 m_GShl(m_Reg(Op0),
5295 m_any_of(m_SpecificICst(Log2_64(Size)),
5296 m_Copy(m_SpecificICst(Log2_64(Size))))))) ||
5297 mi_match(OffsetReg, *MRI,
5298 m_GMul(m_Reg(Op0), m_any_of(m_SpecificICst(Size),
5299 m_Copy(m_SpecificICst(Size))))) ||
5300 mi_match(
5301 OffsetReg, *MRI,
5302 m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
5303 m_Reg(Op0), m_SpecificICst(Size))) ||
5304 // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
5305 (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
5306 (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
5307 : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
5308 (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
5309 VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
5310 mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
5311 mi_match(Mul->getOperand(3).getReg(), *MRI,
5313 m_Copy(m_SpecificICst(Size))))) &&
5314 mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
5315
5316 if (ScaleOffset)
5317 Offset = Op0;
5318
5319 return ScaleOffset;
5320}
5321
5322bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
5323 Register &Base,
5324 Register *SOffset,
5325 int64_t *Offset,
5326 bool *ScaleOffset) const {
5327 MachineInstr *MI = Root.getParent();
5328 MachineBasicBlock *MBB = MI->getParent();
5329
5330 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
5331 // then we can select all ptr + 32-bit offsets.
5332 SmallVector<GEPInfo, 4> AddrInfo;
5333 getAddrModeInfo(*MI, *MRI, AddrInfo);
5334
5335 if (AddrInfo.empty())
5336 return false;
5337
5338 const GEPInfo &GEPI = AddrInfo[0];
5339 std::optional<int64_t> EncodedImm;
5340
5341 if (ScaleOffset)
5342 *ScaleOffset = false;
5343
5344 if (SOffset && Offset) {
5345 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5346 /*HasSOffset=*/true);
5347 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
5348 AddrInfo.size() > 1) {
5349 const GEPInfo &GEPI2 = AddrInfo[1];
5350 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
5351 Register OffsetReg = GEPI2.SgprParts[1];
5352 if (ScaleOffset)
5353 *ScaleOffset =
5354 selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5355 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5356 if (OffsetReg) {
5357 Base = GEPI2.SgprParts[0];
5358 *SOffset = OffsetReg;
5359 *Offset = *EncodedImm;
5360 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
5361 return true;
5362
5363 // For unbuffered smem loads, it is illegal for the Immediate Offset
5364 // to be negative if the resulting (Offset + (M0 or SOffset or zero))
5365 // is negative. Handle the case where the Immediate Offset + SOffset
5366 // is negative.
5367 auto SKnown = VT->getKnownBits(*SOffset);
5368 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
5369 return false;
5370
5371 return true;
5372 }
5373 }
5374 }
5375 return false;
5376 }
5377
5378 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5379 /*HasSOffset=*/false);
5380 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
5381 Base = GEPI.SgprParts[0];
5382 *Offset = *EncodedImm;
5383 return true;
5384 }
5385
5386 // SGPR offset is unsigned.
5387 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
5388 GEPI.Imm != 0) {
5389 // If we make it this far we have a load with a 32-bit immediate offset.
5390 // It is OK to select this using a sgpr offset, because we have already
5391 // failed trying to select this load into one of the _IMM variants since
5392 // the _IMM Patterns are considered before the _SGPR patterns.
5393 Base = GEPI.SgprParts[0];
5394 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5395 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
5396 .addImm(GEPI.Imm);
5397 return true;
5398 }
5399
5400 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
5401 Register OffsetReg = GEPI.SgprParts[1];
5402 if (ScaleOffset)
5403 *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5404 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5405 if (OffsetReg) {
5406 Base = GEPI.SgprParts[0];
5407 *SOffset = OffsetReg;
5408 return true;
5409 }
5410 }
5411
5412 return false;
5413}
5414
5416AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5417 Register Base;
5418 int64_t Offset;
5419 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
5420 /* ScaleOffset */ nullptr))
5421 return std::nullopt;
5422
5423 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5424 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
5425}
5426
5428AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5429 SmallVector<GEPInfo, 4> AddrInfo;
5430 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
5431
5432 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5433 return std::nullopt;
5434
5435 const GEPInfo &GEPInfo = AddrInfo[0];
5436 Register PtrReg = GEPInfo.SgprParts[0];
5437 std::optional<int64_t> EncodedImm =
5438 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
5439 if (!EncodedImm)
5440 return std::nullopt;
5441
5442 return {{
5443 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
5444 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
5445 }};
5446}
5447
5449AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
5450 Register Base, SOffset;
5451 bool ScaleOffset;
5452 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
5453 &ScaleOffset))
5454 return std::nullopt;
5455
5456 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5457 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5458 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5459 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5460}
5461
5463AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
5464 Register Base, SOffset;
5465 int64_t Offset;
5466 bool ScaleOffset;
5467 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
5468 return std::nullopt;
5469
5470 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5471 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5472 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5473 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5474 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5475}
5476
5477std::pair<Register, int>
5478AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
5479 uint64_t FlatVariant) const {
5480 MachineInstr *MI = Root.getParent();
5481
5482 auto Default = std::pair(Root.getReg(), 0);
5483
5484 if (!STI.hasFlatInstOffsets())
5485 return Default;
5486
5487 Register PtrBase;
5488 int64_t ConstOffset;
5489 bool IsInBounds;
5490 std::tie(PtrBase, ConstOffset, IsInBounds) =
5491 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5492
5493 // Adding the offset to the base address with an immediate in a FLAT
5494 // instruction must not change the memory aperture in which the address falls.
5495 // Therefore we can only fold offsets from inbounds GEPs into FLAT
5496 // instructions.
5497 if (ConstOffset == 0 ||
5498 (FlatVariant == SIInstrFlags::FlatScratch &&
5499 !isFlatScratchBaseLegal(Root.getReg())) ||
5500 (FlatVariant == SIInstrFlags::FLAT && !IsInBounds))
5501 return Default;
5502
5503 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
5504 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
5505 return Default;
5506
5507 return std::pair(PtrBase, ConstOffset);
5508}
5509
5511AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
5512 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
5513
5514 return {{
5515 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5516 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5517 }};
5518}
5519
5521AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
5522 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
5523
5524 return {{
5525 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5526 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5527 }};
5528}
5529
5531AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
5532 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
5533
5534 return {{
5535 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5536 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5537 }};
5538}
5539
5540// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
5542AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
5543 unsigned CPolBits,
5544 bool NeedIOffset) const {
5545 Register Addr = Root.getReg();
5546 Register PtrBase;
5547 int64_t ConstOffset;
5548 int64_t ImmOffset = 0;
5549
5550 // Match the immediate offset first, which canonically is moved as low as
5551 // possible.
5552 std::tie(PtrBase, ConstOffset, std::ignore) =
5553 getPtrBaseWithConstantOffset(Addr, *MRI);
5554
5555 if (ConstOffset != 0) {
5556 if (NeedIOffset &&
5557 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5558 SIInstrFlags::FlatGlobal)) {
5559 Addr = PtrBase;
5560 ImmOffset = ConstOffset;
5561 } else {
5562 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
5563 if (isSGPR(PtrBaseDef->Reg)) {
5564 if (ConstOffset > 0) {
5565 // Offset is too large.
5566 //
5567 // saddr + large_offset -> saddr +
5568 // (voffset = large_offset & ~MaxOffset) +
5569 // (large_offset & MaxOffset);
5570 int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
5571 if (NeedIOffset) {
5572 std::tie(SplitImmOffset, RemainderOffset) =
5573 TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5574 SIInstrFlags::FlatGlobal);
5575 }
5576
5577 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
5578 : isUInt<32>(RemainderOffset)) {
5579 MachineInstr *MI = Root.getParent();
5580 MachineBasicBlock *MBB = MI->getParent();
5581 Register HighBits =
5582 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5583
5584 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5585 HighBits)
5586 .addImm(RemainderOffset);
5587
5588 if (NeedIOffset)
5589 return {{
5590 [=](MachineInstrBuilder &MIB) {
5591 MIB.addReg(PtrBase);
5592 }, // saddr
5593 [=](MachineInstrBuilder &MIB) {
5594 MIB.addReg(HighBits);
5595 }, // voffset
5596 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
5597 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5598 }};
5599 return {{
5600 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
5601 [=](MachineInstrBuilder &MIB) {
5602 MIB.addReg(HighBits);
5603 }, // voffset
5604 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5605 }};
5606 }
5607 }
5608
5609 // We are adding a 64 bit SGPR and a constant. If constant bus limit
5610 // is 1 we would need to perform 1 or 2 extra moves for each half of
5611 // the constant and it is better to do a scalar add and then issue a
5612 // single VALU instruction to materialize zero. Otherwise it is less
5613 // instructions to perform VALU adds with immediates or inline literals.
5614 unsigned NumLiterals =
5615 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
5616 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
5617 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
5618 return std::nullopt;
5619 }
5620 }
5621 }
5622
5623 // Match the variable offset.
5624 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5625 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5626 // Look through the SGPR->VGPR copy.
5627 Register SAddr =
5628 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
5629
5630 if (isSGPR(SAddr)) {
5631 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
5632
5633 // It's possible voffset is an SGPR here, but the copy to VGPR will be
5634 // inserted later.
5635 bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
5636 Subtarget->hasSignedGVSOffset());
5637 if (Register VOffset = matchExtendFromS32OrS32(
5638 PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
5639 if (NeedIOffset)
5640 return {{[=](MachineInstrBuilder &MIB) { // saddr
5641 MIB.addReg(SAddr);
5642 },
5643 [=](MachineInstrBuilder &MIB) { // voffset
5644 MIB.addReg(VOffset);
5645 },
5646 [=](MachineInstrBuilder &MIB) { // offset
5647 MIB.addImm(ImmOffset);
5648 },
5649 [=](MachineInstrBuilder &MIB) { // cpol
5650 MIB.addImm(CPolBits |
5651 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5652 }}};
5653 return {{[=](MachineInstrBuilder &MIB) { // saddr
5654 MIB.addReg(SAddr);
5655 },
5656 [=](MachineInstrBuilder &MIB) { // voffset
5657 MIB.addReg(VOffset);
5658 },
5659 [=](MachineInstrBuilder &MIB) { // cpol
5660 MIB.addImm(CPolBits |
5661 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5662 }}};
5663 }
5664 }
5665 }
5666
5667 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
5668 // drop this.
5669 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
5670 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
5671 return std::nullopt;
5672
5673 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
5674 // moves required to copy a 64-bit SGPR to VGPR.
5675 MachineInstr *MI = Root.getParent();
5676 MachineBasicBlock *MBB = MI->getParent();
5677 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5678
5679 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
5680 .addImm(0);
5681
5682 if (NeedIOffset)
5683 return {{
5684 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
5685 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
5686 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5687 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
5688 }};
5689 return {{
5690 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
5691 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
5692 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
5693 }};
5694}
5695
5697AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
5698 return selectGlobalSAddr(Root, 0);
5699}
5700
5702AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
5703 const MachineInstr &I = *Root.getParent();
5704
5705 // We are assuming CPol is always the last operand of the intrinsic.
5706 auto PassedCPol =
5707 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
5708 return selectGlobalSAddr(Root, PassedCPol);
5709}
5710
5712AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
5713 return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
5714}
5715
5717AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
5718 MachineOperand &Root) const {
5719 const MachineInstr &I = *Root.getParent();
5720
5721 // We are assuming CPol is always the last operand of the intrinsic.
5722 auto PassedCPol =
5723 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
5724 return selectGlobalSAddr(Root, PassedCPol, false);
5725}
5726
5728AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
5729 Register Addr = Root.getReg();
5730 Register PtrBase;
5731 int64_t ConstOffset;
5732 int64_t ImmOffset = 0;
5733
5734 // Match the immediate offset first, which canonically is moved as low as
5735 // possible.
5736 std::tie(PtrBase, ConstOffset, std::ignore) =
5737 getPtrBaseWithConstantOffset(Addr, *MRI);
5738
5739 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
5740 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
5741 SIInstrFlags::FlatScratch)) {
5742 Addr = PtrBase;
5743 ImmOffset = ConstOffset;
5744 }
5745
5746 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5747 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5748 int FI = AddrDef->MI->getOperand(1).getIndex();
5749 return {{
5750 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
5751 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5752 }};
5753 }
5754
5755 Register SAddr = AddrDef->Reg;
5756
5757 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5758 Register LHS = AddrDef->MI->getOperand(1).getReg();
5759 Register RHS = AddrDef->MI->getOperand(2).getReg();
5760 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
5761 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
5762
5763 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
5764 isSGPR(RHSDef->Reg)) {
5765 int FI = LHSDef->MI->getOperand(1).getIndex();
5766 MachineInstr &I = *Root.getParent();
5767 MachineBasicBlock *BB = I.getParent();
5768 const DebugLoc &DL = I.getDebugLoc();
5769 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5770
5771 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
5772 .addFrameIndex(FI)
5773 .addReg(RHSDef->Reg)
5774 .setOperandDead(3); // Dead scc
5775 }
5776 }
5777
5778 if (!isSGPR(SAddr))
5779 return std::nullopt;
5780
5781 return {{
5782 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
5783 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5784 }};
5785}
5786
5787// Check whether the flat scratch SVS swizzle bug affects this access.
5788bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
5789 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
5790 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
5791 return false;
5792
5793 // The bug affects the swizzling of SVS accesses if there is any carry out
5794 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
5795 // voffset to (soffset + inst_offset).
5796 auto VKnown = VT->getKnownBits(VAddr);
5797 auto SKnown = KnownBits::add(VT->getKnownBits(SAddr),
5798 KnownBits::makeConstant(APInt(32, ImmOffset)));
5799 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
5800 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
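// e.g. if VMax & 3 == 2 and SMax & 3 == 3, the sum 5 >= 4 means a carry out of
// bit 1 cannot be ruled out, so conservatively report the bug as possible.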
5801 return (VMax & 3) + (SMax & 3) >= 4;
5802}
5803
5805AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
5806 Register Addr = Root.getReg();
5807 Register PtrBase;
5808 int64_t ConstOffset;
5809 int64_t ImmOffset = 0;
5810
5811 // Match the immediate offset first, which canonically is moved as low as
5812 // possible.
5813 std::tie(PtrBase, ConstOffset, std::ignore) =
5814 getPtrBaseWithConstantOffset(Addr, *MRI);
5815
5816 Register OrigAddr = Addr;
5817 if (ConstOffset != 0 &&
5818 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
5819 SIInstrFlags::FlatScratch)) {
5820 Addr = PtrBase;
5821 ImmOffset = ConstOffset;
5822 }
5823
5824 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5825 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
5826 return std::nullopt;
5827
5828 Register RHS = AddrDef->MI->getOperand(2).getReg();
5829 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
5830 return std::nullopt;
5831
5832 Register LHS = AddrDef->MI->getOperand(1).getReg();
5833 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
5834
5835 if (OrigAddr != Addr) {
5836 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
5837 return std::nullopt;
5838 } else {
5839 if (!isFlatScratchBaseLegalSV(OrigAddr))
5840 return std::nullopt;
5841 }
5842
5843 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
5844 return std::nullopt;
5845
5846 unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */)
5847 ? AMDGPU::CPol::SCAL
5848 : 0;
5849
5850 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5851 int FI = LHSDef->MI->getOperand(1).getIndex();
5852 return {{
5853 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
5854 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
5855 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5856 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
5857 }};
5858 }
5859
5860 if (!isSGPR(LHS))
5861 if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI))
5862 LHS = Def->Reg;
5863
5864 if (!isSGPR(LHS))
5865 return std::nullopt;
5866
5867 return {{
5868 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
5869 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
5870 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5871 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
5872 }};
5873}
5874
5876AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
5877 MachineInstr *MI = Root.getParent();
5878 MachineBasicBlock *MBB = MI->getParent();
5879 const MachineFunction *MF = MBB->getParent();
5880 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5881
5882 int64_t Offset = 0;
5883 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
5884 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
5885 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5886
5887 // TODO: Should this be inside the render function? The iterator seems to
5888 // move.
5889 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
5890 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5891 HighBits)
5892 .addImm(Offset & ~MaxOffset);
5893
5894 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5895 MIB.addReg(Info->getScratchRSrcReg());
5896 },
5897 [=](MachineInstrBuilder &MIB) { // vaddr
5898 MIB.addReg(HighBits);
5899 },
5900 [=](MachineInstrBuilder &MIB) { // soffset
5901 // Use constant zero for soffset and rely on eliminateFrameIndex
5902 // to choose the appropriate frame register if need be.
5903 MIB.addImm(0);
5904 },
5905 [=](MachineInstrBuilder &MIB) { // offset
5906 MIB.addImm(Offset & MaxOffset);
5907 }}};
5908 }
5909
5910 assert(Offset == 0 || Offset == -1);
5911
5912 // Try to fold a frame index directly into the MUBUF vaddr field, and any
5913 // offsets.
5914 std::optional<int> FI;
5915 Register VAddr = Root.getReg();
5916
5917 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5918 Register PtrBase;
5919 int64_t ConstOffset;
5920 std::tie(PtrBase, ConstOffset, std::ignore) =
5921 getPtrBaseWithConstantOffset(VAddr, *MRI);
5922 if (ConstOffset != 0) {
5923 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
5924 (!STI.privateMemoryResourceIsRangeChecked() ||
5925 VT->signBitIsZero(PtrBase))) {
5926 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
5927 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
5928 FI = PtrBaseDef->getOperand(1).getIndex();
5929 else
5930 VAddr = PtrBase;
5931 Offset = ConstOffset;
5932 }
5933 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5934 FI = RootDef->getOperand(1).getIndex();
5935 }
5936
5937 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5938 MIB.addReg(Info->getScratchRSrcReg());
5939 },
5940 [=](MachineInstrBuilder &MIB) { // vaddr
5941 if (FI)
5942 MIB.addFrameIndex(*FI);
5943 else
5944 MIB.addReg(VAddr);
5945 },
5946 [=](MachineInstrBuilder &MIB) { // soffset
5947 // Use constant zero for soffset and rely on eliminateFrameIndex
5948 // to choose the appropriate frame register if need be.
5949 MIB.addImm(0);
5950 },
5951 [=](MachineInstrBuilder &MIB) { // offset
5952 MIB.addImm(Offset);
5953 }}};
5954}
5955
5956bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
5957 int64_t Offset) const {
5958 if (!isUInt<16>(Offset))
5959 return false;
5960
5961 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
5962 return true;
5963
5964 // On Southern Islands, instructions with a negative base value and an
5965 // offset don't seem to work.
5966 return VT->signBitIsZero(Base);
5967}
5968
5969bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
5970 int64_t Offset1,
5971 unsigned Size) const {
5972 if (Offset0 % Size != 0 || Offset1 % Size != 0)
5973 return false;
5974 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
5975 return false;
5976
5977 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
5978 return true;
5979
5980 // On Southern Islands, instructions with a negative base value and an
5981 // offset don't seem to work.
5982 return VT->signBitIsZero(Base);
5983}
5984
5985// Return whether the operation has NoUnsignedWrap property.
5986static bool isNoUnsignedWrap(MachineInstr *Addr) {
5987 return Addr->getOpcode() == TargetOpcode::G_OR ||
5988 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
5989 Addr->getFlag(MachineInstr::NoUWrap));
5990}
5991
5992// Check that the base address of flat scratch load/store in the form of `base +
5993// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
5994// requirement). We always treat the first operand as the base address here.
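// e.g. a G_PTR_ADD flagged nuw is always accepted, while a plain add requires
// the checks below to prove the base cannot be negative.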
5995bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
5996 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5997
5998 if (isNoUnsignedWrap(AddrMI))
5999 return true;
6000
6001 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6002 // values.
6003 if (STI.hasSignedScratchOffsets())
6004 return true;
6005
6006 Register LHS = AddrMI->getOperand(1).getReg();
6007 Register RHS = AddrMI->getOperand(2).getReg();
6008
6009 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6010 std::optional<ValueAndVReg> RhsValReg =
6011 getIConstantVRegValWithLookThrough(RHS, *MRI);
6012 // If the immediate offset is negative and within certain range, the base
6013 // address cannot also be negative. If the base is also negative, the sum
6014 // would be either negative or much larger than the valid range of scratch
6015 // memory a thread can access.
6016 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
6017 RhsValReg->Value.getSExtValue() > -0x40000000)
6018 return true;
6019 }
6020
6021 return VT->signBitIsZero(LHS);
6022}
6023
6024// Check that the address values in SGPR/VGPR are legal for flat scratch in
6025// the form of: SGPR + VGPR.
6026bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
6027 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6028
6029 if (isNoUnsignedWrap(AddrMI))
6030 return true;
6031
6032 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6033 // values.
6034 if (STI.hasSignedScratchOffsets())
6035 return true;
6036
6037 Register LHS = AddrMI->getOperand(1).getReg();
6038 Register RHS = AddrMI->getOperand(2).getReg();
6039 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6040}
6041
6042// Check that the address values in SGPR/VGPR are legal for flat scratch in
6043// the form of: SGPR + VGPR + Imm.
6044bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
6045 Register Addr) const {
6046 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6047 // values.
6048 if (STI.hasSignedScratchOffsets())
6049 return true;
6050
6051 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6052 Register Base = AddrMI->getOperand(1).getReg();
6053 std::optional<DefinitionAndSourceRegister> BaseDef =
6054 getDefSrcRegIgnoringCopies(Base, *MRI);
6055 std::optional<ValueAndVReg> RHSOffset =
6056 getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
6057 assert(RHSOffset);
6058
6059 // If the immediate offset is negative and within certain range, the base
6060 // address cannot also be negative. If the base is also negative, the sum
6061 // would be either negative or much larger than the valid range of scratch
6062 // memory a thread can access.
6063 if (isNoUnsignedWrap(BaseDef->MI) &&
6064 (isNoUnsignedWrap(AddrMI) ||
6065 (RHSOffset->Value.getSExtValue() < 0 &&
6066 RHSOffset->Value.getSExtValue() > -0x40000000)))
6067 return true;
6068
6069 Register LHS = BaseDef->MI->getOperand(1).getReg();
6070 Register RHS = BaseDef->MI->getOperand(2).getReg();
6071 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6072}
6073
6074bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
6075 unsigned ShAmtBits) const {
6076 assert(MI.getOpcode() == TargetOpcode::G_AND);
6077
6078 std::optional<APInt> RHS =
6079 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
6080 if (!RHS)
6081 return false;
6082
6083 if (RHS->countr_one() >= ShAmtBits)
6084 return true;
6085
6086 const APInt &LHSKnownZeros = VT->getKnownZeroes(MI.getOperand(1).getReg());
6087 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
6088}
6089
6091AMDGPUInstructionSelector::selectMUBUFScratchOffset(
6092 MachineOperand &Root) const {
6093 Register Reg = Root.getReg();
6094 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6095
6096 std::optional<DefinitionAndSourceRegister> Def =
6097 getDefSrcRegIgnoringCopies(Reg, *MRI);
6098 assert(Def && "this shouldn't be an optional result");
6099 Reg = Def->Reg;
6100
6101 if (Register WaveBase = getWaveAddress(Def->MI)) {
6102 return {{
6103 [=](MachineInstrBuilder &MIB) { // rsrc
6104 MIB.addReg(Info->getScratchRSrcReg());
6105 },
6106 [=](MachineInstrBuilder &MIB) { // soffset
6107 MIB.addReg(WaveBase);
6108 },
6109 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
6110 }};
6111 }
6112
6113 int64_t Offset = 0;
6114
6115 // FIXME: Copy check is a hack
6116 Register BasePtr;
6117 if (mi_match(Reg, *MRI,
6118 m_GPtrAdd(m_Reg(BasePtr),
6119 m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
6120 if (!TII.isLegalMUBUFImmOffset(Offset))
6121 return {};
6122 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
6123 Register WaveBase = getWaveAddress(BasePtrDef);
6124 if (!WaveBase)
6125 return {};
6126
6127 return {{
6128 [=](MachineInstrBuilder &MIB) { // rsrc
6129 MIB.addReg(Info->getScratchRSrcReg());
6130 },
6131 [=](MachineInstrBuilder &MIB) { // soffset
6132 MIB.addReg(WaveBase);
6133 },
6134 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6135 }};
6136 }
6137
6138 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
6139 !TII.isLegalMUBUFImmOffset(Offset))
6140 return {};
6141
6142 return {{
6143 [=](MachineInstrBuilder &MIB) { // rsrc
6144 MIB.addReg(Info->getScratchRSrcReg());
6145 },
6146 [=](MachineInstrBuilder &MIB) { // soffset
6147 MIB.addImm(0);
6148 },
6149 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6150 }};
6151}
6152
6153std::pair<Register, unsigned>
6154AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
6155 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6156 int64_t ConstAddr = 0;
6157
6158 Register PtrBase;
6159 int64_t Offset;
6160 std::tie(PtrBase, Offset, std::ignore) =
6161 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6162
6163 if (Offset) {
6164 if (isDSOffsetLegal(PtrBase, Offset)) {
6165 // (add n0, c0)
6166 return std::pair(PtrBase, Offset);
6167 }
6168 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6169 // TODO
6170
6171
6172 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6173 // TODO
6174
6175 }
6176
6177 return std::pair(Root.getReg(), 0);
6178}
6179
6181AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
6182 Register Reg;
6183 unsigned Offset;
6184 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
6185 return {{
6186 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6187 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
6188 }};
6189}
6190
6192AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
6193 return selectDSReadWrite2(Root, 4);
6194}
6195
6197AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
6198 return selectDSReadWrite2(Root, 8);
6199}
6200
6202AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
6203 unsigned Size) const {
6204 Register Reg;
6205 unsigned Offset;
6206 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
6207 return {{
6208 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6209 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
6210 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
6211 }};
6212}
6213
6214std::pair<Register, unsigned>
6215AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
6216 unsigned Size) const {
6217 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6218 int64_t ConstAddr = 0;
6219
6220 Register PtrBase;
6221 int64_t Offset;
6222 std::tie(PtrBase, Offset, std::ignore) =
6223 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6224
6225 if (Offset) {
6226 int64_t OffsetValue0 = Offset;
6227 int64_t OffsetValue1 = Offset + Size;
6228 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
6229 // (add n0, c0)
6230 return std::pair(PtrBase, OffsetValue0 / Size);
6231 }
6232 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6233 // TODO
6234
6235 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6236 // TODO
6237
6238 }
6239
6240 return std::pair(Root.getReg(), 0);
6241}
6242
6243/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
6244/// the base value with the constant offset, and if the offset computation is
6245/// known to be inbounds. There may be intervening copies between \p Root and
6246/// the identified constant. Returns \p Root, 0, false if this does not match
6247/// the pattern.
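/// e.g. a Root defined as G_PTR_ADD %base, (G_CONSTANT 16) yields {%base, 16}
/// plus whether the add carried the InBounds flag.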
6248std::tuple<Register, int64_t, bool>
6249AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
6250 Register Root, const MachineRegisterInfo &MRI) const {
6251 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
6252 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
6253 return {Root, 0, false};
6254
6255 MachineOperand &RHS = RootI->getOperand(2);
6256 std::optional<ValueAndVReg> MaybeOffset =
6257 getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
6258 if (!MaybeOffset)
6259 return {Root, 0, false};
6260 bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds);
6261 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(),
6262 IsInBounds};
6263}
6264
6265static void addZeroImm(MachineInstrBuilder &MIB) {
6266 MIB.addImm(0);
6267}
6268
6269/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
6270/// BasePtr is not valid, a null base pointer will be used.
6271static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6272 uint32_t FormatLo, uint32_t FormatHi,
6273 Register BasePtr) {
6274 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6275 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6276 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6277 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6278
6279 B.buildInstr(AMDGPU::S_MOV_B32)
6280 .addDef(RSrc2)
6281 .addImm(FormatLo);
6282 B.buildInstr(AMDGPU::S_MOV_B32)
6283 .addDef(RSrc3)
6284 .addImm(FormatHi);
6285
6286 // Build the half of the subregister with the constants before building the
6287 // full 128-bit register. If we are building multiple resource descriptors,
6288 // this will allow CSEing of the 2-component register.
6289 B.buildInstr(AMDGPU::REG_SEQUENCE)
6290 .addDef(RSrcHi)
6291 .addReg(RSrc2)
6292 .addImm(AMDGPU::sub0)
6293 .addReg(RSrc3)
6294 .addImm(AMDGPU::sub1);
6295
6296 Register RSrcLo = BasePtr;
6297 if (!BasePtr) {
6298 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6299 B.buildInstr(AMDGPU::S_MOV_B64)
6300 .addDef(RSrcLo)
6301 .addImm(0);
6302 }
6303
6304 B.buildInstr(AMDGPU::REG_SEQUENCE)
6305 .addDef(RSrc)
6306 .addReg(RSrcLo)
6307 .addImm(AMDGPU::sub0_sub1)
6308 .addReg(RSrcHi)
6309 .addImm(AMDGPU::sub2_sub3);
6310
6311 return RSrc;
6312}
6313
6314static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6315 const SIInstrInfo &TII, Register BasePtr) {
6316 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6317
6318 // FIXME: Why are half the "default" bits ignored based on the addressing
6319 // mode?
6320 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
6321}
6322
6323static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6324 const SIInstrInfo &TII, Register BasePtr) {
6325 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6326
6327 // FIXME: Why are half the "default" bits ignored based on the addressing
6328 // mode?
6329 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
6330}
6331
6332AMDGPUInstructionSelector::MUBUFAddressData
6333AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
6334 MUBUFAddressData Data;
6335 Data.N0 = Src;
6336
6337 Register PtrBase;
6338 int64_t Offset;
6339
6340 std::tie(PtrBase, Offset, std::ignore) =
6341 getPtrBaseWithConstantOffset(Src, *MRI);
6342 if (isUInt<32>(Offset)) {
6343 Data.N0 = PtrBase;
6344 Data.Offset = Offset;
6345 }
6346
6347 if (MachineInstr *InputAdd
6348 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
6349 Data.N2 = InputAdd->getOperand(1).getReg();
6350 Data.N3 = InputAdd->getOperand(2).getReg();
6351
6352 // FIXME: Need to fix the extra SGPR->VGPR copies that get inserted
6353 // FIXME: Don't know if this was defined by operand 0
6354 //
6355 // TODO: Remove this when we have copy folding optimizations after
6356 // RegBankSelect.
6357 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
6358 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
6359 }
6360
6361 return Data;
6362}
6363
6364/// Return if the addr64 mubuf mode should be used for the given address.
6365bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
6366 // (ptr_add N2, N3) -> addr64, or
6367 // (ptr_add (ptr_add N2, N3), C1) -> addr64
6368 if (Addr.N2)
6369 return true;
6370
6371 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
6372 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
6373}
6374
6375/// Split an immediate offset \p ImmOffset depending on whether it fits in the
6376/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
6377/// component.
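/// e.g. if the immediate does not fit the MUBUF encoding, the whole value is
/// moved into a fresh SGPR for SOffset and ImmOffset is reset to 0, as below.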
6378void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6379 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
6380 if (TII.isLegalMUBUFImmOffset(ImmOffset))
6381 return;
6382
6383 // Illegal offset, store it in soffset.
6384 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6385 B.buildInstr(AMDGPU::S_MOV_B32)
6386 .addDef(SOffset)
6387 .addImm(ImmOffset);
6388 ImmOffset = 0;
6389}
6390
6391bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
6392 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
6393 Register &SOffset, int64_t &Offset) const {
6394 // FIXME: Predicates should stop this from reaching here.
6395 // addr64 bit was removed for volcanic islands.
6396 if (!STI.hasAddr64() || STI.useFlatForGlobal())
6397 return false;
6398
6399 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6400 if (!shouldUseAddr64(AddrData))
6401 return false;
6402
6403 Register N0 = AddrData.N0;
6404 Register N2 = AddrData.N2;
6405 Register N3 = AddrData.N3;
6406 Offset = AddrData.Offset;
6407
6408 // Base pointer for the SRD.
6409 Register SRDPtr;
6410
6411 if (N2) {
6412 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6413 assert(N3);
6414 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6415 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
6416 // addr64, and construct the default resource from a 0 address.
6417 VAddr = N0;
6418 } else {
6419 SRDPtr = N3;
6420 VAddr = N2;
6421 }
6422 } else {
6423 // N2 is not divergent.
6424 SRDPtr = N2;
6425 VAddr = N3;
6426 }
6427 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6428 // Use the default null pointer in the resource
6429 VAddr = N0;
6430 } else {
6431 // N0 -> offset, or
6432 // (N0 + C1) -> offset
6433 SRDPtr = N0;
6434 }
6435
6436 MachineIRBuilder B(*Root.getParent());
6437 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
6438 splitIllegalMUBUFOffset(B, SOffset, Offset);
6439 return true;
6440}
6441
6442bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
6443 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
6444 int64_t &Offset) const {
6445
6446 // FIXME: Pattern should not reach here.
6447 if (STI.useFlatForGlobal())
6448 return false;
6449
6450 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6451 if (shouldUseAddr64(AddrData))
6452 return false;
6453
6454 // N0 -> offset, or
6455 // (N0 + C1) -> offset
6456 Register SRDPtr = AddrData.N0;
6457 Offset = AddrData.Offset;
6458
6459 // TODO: Look through extensions for 32-bit soffset.
6460 MachineIRBuilder B(*Root.getParent());
6461
6462 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
6463 splitIllegalMUBUFOffset(B, SOffset, Offset);
6464 return true;
6465}
6466
6468AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
6469 Register VAddr;
6470 Register RSrcReg;
6471 Register SOffset;
6472 int64_t Offset = 0;
6473
6474 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
6475 return {};
6476
6477 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
6478 // pattern.
6479 return {{
6480 [=](MachineInstrBuilder &MIB) { // rsrc
6481 MIB.addReg(RSrcReg);
6482 },
6483 [=](MachineInstrBuilder &MIB) { // vaddr
6484 MIB.addReg(VAddr);
6485 },
6486 [=](MachineInstrBuilder &MIB) { // soffset
6487 if (SOffset)
6488 MIB.addReg(SOffset);
6489 else if (STI.hasRestrictedSOffset())
6490 MIB.addReg(AMDGPU::SGPR_NULL);
6491 else
6492 MIB.addImm(0);
6493 },
6494 [=](MachineInstrBuilder &MIB) { // offset
6495 MIB.addImm(Offset);
6496 },
6497 addZeroImm, // cpol
6498 addZeroImm, // tfe
6499 addZeroImm // swz
6500 }};
6501}
6502
6504AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
6505 Register RSrcReg;
6506 Register SOffset;
6507 int64_t Offset = 0;
6508
6509 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
6510 return {};
6511
6512 return {{
6513 [=](MachineInstrBuilder &MIB) { // rsrc
6514 MIB.addReg(RSrcReg);
6515 },
6516 [=](MachineInstrBuilder &MIB) { // soffset
6517 if (SOffset)
6518 MIB.addReg(SOffset);
6519 else if (STI.hasRestrictedSOffset())
6520 MIB.addReg(AMDGPU::SGPR_NULL);
6521 else
6522 MIB.addImm(0);
6523 },
6524 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
6525 addZeroImm, // cpol
6526 addZeroImm, // tfe
6527 addZeroImm, // swz
6528 }};
6529}
6530
6532AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
6533
6534 Register SOffset = Root.getReg();
6535
6536 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
6537 SOffset = AMDGPU::SGPR_NULL;
6538
6539 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
6540}
6541
6542/// Get an immediate that must be 32-bits, and treated as zero extended.
6543static std::optional<uint64_t>
6544getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
6545 // getIConstantVRegVal sexts any values, so see if that matters.
6546 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
6547 if (!OffsetVal || !isInt<32>(*OffsetVal))
6548 return std::nullopt;
6549 return Lo_32(*OffsetVal);
6550}
6551
6553AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
6554 std::optional<uint64_t> OffsetVal =
6555 Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
6556 if (!OffsetVal)
6557 return {};
6558
6559 std::optional<int64_t> EncodedImm =
6560 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
6561 if (!EncodedImm)
6562 return {};
6563
6564 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6565}
6566
6568AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
6570
6571 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
6572 if (!OffsetVal)
6573 return {};
6574
6575 std::optional<int64_t> EncodedImm =
6576 AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
6577 if (!EncodedImm)
6578 return {};
6579
6580 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6581}
6582
6584AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
6585 // Match the (soffset + offset) pair as a 32-bit register base and
6586 // an immediate offset.
6587 Register SOffset;
6588 unsigned Offset;
6589 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
6590 *MRI, Root.getReg(), VT, /*CheckNUW*/ true);
6591 if (!SOffset)
6592 return std::nullopt;
6593
6594 std::optional<int64_t> EncodedOffset =
6595 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
6596 if (!EncodedOffset)
6597 return std::nullopt;
6598
6599 assert(MRI->getType(SOffset) == LLT::scalar(32));
6600 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
6601 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
6602}
6603
6604std::pair<Register, unsigned>
6605AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
6606 bool &Matched) const {
6607 Matched = false;
6608
6609 Register Src;
6610 unsigned Mods;
6611 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
6612
6613 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
6614 assert(MRI->getType(Src) == LLT::scalar(16));
6615
6616 // Only change Src if a src modifier could be gained. In such cases the new
6617 // Src could be an SGPR, but this does not violate the constant bus
6618 // restriction for the instruction that is being selected.
6619 Src = stripBitCast(Src, *MRI);
6620
6621 const auto CheckAbsNeg = [&]() {
6622 // Be careful about folding modifiers if we already have an abs. fneg is
6623 // applied last, so we don't want to apply an earlier fneg.
6624 if ((Mods & SISrcMods::ABS) == 0) {
6625 unsigned ModsTmp;
6626 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
6627
6628 if ((ModsTmp & SISrcMods::NEG) != 0)
6629 Mods ^= SISrcMods::NEG;
6630
6631 if ((ModsTmp & SISrcMods::ABS) != 0)
6632 Mods |= SISrcMods::ABS;
6633 }
6634 };
6635
6636 CheckAbsNeg();
6637
6638 // op_sel/op_sel_hi decide the source type and source.
6639 // If the source's op_sel_hi is set, it indicates to do a conversion from
6640 // fp16. If the source's op_sel is set, it picks the high half of the
6641 // source register.
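// e.g. when the operand below turns out to be an extract_hi of a 32-bit
// register, OP_SEL_0 is set so the high half is read.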
6642
6643 Mods |= SISrcMods::OP_SEL_1;
6644
6645 if (isExtractHiElt(*MRI, Src, Src)) {
6646 Mods |= SISrcMods::OP_SEL_0;
6647 CheckAbsNeg();
6648 }
6649
6650 Matched = true;
6651 }
6652
6653 return {Src, Mods};
6654}
6655
6657AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
6658 MachineOperand &Root) const {
6659 Register Src;
6660 unsigned Mods;
6661 bool Matched;
6662 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
6663 if (!Matched)
6664 return {};
6665
6666 return {{
6667 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
6668 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
6669 }};
6670}
6671
6673AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
6674 Register Src;
6675 unsigned Mods;
6676 bool Matched;
6677 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
6678
6679 return {{
6680 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
6681 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
6682 }};
6683}
6684
6685bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
6686 MachineInstr &I, Intrinsic::ID IntrID) const {
6687 MachineBasicBlock *MBB = I.getParent();
6688 const DebugLoc &DL = I.getDebugLoc();
6689 Register CCReg = I.getOperand(0).getReg();
6690
6691 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6692 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);
6693
6694 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
6695 .addImm(I.getOperand(2).getImm());
6696
6697 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
6698
6699 I.eraseFromParent();
6700 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
6701 *MRI);
6702}
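// Annotation (not part of AMDGPUInstructionSelector.cpp): S_CMP_EQ_U32 0, 0
// always compares equal, so SCC is pre-set to 1. If the ISFIRST barrier signal
// is later relaxed to a NOP, the COPY above still reads "true" into CCReg, so
// the result defaults to "this wave is first".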
6703
6704bool AMDGPUInstructionSelector::selectSGetBarrierState(
6705 MachineInstr &I, Intrinsic::ID IntrID) const {
6706 MachineBasicBlock *MBB = I.getParent();
6707 const DebugLoc &DL = I.getDebugLoc();
6708 MachineOperand BarOp = I.getOperand(2);
6709 std::optional<int64_t> BarValImm =
6710 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
6711
6712 if (!BarValImm) {
6713 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
6714 .addReg(BarOp.getReg());
6715 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6716 }
6716
6717 MachineInstrBuilder MIB;
6718 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
6719 : AMDGPU::S_GET_BARRIER_STATE_M0;
6720 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6721
6722 auto DstReg = I.getOperand(0).getReg();
6723 const TargetRegisterClass *DstRC =
6724 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
6725 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
6726 return false;
6727 MIB.addDef(DstReg);
6728 if (BarValImm) {
6729 MIB.addImm(*BarValImm);
6730 }
6731 I.eraseFromParent();
6732 return true;
6733}
6734
6735unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
6736 if (HasInlineConst) {
6737 switch (IntrID) {
6738 default:
6739 llvm_unreachable("not a named barrier op");
6740 case Intrinsic::amdgcn_s_barrier_join:
6741 return AMDGPU::S_BARRIER_JOIN_IMM;
6742 case Intrinsic::amdgcn_s_get_named_barrier_state:
6743 return AMDGPU::S_GET_BARRIER_STATE_IMM;
6744 };
6745 } else {
6746 switch (IntrID) {
6747 default:
6748 llvm_unreachable("not a named barrier op");
6749 case Intrinsic::amdgcn_s_barrier_join:
6750 return AMDGPU::S_BARRIER_JOIN_M0;
6751 case Intrinsic::amdgcn_s_get_named_barrier_state:
6752 return AMDGPU::S_GET_BARRIER_STATE_M0;
6753 };
6754 }
6755}
6756
6757bool AMDGPUInstructionSelector::selectNamedBarrierInit(
6758 MachineInstr &I, Intrinsic::ID IntrID) const {
6759 MachineBasicBlock *MBB = I.getParent();
6760 const DebugLoc &DL = I.getDebugLoc();
6761 MachineOperand BarOp = I.getOperand(1);
6762 MachineOperand CntOp = I.getOperand(2);
6763
6764 // BarID = (BarOp >> 4) & 0x3F
6765 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6766 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
6767 .add(BarOp)
6768 .addImm(4u)
6769 .setOperandDead(3); // Dead scc
6770
6771 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6772 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
6773 .addReg(TmpReg0)
6774 .addImm(0x3F)
6775 .setOperandDead(3); // Dead scc
6776
6777 // MO = ((CntOp & 0x3F) << shAmt) | BarID
6778 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6779 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
6780 .add(CntOp)
6781 .addImm(0x3F)
6782 .setOperandDead(3); // Dead scc
6783
6784 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6785 constexpr unsigned ShAmt = 16;
6786 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
6787 .addReg(TmpReg2)
6788 .addImm(ShAmt)
6789 .setOperandDead(3); // Dead scc
6790
6791 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6792 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
6793 .addReg(TmpReg1)
6794 .addReg(TmpReg3)
6795 .setOperandDead(3); // Dead scc;
6796
6797 auto CopyMIB =
6798 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
6799 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6800
6801 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
6802 ? AMDGPU::S_BARRIER_INIT_M0
6803 : AMDGPU::S_BARRIER_SIGNAL_M0;
6804 MachineInstrBuilder MIB;
6805 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6806
6807 I.eraseFromParent();
6808 return true;
6809}
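// Annotation (not part of AMDGPUInstructionSelector.cpp): the M0 value built
// above, restated on plain integers. encodeNamedBarrierM0 is a hypothetical
// helper for illustration; the 6-bit field widths and the shift amount of 16
// follow the immediates used in the instructions above.
#include <cstdint>
static uint32_t encodeNamedBarrierM0(uint32_t BarOp, uint32_t CntOp) {
  uint32_t BarId = (BarOp >> 4) & 0x3F; // S_LSHR_B32 + S_AND_B32
  uint32_t Cnt = CntOp & 0x3F;          // S_AND_B32
  return (Cnt << 16) | BarId;           // S_LSHL_B32 + S_OR_B32
}
// e.g. encodeNamedBarrierM0(/*BarOp=*/0x25, /*CntOp=*/8) == 0x00080002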
6810
6811bool AMDGPUInstructionSelector::selectNamedBarrierInst(
6812 MachineInstr &I, Intrinsic::ID IntrID) const {
6813 MachineBasicBlock *MBB = I.getParent();
6814 const DebugLoc &DL = I.getDebugLoc();
6815 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
6816 ? I.getOperand(2)
6817 : I.getOperand(1);
6818 std::optional<int64_t> BarValImm =
6819 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
6820
6821 if (!BarValImm) {
6822 // BarID = (BarOp >> 4) & 0x3F
6823 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6824 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
6825 .addReg(BarOp.getReg())
6826 .addImm(4u)
6827 .setOperandDead(3); // Dead scc;
6828
6829 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6830 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
6831 .addReg(TmpReg0)
6832 .addImm(0x3F)
6833 .setOperandDead(3); // Dead scc;
6834
6835 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
6836 .addReg(TmpReg1);
6837 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6838 }
6839
6840 MachineInstrBuilder MIB;
6841 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
6842 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6843
6844 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
6845 auto DstReg = I.getOperand(0).getReg();
6846 const TargetRegisterClass *DstRC =
6847 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
6848 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
6849 return false;
6850 MIB.addDef(DstReg);
6851 }
6852
6853 if (BarValImm) {
6854 auto BarId = ((*BarValImm) >> 4) & 0x3F;
6855 MIB.addImm(BarId);
6856 }
6857
6858 I.eraseFromParent();
6859 return true;
6860}
6861
6862void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
6863 const MachineInstr &MI,
6864 int OpIdx) const {
6865 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6866 "Expected G_CONSTANT");
6867 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
6868}
6869
6870void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
6871 const MachineInstr &MI,
6872 int OpIdx) const {
6873 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6874 "Expected G_CONSTANT");
6875 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
6876}
6877
6878void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
6879 const MachineInstr &MI,
6880 int OpIdx) const {
6881 const MachineOperand &Op = MI.getOperand(1);
6882 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
6883 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
6884}
6885
6886void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
6887 const MachineInstr &MI,
6888 int OpIdx) const {
6889 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6890 "Expected G_CONSTANT");
6891 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
6892}
6893
6894/// This only really exists to satisfy DAG type checking machinery, so is a
6895/// no-op here.
6896void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
6897 const MachineInstr &MI,
6898 int OpIdx) const {
6899 const MachineOperand &Op = MI.getOperand(OpIdx);
6900 int64_t Imm;
6901 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
6902 MIB.addImm(Imm);
6903 else
6904 MIB.addImm(Op.getImm());
6905}
6906
6907void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
6908 const MachineInstr &MI,
6909 int OpIdx) const {
6910 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
6911}
6912
6913void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
6914 const MachineInstr &MI,
6915 int OpIdx) const {
6916 assert(OpIdx >= 0 && "expected to match an immediate operand");
6917 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6918}
6919
6920void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
6921 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6922 assert(OpIdx >= 0 && "expected to match an immediate operand");
6923 MIB.addImm(
6924 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6925}
6926
6927void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
6928 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6929 assert(OpIdx >= 0 && "expected to match an immediate operand");
6930 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
6931 ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
6932 : (int64_t)SISrcMods::DST_OP_SEL);
6933}
6934
6935void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
6936 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6937 assert(OpIdx >= 0 && "expected to match an immediate operand");
6938 MIB.addImm(
6939 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6940}
6941
6942void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
6943 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6944 assert(OpIdx >= 0 && "expected to match an immediate operand");
6945 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
6946 ? (int64_t)(SISrcMods::OP_SEL_0)
6947 : 0);
6948}
6949
6950void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
6951 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6952 assert(OpIdx >= 0 && "expected to match an immediate operand");
6953 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
6954 : 0);
6955}
6956
6957void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
6958 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6959 assert(OpIdx >= 0 && "expected to match an immediate operand");
6960 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
6961 : 0);
6962}
6963
6964void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
6965 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6966 assert(OpIdx >= 0 && "expected to match an immediate operand");
6967 MIB.addImm(
6968 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6969}
6970
6971void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
6972 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6973 assert(OpIdx >= 0 && "expected to match an immediate operand");
6974 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
6975 ? (int64_t)SISrcMods::DST_OP_SEL
6976 : 0);
6977}
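// Annotation (not part of AMDGPUInstructionSelector.cpp): the render*ToOpSel*
// helpers above share one pattern: test one bit of the packed immediate and
// emit the matching SISrcMods bit (OP_SEL_0 for a source select, DST_OP_SEL
// for a destination select). pickOpSelMod is a hypothetical restatement on
// plain integers, for illustration only.
#include <cstdint>
static int64_t pickOpSelMod(int64_t PackedImm, int64_t TestBit,
                            int64_t ModIfSet, int64_t ModIfClear = 0) {
  return (PackedImm & TestBit) ? ModIfSet : ModIfClear;
}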
6978
6979void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
6980 const MachineInstr &MI,
6981 int OpIdx) const {
6982 assert(OpIdx >= 0 && "expected to match an immediate operand");
6983 MIB.addImm(MI.getOperand(OpIdx).getImm() &
6984 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
6985 : AMDGPU::CPol::ALL_pregfx12));
6986}
6987
6988void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
6989 const MachineInstr &MI,
6990 int OpIdx) const {
6991 assert(OpIdx >= 0 && "expected to match an immediate operand");
6992 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
6993 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
6994 : AMDGPU::CPol::SWZ_pregfx12);
6995 MIB.addImm(Swizzle);
6996}
6997
6998void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
6999 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7000 assert(OpIdx >= 0 && "expected to match an immediate operand");
7001 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
7002 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7003 : AMDGPU::CPol::ALL_pregfx12);
7004 MIB.addImm(Cpol | AMDGPU::CPol::GLC);
7005}
7006
7007void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
7008 const MachineInstr &MI,
7009 int OpIdx) const {
7010 MIB.addFrameIndex(MI.getOperand(1).getIndex());
7011}
7012
7013void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
7014 const MachineInstr &MI,
7015 int OpIdx) const {
7016 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
7017 int ExpVal = APF.getExactLog2Abs();
7018 assert(ExpVal != INT_MIN);
7019 MIB.addImm(ExpVal);
7020}
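// Annotation (not part of AMDGPUInstructionSelector.cpp): a minimal sketch of
// the exponent extraction for a power-of-two FP immediate, using <cmath>
// instead of APFloat::getExactLog2Abs. exactLog2AbsSketch is a hypothetical
// helper for illustration only.
#include <cassert>
#include <cmath>
static int exactLog2AbsSketch(double V) {
  int Exp = 0;
  double Mant = std::frexp(std::fabs(V), &Exp); // |V| = Mant * 2^Exp, Mant in [0.5, 1)
  assert(Mant == 0.5 && "caller guarantees a power-of-two immediate");
  return Exp - 1; // 8.0 -> 3, 0.125 -> -3
}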
7021
7022void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
7023 const MachineInstr &MI,
7024 int OpIdx) const {
7025 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
7026 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
7027 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
7028 // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
7029 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
7030}
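// Annotation (not part of AMDGPUInstructionSelector.cpp): the (Imm + 3) % 4
// remapping above, checked against the table in the comment.
// fpRoundFieldForSketch is a hypothetical name for illustration only.
constexpr unsigned fpRoundFieldForSketch(unsigned TowardsEnum) {
  return (TowardsEnum + 3) % 4;
}
static_assert(fpRoundFieldForSketch(0) == 3 && fpRoundFieldForSketch(1) == 0 &&
                  fpRoundFieldForSketch(2) == 1 && fpRoundFieldForSketch(3) == 2,
              "TowardZero->3, NearestTiesToEven->0, TowardPositive->1, "
              "TowardNegative->2");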
7031
7032void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
7033 const MachineInstr &MI,
7034 int OpIdx) const {
7035 unsigned Mods = SISrcMods::OP_SEL_1;
7036 if (MI.getOperand(OpIdx).getImm())
7037 Mods ^= SISrcMods::NEG;
7038 MIB.addImm((int64_t)Mods);
7039}
7040
7041void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
7042 const MachineInstr &MI,
7043 int OpIdx) const {
7044 unsigned Mods = SISrcMods::OP_SEL_1;
7045 if (MI.getOperand(OpIdx).getImm())
7046 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
7047 MIB.addImm((int64_t)Mods);
7048}
7049
7050void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
7051 const MachineInstr &MI,
7052 int OpIdx) const {
7053 unsigned Val = MI.getOperand(OpIdx).getImm();
7054 unsigned Mods = SISrcMods::OP_SEL_1; // default: none
7055 if (Val == 1) // neg
7056 Mods ^= SISrcMods::NEG;
7057 if (Val == 2) // abs
7058 Mods ^= SISrcMods::ABS;
7059 if (Val == 3) // neg and abs
7060 Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
7061 MIB.addImm((int64_t)Mods);
7062}
7063
7064void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
7065 const MachineInstr &MI,
7066 int OpIdx) const {
7067 uint32_t V = MI.getOperand(2).getImm();
7070 if (!Subtarget->hasSafeCUPrefetch())
7071 V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
7072 MIB.addImm(V);
7073}
7074
7075/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
7076void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
7077 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7078 unsigned Val = MI.getOperand(OpIdx).getImm();
7079 unsigned New = 0;
7080 if (Val & 0x1)
7082 if (Val & 0x2)
7084 MIB.addImm(New);
7085}
7086
7087bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
7088 return TII.isInlineConstant(Imm);
7089}
7090
7091bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
7092 return TII.isInlineConstant(Imm);
7093}