// Extracted from the LLVM 21.0.0git doxygen source view of
// AMDGPUInstructionSelector.cpp. The scraped line-number prefixes and page
// chrome have been removed.
1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
15#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
49#include "AMDGPUGenGlobalISel.inc"
52#include "AMDGPUGenGlobalISel.inc"
54{
55}
56
57const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
60 CodeGenCoverage *CoverageInfo,
62 BlockFrequencyInfo *BFI) {
63 MRI = &MF.getRegInfo();
64 Subtarget = &MF.getSubtarget<GCNSubtarget>();
67}
68
69// Return the wave level SGPR base address if this is a wave address.
71 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
72 ? Def->getOperand(1).getReg()
73 : Register();
74}
75
76bool AMDGPUInstructionSelector::isVCC(Register Reg,
77 const MachineRegisterInfo &MRI) const {
78 // The verifier is oblivious to s1 being a valid value for wavesize registers.
79 if (Reg.isPhysical())
80 return false;
81
82 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
83 const TargetRegisterClass *RC =
84 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
85 if (RC) {
86 const LLT Ty = MRI.getType(Reg);
87 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
88 return false;
89 // G_TRUNC s1 result is never vcc.
90 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
91 RC->hasSuperClassEq(TRI.getBoolRC());
92 }
93
94 const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
95 return RB->getID() == AMDGPU::VCCRegBankID;
96}
97
98bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
99 unsigned NewOpc) const {
100 MI.setDesc(TII.get(NewOpc));
101 MI.removeOperand(1); // Remove intrinsic ID.
102 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
103
104 MachineOperand &Dst = MI.getOperand(0);
105 MachineOperand &Src = MI.getOperand(1);
106
107 // TODO: This should be legalized to s32 if needed
108 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
109 return false;
110
111 const TargetRegisterClass *DstRC
112 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
113 const TargetRegisterClass *SrcRC
114 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
115 if (!DstRC || DstRC != SrcRC)
116 return false;
117
118 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
119 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
120}
121
122bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
123 const DebugLoc &DL = I.getDebugLoc();
124 MachineBasicBlock *BB = I.getParent();
125 I.setDesc(TII.get(TargetOpcode::COPY));
126
127 const MachineOperand &Src = I.getOperand(1);
128 MachineOperand &Dst = I.getOperand(0);
129 Register DstReg = Dst.getReg();
130 Register SrcReg = Src.getReg();
131
132 if (isVCC(DstReg, *MRI)) {
133 if (SrcReg == AMDGPU::SCC) {
134 const TargetRegisterClass *RC
135 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
136 if (!RC)
137 return true;
138 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
139 }
140
141 if (!isVCC(SrcReg, *MRI)) {
142 // TODO: Should probably leave the copy and let copyPhysReg expand it.
143 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
144 return false;
145
146 const TargetRegisterClass *SrcRC
147 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
148
149 std::optional<ValueAndVReg> ConstVal =
150 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
151 if (ConstVal) {
152 unsigned MovOpc =
153 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
154 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
155 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
156 } else {
157 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
158
159 // We can't trust the high bits at this point, so clear them.
160
161 // TODO: Skip masking high bits if def is known boolean.
162
163 if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
164 assert(Subtarget->useRealTrue16Insts());
165 const int64_t NoMods = 0;
166 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
167 .addImm(NoMods)
168 .addImm(1)
169 .addImm(NoMods)
170 .addReg(SrcReg)
171 .addImm(NoMods);
172 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
173 .addImm(NoMods)
174 .addImm(0)
175 .addImm(NoMods)
176 .addReg(MaskedReg)
177 .addImm(NoMods);
178 } else {
179 bool IsSGPR = TRI.isSGPRClass(SrcRC);
180 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
181 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
182 .addImm(1)
183 .addReg(SrcReg);
184 if (IsSGPR)
185 And.setOperandDead(3); // Dead scc
186
187 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
188 .addImm(0)
189 .addReg(MaskedReg);
190 }
191 }
192
193 if (!MRI->getRegClassOrNull(SrcReg))
194 MRI->setRegClass(SrcReg, SrcRC);
195 I.eraseFromParent();
196 return true;
197 }
198
199 const TargetRegisterClass *RC =
201 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
202 return false;
203
204 return true;
205 }
206
207 for (const MachineOperand &MO : I.operands()) {
208 if (MO.getReg().isPhysical())
209 continue;
210
211 const TargetRegisterClass *RC =
213 if (!RC)
214 continue;
215 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
216 }
217 return true;
218}
219
220bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
221 const DebugLoc &DL = I.getDebugLoc();
222 MachineBasicBlock *BB = I.getParent();
223
224 unsigned CmpOpc =
225 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
226 MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc))
227 .addReg(I.getOperand(1).getReg())
228 .addImm(0);
229 if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
230 return false;
231
232 Register DstReg = I.getOperand(0).getReg();
233 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
234
235 I.eraseFromParent();
236 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
237}
238
239bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
240 const DebugLoc &DL = I.getDebugLoc();
241 MachineBasicBlock *BB = I.getParent();
242
243 Register DstReg = I.getOperand(0).getReg();
244 Register SrcReg = I.getOperand(1).getReg();
245 std::optional<ValueAndVReg> Arg =
246 getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
247
248 if (Arg) {
249 const int64_t Value = Arg->Value.getZExtValue();
250 if (Value == 0) {
251 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
252 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
253 } else {
254 assert(Value == 1);
255 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
256 }
257 I.eraseFromParent();
258 return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
259 }
260
261 // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
262 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
263
264 unsigned SelectOpcode =
265 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
266 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
267 .addReg(TRI.getExec())
268 .addImm(0);
269
270 I.eraseFromParent();
271 return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
272}
273
274bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
275 Register DstReg = I.getOperand(0).getReg();
276 Register SrcReg = I.getOperand(1).getReg();
277
278 const DebugLoc &DL = I.getDebugLoc();
279 MachineBasicBlock *BB = I.getParent();
280
281 auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
282 .addReg(SrcReg);
283
284 I.eraseFromParent();
285 return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
286}
287
288bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
289 const Register DefReg = I.getOperand(0).getReg();
290 const LLT DefTy = MRI->getType(DefReg);
291
292 // S1 G_PHIs should not be selected in instruction-select, instead:
293 // - divergent S1 G_PHI should go through lane mask merging algorithm
294 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
295 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
296 if (DefTy == LLT::scalar(1))
297 return false;
298
299 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
300
301 const RegClassOrRegBank &RegClassOrBank =
302 MRI->getRegClassOrRegBank(DefReg);
303
304 const TargetRegisterClass *DefRC =
305 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
306 if (!DefRC) {
307 if (!DefTy.isValid()) {
308 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
309 return false;
310 }
311
312 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
313 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
314 if (!DefRC) {
315 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
316 return false;
317 }
318 }
319
320 // If inputs have register bank, assign corresponding reg class.
321 // Note: registers don't need to have the same reg bank.
322 for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
323 const Register SrcReg = I.getOperand(i).getReg();
324
325 const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
326 if (RB) {
327 const LLT SrcTy = MRI->getType(SrcReg);
328 const TargetRegisterClass *SrcRC =
329 TRI.getRegClassForTypeOnBank(SrcTy, *RB);
330 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
331 return false;
332 }
333 }
334
335 I.setDesc(TII.get(TargetOpcode::PHI));
336 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
337}
338
340AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
341 const TargetRegisterClass &SubRC,
342 unsigned SubIdx) const {
343
344 MachineInstr *MI = MO.getParent();
346 Register DstReg = MRI->createVirtualRegister(&SubRC);
347
348 if (MO.isReg()) {
349 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
350 Register Reg = MO.getReg();
351 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
352 .addReg(Reg, 0, ComposedSubIdx);
353
354 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
355 MO.isKill(), MO.isDead(), MO.isUndef(),
356 MO.isEarlyClobber(), 0, MO.isDebug(),
357 MO.isInternalRead());
358 }
359
360 assert(MO.isImm());
361
362 APInt Imm(64, MO.getImm());
363
364 switch (SubIdx) {
365 default:
366 llvm_unreachable("do not know to split immediate with this sub index.");
367 case AMDGPU::sub0:
368 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
369 case AMDGPU::sub1:
370 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
371 }
372}
373
374static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
375 switch (Opc) {
376 case AMDGPU::G_AND:
377 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
378 case AMDGPU::G_OR:
379 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
380 case AMDGPU::G_XOR:
381 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
382 default:
383 llvm_unreachable("not a bit op");
384 }
385}
386
387bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
388 Register DstReg = I.getOperand(0).getReg();
389 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
390
391 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
392 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
393 DstRB->getID() != AMDGPU::VCCRegBankID)
394 return false;
395
396 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
397 STI.isWave64());
398 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
399
400 // Dead implicit-def of scc
401 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
402 true, // isImp
403 false, // isKill
404 true)); // isDead
405 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
406}
407
408bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
409 MachineBasicBlock *BB = I.getParent();
411 Register DstReg = I.getOperand(0).getReg();
412 const DebugLoc &DL = I.getDebugLoc();
413 LLT Ty = MRI->getType(DstReg);
414 if (Ty.isVector())
415 return false;
416
417 unsigned Size = Ty.getSizeInBits();
418 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
419 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
420 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
421
422 if (Size == 32) {
423 if (IsSALU) {
424 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
426 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
427 .add(I.getOperand(1))
428 .add(I.getOperand(2))
429 .setOperandDead(3); // Dead scc
430 I.eraseFromParent();
431 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
432 }
433
434 if (STI.hasAddNoCarry()) {
435 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
436 I.setDesc(TII.get(Opc));
437 I.addOperand(*MF, MachineOperand::CreateImm(0));
438 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
439 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
440 }
441
442 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
443
444 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
446 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
447 .addDef(UnusedCarry, RegState::Dead)
448 .add(I.getOperand(1))
449 .add(I.getOperand(2))
450 .addImm(0);
451 I.eraseFromParent();
452 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
453 }
454
455 assert(!Sub && "illegal sub should not reach here");
456
457 const TargetRegisterClass &RC
458 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
459 const TargetRegisterClass &HalfRC
460 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
461
462 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
463 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
464 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
465 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
466
467 Register DstLo = MRI->createVirtualRegister(&HalfRC);
468 Register DstHi = MRI->createVirtualRegister(&HalfRC);
469
470 if (IsSALU) {
471 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
472 .add(Lo1)
473 .add(Lo2);
474 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
475 .add(Hi1)
476 .add(Hi2)
477 .setOperandDead(3); // Dead scc
478 } else {
479 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
480 Register CarryReg = MRI->createVirtualRegister(CarryRC);
481 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
482 .addDef(CarryReg)
483 .add(Lo1)
484 .add(Lo2)
485 .addImm(0);
486 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
487 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
488 .add(Hi1)
489 .add(Hi2)
490 .addReg(CarryReg, RegState::Kill)
491 .addImm(0);
492
493 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
494 return false;
495 }
496
497 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
498 .addReg(DstLo)
499 .addImm(AMDGPU::sub0)
500 .addReg(DstHi)
501 .addImm(AMDGPU::sub1);
502
503
504 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
505 return false;
506
507 I.eraseFromParent();
508 return true;
509}
510
511bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
512 MachineInstr &I) const {
513 MachineBasicBlock *BB = I.getParent();
515 const DebugLoc &DL = I.getDebugLoc();
516 Register Dst0Reg = I.getOperand(0).getReg();
517 Register Dst1Reg = I.getOperand(1).getReg();
518 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
519 I.getOpcode() == AMDGPU::G_UADDE;
520 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
521 I.getOpcode() == AMDGPU::G_USUBE;
522
523 if (isVCC(Dst1Reg, *MRI)) {
524 unsigned NoCarryOpc =
525 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
526 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
527 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
528 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
529 I.addOperand(*MF, MachineOperand::CreateImm(0));
530 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
531 }
532
533 Register Src0Reg = I.getOperand(2).getReg();
534 Register Src1Reg = I.getOperand(3).getReg();
535
536 if (HasCarryIn) {
537 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
538 .addReg(I.getOperand(4).getReg());
539 }
540
541 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
542 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
543
544 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
545 .add(I.getOperand(2))
546 .add(I.getOperand(3));
547
548 if (MRI->use_nodbg_empty(Dst1Reg)) {
549 CarryInst.setOperandDead(3); // Dead scc
550 } else {
551 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
552 .addReg(AMDGPU::SCC);
553 if (!MRI->getRegClassOrNull(Dst1Reg))
554 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
555 }
556
557 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
558 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
559 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
560 return false;
561
562 if (HasCarryIn &&
563 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
564 AMDGPU::SReg_32RegClass, *MRI))
565 return false;
566
567 I.eraseFromParent();
568 return true;
569}
570
571bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
572 MachineInstr &I) const {
573 MachineBasicBlock *BB = I.getParent();
575 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
576
577 unsigned Opc;
578 if (Subtarget->hasMADIntraFwdBug())
579 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
580 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
581 else
582 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
583 I.setDesc(TII.get(Opc));
584 I.addOperand(*MF, MachineOperand::CreateImm(0));
585 I.addImplicitDefUseOperands(*MF);
586 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
587}
588
589// TODO: We should probably legalize these to only using 32-bit results.
590bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
591 MachineBasicBlock *BB = I.getParent();
592 Register DstReg = I.getOperand(0).getReg();
593 Register SrcReg = I.getOperand(1).getReg();
594 LLT DstTy = MRI->getType(DstReg);
595 LLT SrcTy = MRI->getType(SrcReg);
596 const unsigned SrcSize = SrcTy.getSizeInBits();
597 unsigned DstSize = DstTy.getSizeInBits();
598
599 // TODO: Should handle any multiple of 32 offset.
600 unsigned Offset = I.getOperand(2).getImm();
601 if (Offset % 32 != 0 || DstSize > 128)
602 return false;
603
604 // 16-bit operations really use 32-bit registers.
605 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
606 if (DstSize == 16)
607 DstSize = 32;
608
609 const TargetRegisterClass *DstRC =
610 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
611 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
612 return false;
613
614 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
615 const TargetRegisterClass *SrcRC =
616 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
617 if (!SrcRC)
618 return false;
620 DstSize / 32);
621 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
622 if (!SrcRC)
623 return false;
624
625 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
626 *SrcRC, I.getOperand(1));
627 const DebugLoc &DL = I.getDebugLoc();
628 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
629 .addReg(SrcReg, 0, SubReg);
630
631 I.eraseFromParent();
632 return true;
633}
634
635bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
636 MachineBasicBlock *BB = MI.getParent();
637 Register DstReg = MI.getOperand(0).getReg();
638 LLT DstTy = MRI->getType(DstReg);
639 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
640
641 const unsigned SrcSize = SrcTy.getSizeInBits();
642 if (SrcSize < 32)
643 return selectImpl(MI, *CoverageInfo);
644
645 const DebugLoc &DL = MI.getDebugLoc();
646 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
647 const unsigned DstSize = DstTy.getSizeInBits();
648 const TargetRegisterClass *DstRC =
649 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
650 if (!DstRC)
651 return false;
652
653 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
655 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
656 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
657 MachineOperand &Src = MI.getOperand(I + 1);
658 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
659 MIB.addImm(SubRegs[I]);
660
661 const TargetRegisterClass *SrcRC
662 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
663 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
664 return false;
665 }
666
667 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
668 return false;
669
670 MI.eraseFromParent();
671 return true;
672}
673
674bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
675 MachineBasicBlock *BB = MI.getParent();
676 const int NumDst = MI.getNumOperands() - 1;
677
678 MachineOperand &Src = MI.getOperand(NumDst);
679
680 Register SrcReg = Src.getReg();
681 Register DstReg0 = MI.getOperand(0).getReg();
682 LLT DstTy = MRI->getType(DstReg0);
683 LLT SrcTy = MRI->getType(SrcReg);
684
685 const unsigned DstSize = DstTy.getSizeInBits();
686 const unsigned SrcSize = SrcTy.getSizeInBits();
687 const DebugLoc &DL = MI.getDebugLoc();
688 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
689
690 const TargetRegisterClass *SrcRC =
691 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
692 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
693 return false;
694
695 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
696 // source, and this relies on the fact that the same subregister indices are
697 // used for both.
698 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
699 for (int I = 0, E = NumDst; I != E; ++I) {
700 MachineOperand &Dst = MI.getOperand(I);
701 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
702 .addReg(SrcReg, 0, SubRegs[I]);
703
704 // Make sure the subregister index is valid for the source register.
705 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
706 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
707 return false;
708
709 const TargetRegisterClass *DstRC =
711 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
712 return false;
713 }
714
715 MI.eraseFromParent();
716 return true;
717}
718
/// Select G_BUILD_VECTOR / G_BUILD_VECTOR_TRUNC.
///
/// Wide-element G_BUILD_VECTOR (>= 32-bit sources) is forwarded to
/// selectG_MERGE_VALUES. Everything below handles only v2s16 results, in
/// order: constant folding into a single move, the TableGen patterns,
/// build_vector-with-undef as a plain copy, a VALU AND+LSHL_OR sequence, and
/// finally the scalar S_PACK_* forms.
bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
  assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
         MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  LLT SrcTy = MRI->getType(Src0);
  const unsigned SrcSize = SrcTy.getSizeInBits();

  // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
    return selectG_MERGE_VALUES(MI);
  }

  // Selection logic below is for V2S16 only.
  // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
      (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
       SrcTy != LLT::scalar(32)))
    return selectImpl(MI, *CoverageInfo);

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  // AGPR destinations are not handled by this path.
  if (DstBank->getID() == AMDGPU::AGPRRegBankID)
    return false;

  assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
         DstBank->getID() == AMDGPU::VGPRRegBankID);
  const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  // First, before trying TableGen patterns, check if both sources are
  // constants. In those cases, we can trivially compute the final constant
  // and emit a simple move.
  auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 =
        getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
    if (ConstSrc0) {
      const int64_t K0 = ConstSrc0->Value.getSExtValue();
      const int64_t K1 = ConstSrc1->Value.getSExtValue();
      // Pack the two constants into one 32-bit immediate: Src0 in the low
      // half, Src1 in the high half.
      uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
      uint32_t Imm = Lo16 | (Hi16 << 16);

      // VALU
      if (IsVector) {
        BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
        MI.eraseFromParent();
        return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
      }

      // SALU
      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // Now try TableGen patterns.
  if (selectImpl(MI, *CoverageInfo))
    return true;

  // TODO: This should probably be a combine somewhere
  // (build_vector $src0, undef) -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.removeOperand(2);
    const auto &RC =
        IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
    return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
           RBI.constrainGenericRegister(Src0, RC, *MRI);
  }

  // TODO: Can be improved?
  if (IsVector) {
    // VALU fallback: mask Src0 down to its low 16 bits, then OR in Src1
    // shifted into the high half.
    Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
                   .addImm(0xFFFF)
                   .addReg(Src0);
    if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
      return false;

    MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
              .addReg(Src1)
              .addImm(16)
              .addReg(TmpReg);
    if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
      return false;

    MI.eraseFromParent();
    return true;
  }

  Register ShiftSrc0;
  Register ShiftSrc1;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
  //  => (S_PACK_HL_B32_B16 $src0, $src1)
  // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  bool Shift0 = mi_match(
      Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));

  bool Shift1 = mi_match(
      Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0) {
    auto ConstSrc1 =
        getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
    if (ConstSrc1 && ConstSrc1->Value == 0) {
      // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
      auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
                     .addReg(ShiftSrc0)
                     .addImm(16)
                     .setOperandDead(3); // Dead scc

      MI.eraseFromParent();
      return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
    }
    if (STI.hasSPackHL()) {
      Opc = AMDGPU::S_PACK_HL_B32_B16;
      MI.getOperand(1).setReg(ShiftSrc0);
    }
  }

  MI.setDesc(TII.get(Opc));
  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}
867
868bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
869 const MachineOperand &MO = I.getOperand(0);
870
871 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
872 // regbank check here is to know why getConstrainedRegClassForOperand failed.
874 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
875 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
876 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
877 return true;
878 }
879
880 return false;
881}
882
883bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
884 MachineBasicBlock *BB = I.getParent();
885
886 Register DstReg = I.getOperand(0).getReg();
887 Register Src0Reg = I.getOperand(1).getReg();
888 Register Src1Reg = I.getOperand(2).getReg();
889 LLT Src1Ty = MRI->getType(Src1Reg);
890
891 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
892 unsigned InsSize = Src1Ty.getSizeInBits();
893
894 int64_t Offset = I.getOperand(3).getImm();
895
896 // FIXME: These cases should have been illegal and unnecessary to check here.
897 if (Offset % 32 != 0 || InsSize % 32 != 0)
898 return false;
899
900 // Currently not handled by getSubRegFromChannel.
901 if (InsSize > 128)
902 return false;
903
904 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
905 if (SubReg == AMDGPU::NoSubRegister)
906 return false;
907
908 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
909 const TargetRegisterClass *DstRC =
910 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
911 if (!DstRC)
912 return false;
913
914 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
915 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
916 const TargetRegisterClass *Src0RC =
917 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
918 const TargetRegisterClass *Src1RC =
919 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
920
921 // Deal with weird cases where the class only partially supports the subreg
922 // index.
923 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
924 if (!Src0RC || !Src1RC)
925 return false;
926
927 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
928 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
929 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
930 return false;
931
932 const DebugLoc &DL = I.getDebugLoc();
933 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
934 .addReg(Src0Reg)
935 .addReg(Src1Reg)
936 .addImm(SubReg);
937
938 I.eraseFromParent();
939 return true;
940}
941
942bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
943 Register DstReg = MI.getOperand(0).getReg();
944 Register SrcReg = MI.getOperand(1).getReg();
945 Register OffsetReg = MI.getOperand(2).getReg();
946 Register WidthReg = MI.getOperand(3).getReg();
947
948 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
949 "scalar BFX instructions are expanded in regbankselect");
950 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
951 "64-bit vector BFX instructions are expanded in regbankselect");
952
953 const DebugLoc &DL = MI.getDebugLoc();
954 MachineBasicBlock *MBB = MI.getParent();
955
956 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
957 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
958 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
959 .addReg(SrcReg)
960 .addReg(OffsetReg)
961 .addReg(WidthReg);
962 MI.eraseFromParent();
963 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
964}
965
966bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
967 if (STI.getLDSBankCount() != 16)
968 return selectImpl(MI, *CoverageInfo);
969
970 Register Dst = MI.getOperand(0).getReg();
971 Register Src0 = MI.getOperand(2).getReg();
972 Register M0Val = MI.getOperand(6).getReg();
973 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
974 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
975 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
976 return false;
977
978 // This requires 2 instructions. It is possible to write a pattern to support
979 // this, but the generated isel emitter doesn't correctly deal with multiple
980 // output instructions using the same physical register input. The copy to m0
981 // is incorrectly placed before the second instruction.
982 //
983 // TODO: Match source modifiers.
984
985 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
986 const DebugLoc &DL = MI.getDebugLoc();
987 MachineBasicBlock *MBB = MI.getParent();
988
989 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
990 .addReg(M0Val);
991 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
992 .addImm(2)
993 .addImm(MI.getOperand(4).getImm()) // $attr
994 .addImm(MI.getOperand(3).getImm()); // $attrchan
995
996 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
997 .addImm(0) // $src0_modifiers
998 .addReg(Src0) // $src0
999 .addImm(MI.getOperand(4).getImm()) // $attr
1000 .addImm(MI.getOperand(3).getImm()) // $attrchan
1001 .addImm(0) // $src2_modifiers
1002 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
1003 .addImm(MI.getOperand(5).getImm()) // $high
1004 .addImm(0) // $clamp
1005 .addImm(0); // $omod
1006
1007 MI.eraseFromParent();
1008 return true;
1009}
1010
1011// Writelane is special in that it can use SGPR and M0 (which would normally
1012// count as using the constant bus twice - but in this case it is allowed since
1013// the lane selector doesn't count as a use of the constant bus). However, it is
1014// still required to abide by the 1 SGPR rule. Fix this up if we might have
1015// multiple SGPRs.
1016bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1017 // With a constant bus limit of at least 2, there's no issue.
1018 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
1019 return selectImpl(MI, *CoverageInfo);
1020
1021 MachineBasicBlock *MBB = MI.getParent();
1022 const DebugLoc &DL = MI.getDebugLoc();
1023 Register VDst = MI.getOperand(0).getReg();
1024 Register Val = MI.getOperand(2).getReg();
1025 Register LaneSelect = MI.getOperand(3).getReg();
1026 Register VDstIn = MI.getOperand(4).getReg();
1027
1028 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1029
1030 std::optional<ValueAndVReg> ConstSelect =
1031 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
1032 if (ConstSelect) {
1033 // The selector has to be an inline immediate, so we can use whatever for
1034 // the other operands.
1035 MIB.addReg(Val);
1036 MIB.addImm(ConstSelect->Value.getSExtValue() &
1037 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
1038 } else {
1039 std::optional<ValueAndVReg> ConstVal =
1041
1042 // If the value written is an inline immediate, we can get away without a
1043 // copy to m0.
1044 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
1045 STI.hasInv2PiInlineImm())) {
1046 MIB.addImm(ConstVal->Value.getSExtValue());
1047 MIB.addReg(LaneSelect);
1048 } else {
1049 MIB.addReg(Val);
1050
1051 // If the lane selector was originally in a VGPR and copied with
1052 // readfirstlane, there's a hazard to read the same SGPR from the
1053 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1054 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1055
1056 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1057 .addReg(LaneSelect);
1058 MIB.addReg(AMDGPU::M0);
1059 }
1060 }
1061
1062 MIB.addReg(VDstIn);
1063
1064 MI.eraseFromParent();
1065 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1066}
1067
1068// We need to handle this here because tablegen doesn't support matching
1069// instructions with multiple outputs.
1070bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1071 Register Dst0 = MI.getOperand(0).getReg();
1072 Register Dst1 = MI.getOperand(1).getReg();
1073
1074 LLT Ty = MRI->getType(Dst0);
1075 unsigned Opc;
1076 if (Ty == LLT::scalar(32))
1077 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1078 else if (Ty == LLT::scalar(64))
1079 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1080 else
1081 return false;
1082
1083 // TODO: Match source modifiers.
1084
1085 const DebugLoc &DL = MI.getDebugLoc();
1086 MachineBasicBlock *MBB = MI.getParent();
1087
1088 Register Numer = MI.getOperand(3).getReg();
1089 Register Denom = MI.getOperand(4).getReg();
1090 unsigned ChooseDenom = MI.getOperand(5).getImm();
1091
1092 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1093
1094 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1095 .addDef(Dst1)
1096 .addImm(0) // $src0_modifiers
1097 .addUse(Src0) // $src0
1098 .addImm(0) // $src1_modifiers
1099 .addUse(Denom) // $src1
1100 .addImm(0) // $src2_modifiers
1101 .addUse(Numer) // $src2
1102 .addImm(0) // $clamp
1103 .addImm(0); // $omod
1104
1105 MI.eraseFromParent();
1106 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1107}
1108
1109bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1110 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1111 switch (IntrinsicID) {
1112 case Intrinsic::amdgcn_if_break: {
1113 MachineBasicBlock *BB = I.getParent();
1114
1115 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1116 // SelectionDAG uses for wave32 vs wave64.
1117 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1118 .add(I.getOperand(0))
1119 .add(I.getOperand(2))
1120 .add(I.getOperand(3));
1121
1122 Register DstReg = I.getOperand(0).getReg();
1123 Register Src0Reg = I.getOperand(2).getReg();
1124 Register Src1Reg = I.getOperand(3).getReg();
1125
1126 I.eraseFromParent();
1127
1128 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1129 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1130
1131 return true;
1132 }
1133 case Intrinsic::amdgcn_interp_p1_f16:
1134 return selectInterpP1F16(I);
1135 case Intrinsic::amdgcn_wqm:
1136 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1137 case Intrinsic::amdgcn_softwqm:
1138 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1139 case Intrinsic::amdgcn_strict_wwm:
1140 case Intrinsic::amdgcn_wwm:
1141 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1142 case Intrinsic::amdgcn_strict_wqm:
1143 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1144 case Intrinsic::amdgcn_writelane:
1145 return selectWritelane(I);
1146 case Intrinsic::amdgcn_div_scale:
1147 return selectDivScale(I);
1148 case Intrinsic::amdgcn_icmp:
1149 case Intrinsic::amdgcn_fcmp:
1150 if (selectImpl(I, *CoverageInfo))
1151 return true;
1152 return selectIntrinsicCmp(I);
1153 case Intrinsic::amdgcn_ballot:
1154 return selectBallot(I);
1155 case Intrinsic::amdgcn_reloc_constant:
1156 return selectRelocConstant(I);
1157 case Intrinsic::amdgcn_groupstaticsize:
1158 return selectGroupStaticSize(I);
1159 case Intrinsic::returnaddress:
1160 return selectReturnAddress(I);
1161 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1162 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1163 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1164 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1165 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1166 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1167 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1168 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1169 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1170 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1171 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1172 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1173 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1174 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1175 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1176 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1177 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1178 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1179 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1180 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1181 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1182 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1183 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1184 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1185 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1186 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1187 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1188 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1189 return selectSMFMACIntrin(I);
1190 case Intrinsic::amdgcn_permlane16_swap:
1191 case Intrinsic::amdgcn_permlane32_swap:
1192 return selectPermlaneSwapIntrin(I, IntrinsicID);
1193 default:
1194 return selectImpl(I, *CoverageInfo);
1195 }
1196}
1197
1199 const GCNSubtarget &ST) {
1200 if (Size != 16 && Size != 32 && Size != 64)
1201 return -1;
1202
1203 if (Size == 16 && !ST.has16BitInsts())
1204 return -1;
1205
1206 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1207 unsigned FakeS16Opc, unsigned S32Opc,
1208 unsigned S64Opc) {
1209 if (Size == 16)
1210 return ST.hasTrue16BitInsts()
1211 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1212 : S16Opc;
1213 if (Size == 32)
1214 return S32Opc;
1215 return S64Opc;
1216 };
1217
1218 switch (P) {
1219 default:
1220 llvm_unreachable("Unknown condition code!");
1221 case CmpInst::ICMP_NE:
1222 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1223 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1224 AMDGPU::V_CMP_NE_U64_e64);
1225 case CmpInst::ICMP_EQ:
1226 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1227 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1228 AMDGPU::V_CMP_EQ_U64_e64);
1229 case CmpInst::ICMP_SGT:
1230 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1231 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1232 AMDGPU::V_CMP_GT_I64_e64);
1233 case CmpInst::ICMP_SGE:
1234 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1235 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1236 AMDGPU::V_CMP_GE_I64_e64);
1237 case CmpInst::ICMP_SLT:
1238 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1239 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1240 AMDGPU::V_CMP_LT_I64_e64);
1241 case CmpInst::ICMP_SLE:
1242 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1243 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1244 AMDGPU::V_CMP_LE_I64_e64);
1245 case CmpInst::ICMP_UGT:
1246 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1247 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1248 AMDGPU::V_CMP_GT_U64_e64);
1249 case CmpInst::ICMP_UGE:
1250 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1251 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1252 AMDGPU::V_CMP_GE_U64_e64);
1253 case CmpInst::ICMP_ULT:
1254 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1255 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1256 AMDGPU::V_CMP_LT_U64_e64);
1257 case CmpInst::ICMP_ULE:
1258 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1259 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1260 AMDGPU::V_CMP_LE_U64_e64);
1261
1262 case CmpInst::FCMP_OEQ:
1263 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1264 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1265 AMDGPU::V_CMP_EQ_F64_e64);
1266 case CmpInst::FCMP_OGT:
1267 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1268 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1269 AMDGPU::V_CMP_GT_F64_e64);
1270 case CmpInst::FCMP_OGE:
1271 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1272 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1273 AMDGPU::V_CMP_GE_F64_e64);
1274 case CmpInst::FCMP_OLT:
1275 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1276 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1277 AMDGPU::V_CMP_LT_F64_e64);
1278 case CmpInst::FCMP_OLE:
1279 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1280 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1281 AMDGPU::V_CMP_LE_F64_e64);
1282 case CmpInst::FCMP_ONE:
1283 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1284 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1285 AMDGPU::V_CMP_NEQ_F64_e64);
1286 case CmpInst::FCMP_ORD:
1287 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1288 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1289 AMDGPU::V_CMP_O_F64_e64);
1290 case CmpInst::FCMP_UNO:
1291 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1292 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1293 AMDGPU::V_CMP_U_F64_e64);
1294 case CmpInst::FCMP_UEQ:
1295 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1296 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1297 AMDGPU::V_CMP_NLG_F64_e64);
1298 case CmpInst::FCMP_UGT:
1299 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1300 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1301 AMDGPU::V_CMP_NLE_F64_e64);
1302 case CmpInst::FCMP_UGE:
1303 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1304 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1305 AMDGPU::V_CMP_NLT_F64_e64);
1306 case CmpInst::FCMP_ULT:
1307 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1308 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1309 AMDGPU::V_CMP_NGE_F64_e64);
1310 case CmpInst::FCMP_ULE:
1311 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1312 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1313 AMDGPU::V_CMP_NGT_F64_e64);
1314 case CmpInst::FCMP_UNE:
1315 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1316 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1317 AMDGPU::V_CMP_NEQ_F64_e64);
1318 case CmpInst::FCMP_TRUE:
1319 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1320 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1321 AMDGPU::V_CMP_TRU_F64_e64);
1323 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1324 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1325 AMDGPU::V_CMP_F_F64_e64);
1326 }
1327}
1328
1329int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1330 unsigned Size) const {
1331 if (Size == 64) {
1332 if (!STI.hasScalarCompareEq64())
1333 return -1;
1334
1335 switch (P) {
1336 case CmpInst::ICMP_NE:
1337 return AMDGPU::S_CMP_LG_U64;
1338 case CmpInst::ICMP_EQ:
1339 return AMDGPU::S_CMP_EQ_U64;
1340 default:
1341 return -1;
1342 }
1343 }
1344
1345 if (Size == 32) {
1346 switch (P) {
1347 case CmpInst::ICMP_NE:
1348 return AMDGPU::S_CMP_LG_U32;
1349 case CmpInst::ICMP_EQ:
1350 return AMDGPU::S_CMP_EQ_U32;
1351 case CmpInst::ICMP_SGT:
1352 return AMDGPU::S_CMP_GT_I32;
1353 case CmpInst::ICMP_SGE:
1354 return AMDGPU::S_CMP_GE_I32;
1355 case CmpInst::ICMP_SLT:
1356 return AMDGPU::S_CMP_LT_I32;
1357 case CmpInst::ICMP_SLE:
1358 return AMDGPU::S_CMP_LE_I32;
1359 case CmpInst::ICMP_UGT:
1360 return AMDGPU::S_CMP_GT_U32;
1361 case CmpInst::ICMP_UGE:
1362 return AMDGPU::S_CMP_GE_U32;
1363 case CmpInst::ICMP_ULT:
1364 return AMDGPU::S_CMP_LT_U32;
1365 case CmpInst::ICMP_ULE:
1366 return AMDGPU::S_CMP_LE_U32;
1367 case CmpInst::FCMP_OEQ:
1368 return AMDGPU::S_CMP_EQ_F32;
1369 case CmpInst::FCMP_OGT:
1370 return AMDGPU::S_CMP_GT_F32;
1371 case CmpInst::FCMP_OGE:
1372 return AMDGPU::S_CMP_GE_F32;
1373 case CmpInst::FCMP_OLT:
1374 return AMDGPU::S_CMP_LT_F32;
1375 case CmpInst::FCMP_OLE:
1376 return AMDGPU::S_CMP_LE_F32;
1377 case CmpInst::FCMP_ONE:
1378 return AMDGPU::S_CMP_LG_F32;
1379 case CmpInst::FCMP_ORD:
1380 return AMDGPU::S_CMP_O_F32;
1381 case CmpInst::FCMP_UNO:
1382 return AMDGPU::S_CMP_U_F32;
1383 case CmpInst::FCMP_UEQ:
1384 return AMDGPU::S_CMP_NLG_F32;
1385 case CmpInst::FCMP_UGT:
1386 return AMDGPU::S_CMP_NLE_F32;
1387 case CmpInst::FCMP_UGE:
1388 return AMDGPU::S_CMP_NLT_F32;
1389 case CmpInst::FCMP_ULT:
1390 return AMDGPU::S_CMP_NGE_F32;
1391 case CmpInst::FCMP_ULE:
1392 return AMDGPU::S_CMP_NGT_F32;
1393 case CmpInst::FCMP_UNE:
1394 return AMDGPU::S_CMP_NEQ_F32;
1395 default:
1396 llvm_unreachable("Unknown condition code!");
1397 }
1398 }
1399
1400 if (Size == 16) {
1401 if (!STI.hasSALUFloatInsts())
1402 return -1;
1403
1404 switch (P) {
1405 case CmpInst::FCMP_OEQ:
1406 return AMDGPU::S_CMP_EQ_F16;
1407 case CmpInst::FCMP_OGT:
1408 return AMDGPU::S_CMP_GT_F16;
1409 case CmpInst::FCMP_OGE:
1410 return AMDGPU::S_CMP_GE_F16;
1411 case CmpInst::FCMP_OLT:
1412 return AMDGPU::S_CMP_LT_F16;
1413 case CmpInst::FCMP_OLE:
1414 return AMDGPU::S_CMP_LE_F16;
1415 case CmpInst::FCMP_ONE:
1416 return AMDGPU::S_CMP_LG_F16;
1417 case CmpInst::FCMP_ORD:
1418 return AMDGPU::S_CMP_O_F16;
1419 case CmpInst::FCMP_UNO:
1420 return AMDGPU::S_CMP_U_F16;
1421 case CmpInst::FCMP_UEQ:
1422 return AMDGPU::S_CMP_NLG_F16;
1423 case CmpInst::FCMP_UGT:
1424 return AMDGPU::S_CMP_NLE_F16;
1425 case CmpInst::FCMP_UGE:
1426 return AMDGPU::S_CMP_NLT_F16;
1427 case CmpInst::FCMP_ULT:
1428 return AMDGPU::S_CMP_NGE_F16;
1429 case CmpInst::FCMP_ULE:
1430 return AMDGPU::S_CMP_NGT_F16;
1431 case CmpInst::FCMP_UNE:
1432 return AMDGPU::S_CMP_NEQ_F16;
1433 default:
1434 llvm_unreachable("Unknown condition code!");
1435 }
1436 }
1437
1438 return -1;
1439}
1440
1441bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1442
1443 MachineBasicBlock *BB = I.getParent();
1444 const DebugLoc &DL = I.getDebugLoc();
1445
1446 Register SrcReg = I.getOperand(2).getReg();
1447 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1448
1449 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1450
1451 Register CCReg = I.getOperand(0).getReg();
1452 if (!isVCC(CCReg, *MRI)) {
1453 int Opcode = getS_CMPOpcode(Pred, Size);
1454 if (Opcode == -1)
1455 return false;
1456 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1457 .add(I.getOperand(2))
1458 .add(I.getOperand(3));
1459 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1460 .addReg(AMDGPU::SCC);
1461 bool Ret =
1462 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1463 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1464 I.eraseFromParent();
1465 return Ret;
1466 }
1467
1468 if (I.getOpcode() == AMDGPU::G_FCMP)
1469 return false;
1470
1471 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1472 if (Opcode == -1)
1473 return false;
1474
1475 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1476 I.getOperand(0).getReg())
1477 .add(I.getOperand(2))
1478 .add(I.getOperand(3));
1480 *TRI.getBoolRC(), *MRI);
1481 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1482 I.eraseFromParent();
1483 return Ret;
1484}
1485
1486bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1487 Register Dst = I.getOperand(0).getReg();
1488 if (isVCC(Dst, *MRI))
1489 return false;
1490
1491 LLT DstTy = MRI->getType(Dst);
1492 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1493 return false;
1494
1495 MachineBasicBlock *BB = I.getParent();
1496 const DebugLoc &DL = I.getDebugLoc();
1497 Register SrcReg = I.getOperand(2).getReg();
1498 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1499
1500 // i1 inputs are not supported in GlobalISel.
1501 if (Size == 1)
1502 return false;
1503
1504 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1505 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1506 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1507 I.eraseFromParent();
1508 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1509 }
1510
1511 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1512 if (Opcode == -1)
1513 return false;
1514
1515 MachineInstrBuilder SelectedMI;
1516 MachineOperand &LHS = I.getOperand(2);
1517 MachineOperand &RHS = I.getOperand(3);
1518 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1519 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1520 Register Src0Reg =
1521 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1522 Register Src1Reg =
1523 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1524 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1525 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1526 SelectedMI.addImm(Src0Mods);
1527 SelectedMI.addReg(Src0Reg);
1528 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1529 SelectedMI.addImm(Src1Mods);
1530 SelectedMI.addReg(Src1Reg);
1531 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1532 SelectedMI.addImm(0); // clamp
1533 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1534 SelectedMI.addImm(0); // op_sel
1535
1536 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1537 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1538 return false;
1539
1540 I.eraseFromParent();
1541 return true;
1542}
1543
1544// Ballot has to zero bits in input lane-mask that are zero in current exec,
1545// Done as AND with exec. For inputs that are results of instruction that
1546// implicitly use same exec, for example compares in same basic block or SCC to
1547// VCC copy, use copy.
1550 MachineInstr *MI = MRI.getVRegDef(Reg);
1551 if (MI->getParent() != MBB)
1552 return false;
1553
1554 // Lane mask generated by SCC to VCC copy.
1555 if (MI->getOpcode() == AMDGPU::COPY) {
1556 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1557 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1558 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1559 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1560 return true;
1561 }
1562
1563 // Lane mask generated using compare with same exec.
1564 if (isa<GAnyCmp>(MI))
1565 return true;
1566
1567 Register LHS, RHS;
1568 // Look through AND.
1569 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1570 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1572
1573 return false;
1574}
1575
1576bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1577 MachineBasicBlock *BB = I.getParent();
1578 const DebugLoc &DL = I.getDebugLoc();
1579 Register DstReg = I.getOperand(0).getReg();
1580 Register SrcReg = I.getOperand(2).getReg();
1581 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1582 const unsigned WaveSize = STI.getWavefrontSize();
1583
1584 // In the common case, the return type matches the wave size.
1585 // However we also support emitting i64 ballots in wave32 mode.
1586 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1587 return false;
1588
1589 std::optional<ValueAndVReg> Arg =
1591
1592 Register Dst = DstReg;
1593 // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1594 if (BallotSize != WaveSize) {
1595 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1596 }
1597
1598 if (Arg) {
1599 const int64_t Value = Arg->Value.getZExtValue();
1600 if (Value == 0) {
1601 // Dst = S_MOV 0
1602 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1603 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1604 } else {
1605 // Dst = COPY EXEC
1606 assert(Value == 1);
1607 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1608 }
1609 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1610 return false;
1611 } else {
1612 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1613 // Dst = COPY SrcReg
1614 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1615 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1616 return false;
1617 } else {
1618 // Dst = S_AND SrcReg, EXEC
1619 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1620 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1621 .addReg(SrcReg)
1622 .addReg(TRI.getExec())
1623 .setOperandDead(3); // Dead scc
1624 if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
1625 return false;
1626 }
1627 }
1628
1629 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1630 if (BallotSize != WaveSize) {
1631 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1632 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1633 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1634 .addReg(Dst)
1635 .addImm(AMDGPU::sub0)
1636 .addReg(HiReg)
1637 .addImm(AMDGPU::sub1);
1638 }
1639
1640 I.eraseFromParent();
1641 return true;
1642}
1643
1644bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1645 Register DstReg = I.getOperand(0).getReg();
1646 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1647 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1648 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1649 return false;
1650
1651 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1652
1654 const MDNode *Metadata = I.getOperand(2).getMetadata();
1655 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1656 auto *RelocSymbol = cast<GlobalVariable>(
1657 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1658
1659 MachineBasicBlock *BB = I.getParent();
1660 BuildMI(*BB, &I, I.getDebugLoc(),
1661 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1663
1664 I.eraseFromParent();
1665 return true;
1666}
1667
1668bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1670
1671 Register DstReg = I.getOperand(0).getReg();
1672 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1673 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1674 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1675
1676 MachineBasicBlock *MBB = I.getParent();
1677 const DebugLoc &DL = I.getDebugLoc();
1678
1679 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1680
1681 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1683 MIB.addImm(MFI->getLDSSize());
1684 } else {
1686 const GlobalValue *GV =
1687 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1689 }
1690
1691 I.eraseFromParent();
1692 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1693}
1694
1695bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1696 MachineBasicBlock *MBB = I.getParent();
1698 const DebugLoc &DL = I.getDebugLoc();
1699
1700 MachineOperand &Dst = I.getOperand(0);
1701 Register DstReg = Dst.getReg();
1702 unsigned Depth = I.getOperand(2).getImm();
1703
1704 const TargetRegisterClass *RC
1705 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1706 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1707 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1708 return false;
1709
1710 // Check for kernel and shader functions
1711 if (Depth != 0 ||
1713 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1714 .addImm(0);
1715 I.eraseFromParent();
1716 return true;
1717 }
1718
1720 // There is a call to @llvm.returnaddress in this function
1721 MFI.setReturnAddressIsTaken(true);
1722
1723 // Get the return address reg and mark it as an implicit live-in
1724 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1725 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1726 AMDGPU::SReg_64RegClass, DL);
1727 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1728 .addReg(LiveIn);
1729 I.eraseFromParent();
1730 return true;
1731}
1732
1733bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1734 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1735 // SelectionDAG uses for wave32 vs wave64.
1736 MachineBasicBlock *BB = MI.getParent();
1737 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1738 .add(MI.getOperand(1));
1739
1740 Register Reg = MI.getOperand(1).getReg();
1741 MI.eraseFromParent();
1742
1743 if (!MRI->getRegClassOrNull(Reg))
1744 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1745 return true;
1746}
1747
1748bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1749 MachineInstr &MI, Intrinsic::ID IntrID) const {
1750 MachineBasicBlock *MBB = MI.getParent();
1752 const DebugLoc &DL = MI.getDebugLoc();
1753
1754 unsigned IndexOperand = MI.getOperand(7).getImm();
1755 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1756 bool WaveDone = MI.getOperand(9).getImm() != 0;
1757
1758 if (WaveDone && !WaveRelease)
1759 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1760
1761 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1762 IndexOperand &= ~0x3f;
1763 unsigned CountDw = 0;
1764
1766 CountDw = (IndexOperand >> 24) & 0xf;
1767 IndexOperand &= ~(0xf << 24);
1768
1769 if (CountDw < 1 || CountDw > 4) {
1771 "ds_ordered_count: dword count must be between 1 and 4");
1772 }
1773 }
1774
1775 if (IndexOperand)
1776 report_fatal_error("ds_ordered_count: bad index operand");
1777
1778 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1779 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1780
1781 unsigned Offset0 = OrderedCountIndex << 2;
1782 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1783
1785 Offset1 |= (CountDw - 1) << 6;
1786
1788 Offset1 |= ShaderType << 2;
1789
1790 unsigned Offset = Offset0 | (Offset1 << 8);
1791
1792 Register M0Val = MI.getOperand(2).getReg();
1793 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1794 .addReg(M0Val);
1795
1796 Register DstReg = MI.getOperand(0).getReg();
1797 Register ValReg = MI.getOperand(3).getReg();
1799 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1800 .addReg(ValReg)
1801 .addImm(Offset)
1802 .cloneMemRefs(MI);
1803
1804 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1805 return false;
1806
1807 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1808 MI.eraseFromParent();
1809 return Ret;
1810}
1811
1812static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1813 switch (IntrID) {
1814 case Intrinsic::amdgcn_ds_gws_init:
1815 return AMDGPU::DS_GWS_INIT;
1816 case Intrinsic::amdgcn_ds_gws_barrier:
1817 return AMDGPU::DS_GWS_BARRIER;
1818 case Intrinsic::amdgcn_ds_gws_sema_v:
1819 return AMDGPU::DS_GWS_SEMA_V;
1820 case Intrinsic::amdgcn_ds_gws_sema_br:
1821 return AMDGPU::DS_GWS_SEMA_BR;
1822 case Intrinsic::amdgcn_ds_gws_sema_p:
1823 return AMDGPU::DS_GWS_SEMA_P;
1824 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1825 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1826 default:
1827 llvm_unreachable("not a gws intrinsic");
1828 }
1829}
1830
// Select one of the amdgcn_ds_gws_* intrinsics into the corresponding DS_GWS
// machine instruction. The (uniform) offset operand is split into a base that
// is routed through M0 and an immediate offset field on the instruction.
bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
                                                     Intrinsic::ID IID) const {
  // Bail if the subtarget has no GWS at all, or if this is the release_all
  // variant on a subtarget that lacks that specific instruction.
  if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
                        !STI.hasGWSSemaReleaseAll()))
    return false;

  // intrinsic ID, vsrc, offset
  const bool HasVSrc = MI.getNumOperands() == 3;
  assert(HasVSrc || MI.getNumOperands() == 2);

  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
  // The offset is consumed through M0 below, so it must be uniform (SGPR).
  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  unsigned ImmOffset;

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *Readfirstlane = nullptr;

  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
  // incoming offset, in case there's an add of a constant. We'll have to put it
  // back later.
  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
    Readfirstlane = OffsetDef;
    BaseOffset = OffsetDef->getOperand(1).getReg();
    OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  }

  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.

    ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(0);
  } else {
    // Split the offset into a register base plus a constant that can go into
    // the instruction's immediate offset field.
    std::tie(BaseOffset, ImmOffset) =
        AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB);

    if (Readfirstlane) {
      // We have the constant offset now, so put the readfirstlane back on the
      // variable component.
      if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
        return false;

      Readfirstlane->getOperand(1).setReg(BaseOffset);
      BaseOffset = Readfirstlane->getOperand(0).getReg();
    } else {
      if (!RBI.constrainGenericRegister(BaseOffset,
                                        AMDGPU::SReg_32RegClass, *MRI))
        return false;
    }

    // The base contributes through M0[21:16] (see the comment below), so
    // shift it into position before copying into M0.
    Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
      .addReg(BaseOffset)
      .addImm(16)
      .setOperandDead(3); // Dead scc

    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0Base);
  }

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));

  if (HasVSrc) {
    Register VSrc = MI.getOperand(1).getReg();
    MIB.addReg(VSrc);

    if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
      return false;
  }

  MIB.addImm(ImmOffset)
     .cloneMemRefs(MI);

  TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);

  MI.eraseFromParent();
  return true;
}
1921
1922bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1923 bool IsAppend) const {
1924 Register PtrBase = MI.getOperand(2).getReg();
1925 LLT PtrTy = MRI->getType(PtrBase);
1926 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1927
1928 unsigned Offset;
1929 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1930
1931 // TODO: Should this try to look through readfirstlane like GWS?
1932 if (!isDSOffsetLegal(PtrBase, Offset)) {
1933 PtrBase = MI.getOperand(2).getReg();
1934 Offset = 0;
1935 }
1936
1937 MachineBasicBlock *MBB = MI.getParent();
1938 const DebugLoc &DL = MI.getDebugLoc();
1939 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1940
1941 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1942 .addReg(PtrBase);
1943 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1944 return false;
1945
1946 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1947 .addImm(Offset)
1948 .addImm(IsGDS ? -1 : 0)
1949 .cloneMemRefs(MI);
1950 MI.eraseFromParent();
1951 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1952}
1953
bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
  MachineFunction *MF = MI.getParent()->getParent();

  // Record the whole-wave-init usage on the machine function info, then let
  // the generated selector emit the instruction itself.
  // NOTE(review): MFInfo presumably comes from MF->getInfo<...>() — its
  // declaration is not visible in this view; confirm against upstream.
  MFInfo->setInitWholeWave();
  return selectImpl(MI, *CoverageInfo);
}
1961
// Select the s_barrier family of intrinsics, applying two lowerings:
//  * If the whole workgroup fits in one wave, barriers degrade to
//    WAVE_BARRIER (or disappear entirely for s_barrier_signal).
//  * With split barriers, s_barrier becomes signal_imm + wait.
bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
  unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
  if (WGSize <= STI.getWavefrontSize()) {
    // If the workgroup fits in a wave, remove s_barrier_signal and lower
    // s_barrier/s_barrier_wait to wave_barrier.
    if (IntrinsicID == Intrinsic::amdgcn_s_barrier ||
        IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) {
      MachineBasicBlock *MBB = MI.getParent();
      const DebugLoc &DL = MI.getDebugLoc();
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
    }
    MI.eraseFromParent();
    return true;
  }
  }

  if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
    // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
    MachineBasicBlock *MBB = MI.getParent();
    const DebugLoc &DL = MI.getDebugLoc();
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))

    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))

    MI.eraseFromParent();
    return true;
  }

  // Everything else goes to the generated patterns.
  return selectImpl(MI, *CoverageInfo);
}
1994
/// Decode the texfailctrl immediate of an image intrinsic.
///
/// Bit 0 sets \p TFE and bit 1 sets \p LWE. \p IsTexFail is set to true when
/// any bit of \p TexFailCtrl is set; it is never cleared, so the caller must
/// initialize it.
///
/// \returns true if no unknown bits remain after stripping bits 0 and 1.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                         bool &IsTexFail) {
  if (TexFailCtrl)
    IsTexFail = true;

  // Use explicit comparisons rather than `? true : false` ternaries.
  TFE = (TexFailCtrl & 0x1) != 0;
  TexFailCtrl &= ~(uint64_t)0x1;
  LWE = (TexFailCtrl & 0x2) != 0;
  TexFailCtrl &= ~(uint64_t)0x2;

  // Any remaining bits are invalid.
  return TexFailCtrl == 0;
}
2007
// Select a G_AMDGPU_INTRIN_IMAGE_* pseudo into a concrete MIMG instruction:
// compute the vdata in/out dword counts, pick an encoding (gfx6/8/90a/10/11/12,
// NSA or packed addressing), then append the dmask/dim/unorm/cpol/tfe/lwe/d16
// immediate operands the chosen opcode expects.
bool AMDGPUInstructionSelector::selectImageIntrinsic(
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =

  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
  unsigned IntrOpcode = Intr->BaseOpcode;
  const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
  const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);

  // Intrinsic arguments begin after the explicit defs and the intrinsic ID.
  const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;

  Register VDataIn, VDataOut;
  LLT VDataTy;
  int NumVDataDwords = -1;
  bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
               MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;

  // Non-sampler ops are always unorm; sampler ops carry an explicit flag.
  bool Unorm;
  if (!BaseOpcode->Sampler)
    Unorm = true;
  else
    Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;

  bool TFE;
  bool LWE;
  bool IsTexFail = false;
  if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
                    TFE, LWE, IsTexFail))
    return false;

  const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
  const bool IsA16 = (Flags & 1) != 0;
  const bool IsG16 = (Flags & 2) != 0;

  // A16 implies 16 bit gradients if subtarget doesn't support G16
  if (IsA16 && !STI.hasG16() && !IsG16)
    return false;

  unsigned DMask = 0;
  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    VDataOut = MI.getOperand(0).getReg();
    VDataIn = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VDataIn);

    // Be careful to allow atomic swap on 16-bit element vectors.
    const bool Is64Bit = BaseOpcode->AtomicX2 ?
      Ty.getSizeInBits() == 128 :
      Ty.getSizeInBits() == 64;

    if (BaseOpcode->AtomicX2) {
      assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);

      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    }
  } else {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);

    if (BaseOpcode->Store) {
      VDataIn = MI.getOperand(1).getReg();
      VDataTy = MRI->getType(VDataIn);
      NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
    } else if (BaseOpcode->NoReturn) {
      NumVDataDwords = 0;
    } else {
      VDataOut = MI.getOperand(0).getReg();
      VDataTy = MRI->getType(VDataOut);
      NumVDataDwords = DMaskLanes;

      // Packed d16 loads use half a dword per enabled lane.
      if (IsD16 && !STI.hasUnpackedD16VMem())
        NumVDataDwords = (DMaskLanes + 1) / 2;
    }
  }

  // Set G16 opcode
  if (Subtarget->hasG16() && IsG16) {
    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
    assert(G16MappingInfo);
    IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
  }

  // TODO: Check this in verifier.
  assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");

  unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
  if (BaseOpcode->Atomic)
    CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
  // Reject cache-policy bits the target generation does not define.
  if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
    return false;

  // Count address registers and their total dword footprint to decide
  // between NSA and packed addressing and to look up the opcode.
  int NumVAddrRegs = 0;
  int NumVAddrDwords = 0;
  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
    // Skip the $noregs and 0s inserted during legalization.
    MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
    if (!AddrOp.isReg())
      continue; // XXX - Break?

    Register Addr = AddrOp.getReg();
    if (!Addr)
      break;

    ++NumVAddrRegs;
    NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
  }

  // The legalizer preprocessed the intrinsic arguments. If we aren't using
  // NSA, these should have been packed into a single value in the first
  // address register
  const bool UseNSA =
      NumVAddrRegs != 1 &&
      (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
                                   : NumVAddrDwords == NumVAddrRegs);
  if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
    LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
    return false;
  }

  // Tex-fail reporting needs one extra result dword.
  if (IsTexFail)
    ++NumVDataDwords;

  // Resolve the concrete opcode for the newest encoding this subtarget has.
  int Opcode = -1;
  if (IsGFX12Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX11Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx11NSA
                                          : AMDGPU::MIMGEncGfx11Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX10Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    if (Subtarget->hasGFX90AInsts()) {
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
                                     NumVDataDwords, NumVAddrDwords);
      if (Opcode == -1) {
        LLVM_DEBUG(
            dbgs()
            << "requested image instruction is not supported on this GPU\n");
        return false;
      }
    }
    if (Opcode == -1 &&
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                     NumVDataDwords, NumVAddrDwords);
    if (Opcode == -1)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                     NumVDataDwords, NumVAddrDwords);
  }
  if (Opcode == -1)
    return false;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
    .cloneMemRefs(MI);

  if (VDataOut) {
    if (BaseOpcode->AtomicX2) {
      const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;

      // The instruction defines a wide temp; extract the low half (the
      // pre-swap value) into the intrinsic's result, if it is used.
      Register TmpReg = MRI->createVirtualRegister(
        Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
      unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;

      MIB.addDef(TmpReg);
      if (!MRI->use_empty(VDataOut)) {
        BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
          .addReg(TmpReg, RegState::Kill, SubReg);
      }

    } else {
      MIB.addDef(VDataOut); // vdata output
    }
  }

  if (VDataIn)
    MIB.addReg(VDataIn); // vdata input

  for (int I = 0; I != NumVAddrRegs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
    if (SrcOp.isReg()) {
      assert(SrcOp.getReg() != 0);
      MIB.addReg(SrcOp.getReg());
    }
  }

  MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
  if (BaseOpcode->Sampler)
    MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());

  MIB.addImm(DMask); // dmask

  if (IsGFX10Plus)
    MIB.addImm(DimInfo->Encoding);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
    MIB.addImm(Unorm);

  MIB.addImm(CPol);
  MIB.addImm(IsA16 &&  // a16 or r128
             STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
  if (IsGFX10Plus)
    MIB.addImm(IsA16 ? -1 : 0);

  if (!Subtarget->hasGFX90AInsts()) {
    MIB.addImm(TFE); // tfe
  } else if (TFE) {
    LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
    return false;
  }

  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
    MIB.addImm(LWE); // lwe
  if (!IsGFX10Plus)
    MIB.addImm(DimInfo->DA ? -1 : 0);
  if (BaseOpcode->HasD16)
    MIB.addImm(IsD16 ? -1 : 0);

  MI.eraseFromParent();
  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
  return true;
}
2247
2248// We need to handle this here because tablegen doesn't support matching
2249// instructions with multiple outputs.
2250bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2251 MachineInstr &MI) const {
2252 Register Dst0 = MI.getOperand(0).getReg();
2253 Register Dst1 = MI.getOperand(1).getReg();
2254
2255 const DebugLoc &DL = MI.getDebugLoc();
2256 MachineBasicBlock *MBB = MI.getParent();
2257
2258 Register Addr = MI.getOperand(3).getReg();
2259 Register Data0 = MI.getOperand(4).getReg();
2260 Register Data1 = MI.getOperand(5).getReg();
2261 unsigned Offset = MI.getOperand(6).getImm();
2262
2263 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
2264 .addDef(Dst1)
2265 .addUse(Addr)
2266 .addUse(Data0)
2267 .addUse(Data1)
2268 .addImm(Offset)
2269 .cloneMemRefs(MI);
2270
2271 MI.eraseFromParent();
2272 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2273}
2274
// Dispatch side-effecting intrinsics that need hand-written selection;
// anything not listed falls through to the generated selector.
bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
    MachineInstr &I) const {
  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_end_cf:
    return selectEndCfIntrinsic(I);
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
    return selectDSOrderedIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return selectDSGWSIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_append:
    return selectDSAppendConsume(I, true);
  case Intrinsic::amdgcn_ds_consume:
    return selectDSAppendConsume(I, false);
  case Intrinsic::amdgcn_init_whole_wave:
    return selectInitWholeWave(I);
  case Intrinsic::amdgcn_s_barrier:
  case Intrinsic::amdgcn_s_barrier_signal:
  case Intrinsic::amdgcn_s_barrier_wait:
    return selectSBarrier(I);
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
    return selectBufferLoadLds(I);
  case Intrinsic::amdgcn_global_load_lds:
    return selectGlobalLoadLds(I);
  case Intrinsic::amdgcn_exp_compr:
    // Diagnose (rather than silently fail) on subtargets without the
    // compressed export instruction; otherwise let tablegen select it.
    if (!STI.hasCompressedExport()) {
      Function &F = I.getMF()->getFunction();
          F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
      F.getContext().diagnose(NoFpRet);
      return false;
    }
    break;
  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
    return selectDSBvhStackIntrinsic(I);
  case Intrinsic::amdgcn_s_barrier_init:
  case Intrinsic::amdgcn_s_barrier_signal_var:
    return selectNamedBarrierInit(I, IntrinsicID);
  case Intrinsic::amdgcn_s_barrier_join:
  case Intrinsic::amdgcn_s_get_named_barrier_state:
    return selectNamedBarrierInst(I, IntrinsicID);
  case Intrinsic::amdgcn_s_get_barrier_state:
    return selectSGetBarrierState(I, IntrinsicID);
  case Intrinsic::amdgcn_s_barrier_signal_isfirst:
    return selectSBarrierSignalIsfirst(I, IntrinsicID);
  }
  return selectImpl(I, *CoverageInfo);
}
2332
// Select G_SELECT. Scalar (non-vcc) conditions become a COPY into SCC feeding
// S_CSELECT; vcc conditions become V_CNDMASK (32-bit only — wider VGPR
// selects were split in RegBankSelect).
bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
  // Give the generated patterns the first try.
  if (selectImpl(I, *CoverageInfo))
    return true;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
  assert(Size <= 32 || Size == 64);
  const MachineOperand &CCOp = I.getOperand(1);
  Register CCReg = CCOp.getReg();
  if (!isVCC(CCReg, *MRI)) {
    // Uniform condition: route it through SCC and use a scalar cselect.
    unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
                                         AMDGPU::S_CSELECT_B32;
    MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
            .addReg(CCReg);

    // The generic constrainSelectedInstRegOperands doesn't work for the scc register
    // bank, because it does not cover the register class that we used to represent
    // for it. So we need to manually set the register class here.
    if (!MRI->getRegClassOrNull(CCReg))
      MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
    MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
            .add(I.getOperand(2))
            .add(I.getOperand(3));

    bool Ret = false;
    Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
    Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
    I.eraseFromParent();
    return Ret;
  }

  // Wide VGPR select should have been split in RegBankSelect.
  if (Size > 32)
    return false;

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
            .addImm(0)
            .add(I.getOperand(3))
            .addImm(0)
            .add(I.getOperand(2))
            .add(I.getOperand(1));

  bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}
2383
// Select G_TRUNC. Most truncs become subregister COPYs; the v2s32 -> v2s16
// case must actually pack the two low halves into one register.
bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const LLT DstTy = MRI->getType(DstReg);
  const LLT SrcTy = MRI->getType(SrcReg);
  const LLT S1 = LLT::scalar(1);

  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *DstRB;
  if (DstTy == S1) {
    // This is a special case. We don't treat s1 for legalization artifacts as
    // vcc booleans.
    DstRB = SrcRB;
  } else {
    DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
    if (SrcRB != DstRB)
      return false;
  }

  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;

  unsigned DstSize = DstTy.getSizeInBits();
  unsigned SrcSize = SrcTy.getSizeInBits();

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
  if (!SrcRC || !DstRC)
    return false;

  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
    LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
    return false;
  }

  // True 16-bit VGPR destination: copy just the lo16 subregister.
  if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
    const DebugLoc &DL = I.getDebugLoc();
    MachineBasicBlock *MBB = I.getParent();
    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
        .addReg(SrcReg, 0, AMDGPU::lo16);
    I.eraseFromParent();
    return true;
  }

  // v2s32 -> v2s16: pack the low 16 bits of each source element.
  if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
    MachineBasicBlock *MBB = I.getParent();
    const DebugLoc &DL = I.getDebugLoc();

    Register LoReg = MRI->createVirtualRegister(DstRC);
    Register HiReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
        .addReg(SrcReg, 0, AMDGPU::sub0);
    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
        .addReg(SrcReg, 0, AMDGPU::sub1);

    if (IsVALU && STI.hasSDWA()) {
      // Write the low 16-bits of the high element into the high 16-bits of the
      // low element.
      MachineInstr *MovSDWA =
          BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
              .addImm(0)                             // $src0_modifiers
              .addReg(HiReg)                         // $src0
              .addImm(0)                             // $clamp
              .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
              .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
              .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
              .addReg(LoReg, RegState::Implicit);
      MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
    } else {
      // No SDWA: build (Hi << 16) | (Lo & 0xffff) with shift/and/or.
      Register TmpReg0 = MRI->createVirtualRegister(DstRC);
      Register TmpReg1 = MRI->createVirtualRegister(DstRC);
      Register ImmReg = MRI->createVirtualRegister(DstRC);
      if (IsVALU) {
        BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
            .addImm(16)
            .addReg(HiReg);
      } else {
        BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
            .addReg(HiReg)
            .addImm(16)
            .setOperandDead(3); // Dead scc
      }

      unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
      unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
      unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;

      BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
          .addImm(0xffff);
      auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
          .addReg(LoReg)
          .addReg(ImmReg);
      auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
          .addReg(TmpReg0)
          .addReg(TmpReg1);

      if (!IsVALU) {
        And.setOperandDead(3); // Dead scc
        Or.setOperandDead(3); // Dead scc
      }
    }

    I.eraseFromParent();
    return true;
  }

  if (!DstTy.isScalar())
    return false;

  // Scalar trunc from a wide source: read the appropriate low subregister.
  if (SrcSize > 32) {
    unsigned SubRegIdx =
        DstSize < 32 ? AMDGPU::sub0 : TRI.getSubRegFromChannel(0, DstSize / 32);
    if (SubRegIdx == AMDGPU::NoSubRegister)
      return false;

    // Deal with weird cases where the class only partially supports the subreg
    // index.
    const TargetRegisterClass *SrcWithSubRC
      = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
    if (!SrcWithSubRC)
      return false;

    if (SrcWithSubRC != SrcRC) {
      if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
        return false;
    }

    I.getOperand(1).setSubReg(SubRegIdx);
  }

  // Everything else is just a COPY in place.
  I.setDesc(TII.get(TargetOpcode::COPY));
  return true;
}
2520
2521/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2522static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2523 Mask = maskTrailingOnes<unsigned>(Size);
2524 int SignedMask = static_cast<int>(Mask);
2525 return SignedMask >= -16 && SignedMask <= 64;
2526}
2527
2528// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2529const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2530 Register Reg, const MachineRegisterInfo &MRI,
2531 const TargetRegisterInfo &TRI) const {
2532 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2533 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2534 return RB;
2535
2536 // Ignore the type, since we don't use vcc in artifacts.
2537 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2538 return &RBI.getRegBankFromRegClass(*RC, LLT());
2539 return nullptr;
2540}
2541
// Select G_SEXT / G_ZEXT / G_ANYEXT / G_SEXT_INREG to scalar destinations,
// choosing between AND-with-inline-mask, BFE, dedicated sext instructions,
// and REG_SEQUENCE-based widening depending on bank, sizes, and signedness.
bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
  bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
  bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock &MBB = *I.getParent();
  const Register DstReg = I.getOperand(0).getReg();
  const Register SrcReg = I.getOperand(1).getReg();

  const LLT DstTy = MRI->getType(DstReg);
  const LLT SrcTy = MRI->getType(SrcReg);
  // For sext_inreg the effective source width is the immediate, not the type.
  const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
    I.getOperand(2).getImm() : SrcTy.getSizeInBits();
  const unsigned DstSize = DstTy.getSizeInBits();
  if (!DstTy.isScalar())
    return false;

  // Artifact casts should never use vcc.
  const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);

  // FIXME: This should probably be illegal and split earlier.
  if (I.getOpcode() == AMDGPU::G_ANYEXT) {
    if (DstSize <= 32)
      return selectCOPY(I);

    // Widening anyext: fill the high half with undef.
    const TargetRegisterClass *SrcRC =
        TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
    const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
    const TargetRegisterClass *DstRC =
        TRI.getRegClassForSizeOnBank(DstSize, *DstBank);

    Register UndefReg = MRI->createVirtualRegister(SrcRC);
    BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
    BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
      .addReg(SrcReg)
      .addImm(AMDGPU::sub0)
      .addReg(UndefReg)
      .addImm(AMDGPU::sub1);
    I.eraseFromParent();

    return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
           RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
  }

  if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
    // 64-bit should have been split up in RegBankSelect

    // Try to use an and with a mask if it will save code size.
    unsigned Mask;
    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
      MachineInstr *ExtI =
      BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
        .addImm(Mask)
        .addReg(SrcReg);
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
    }

    // Otherwise use a vector bitfield extract.
    const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
    MachineInstr *ExtI =
      BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
      .addReg(SrcReg)
      .addImm(0) // Offset
      .addImm(SrcSize); // Width
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
  }

  if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
    const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
      AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
    if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
      return false;

    // Scalar sign-extends of 8/16 bits have dedicated instructions.
    if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
      const unsigned SextOpc = SrcSize == 8 ?
        AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
      BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
        .addReg(SrcReg);
      I.eraseFromParent();
      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
    }

    // Using a single 32-bit SALU to calculate the high half is smaller than
    // S_BFE with a literal constant operand.
    if (DstSize > 32 && SrcSize == 32) {
      Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
      unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
      if (Signed) {
        // High half = sign bit replicated.
        BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
          .addReg(SrcReg, 0, SubReg)
          .addImm(31)
          .setOperandDead(3); // Dead scc
      } else {
        // High half = zero.
        BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
          .addImm(0);
      }
      BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
        .addReg(SrcReg, 0, SubReg)
        .addImm(AMDGPU::sub0)
        .addReg(HiReg)
        .addImm(AMDGPU::sub1);
      I.eraseFromParent();
      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
                                          *MRI);
    }

    const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
    const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;

    // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
    if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
      // We need a 64-bit register source, but the high bits don't matter.
      Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
      Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
      unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;

      BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
      BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
        .addReg(SrcReg, 0, SubReg)
        .addImm(AMDGPU::sub0)
        .addReg(UndefReg)
        .addImm(AMDGPU::sub1);

      BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
        .addReg(ExtReg)
        .addImm(SrcSize << 16);

      I.eraseFromParent();
      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
    }

    unsigned Mask;
    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
      BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
        .addReg(SrcReg)
        .addImm(Mask)
        .setOperandDead(3); // Dead scc
    } else {
      BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
        .addReg(SrcReg)
        .addImm(SrcSize << 16);
    }

    I.eraseFromParent();
    return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
  }

  return false;
}
2691
  // Peer through any chain of COPYs to the register that defines the value.
  return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
}
2695
  // If Reg is produced by a G_BITCAST, return the bitcast's source instead.
  Register BitcastSrc;
  if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
    Reg = BitcastSrc;
  return Reg;
}
2702
                           Register &Out) {
  // All patterns below look through a G_TRUNC feeding this use.
  Register Trunc;
  if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
    return false;

  // Pattern 1: trunc (lshr x, 16) extracts x's high 16 bits.
  Register LShlSrc;
  Register Cst;
  if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
    Cst = stripCopy(Cst, MRI);
    if (mi_match(Cst, MRI, m_SpecificICst(16))) {
      Out = stripBitCast(LShlSrc, MRI);
      return true;
    }
  }

  // Pattern 2: a v2s16 shuffle whose lane 0 selects element 1 also yields
  // the high half.
  MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
  if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
    return false;

  assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
         LLT::fixed_vector(2, 16));

  ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
  assert(Mask.size() == 2);

  if (Mask[0] == 1 && Mask[1] <= 1) {
    Out = Shuffle->getOperand(0).getReg();
    return true;
  }

  return false;
}
2736
2737bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2738 if (!Subtarget->hasSALUFloatInsts())
2739 return false;
2740
2741 Register Dst = I.getOperand(0).getReg();
2742 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2743 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2744 return false;
2745
2746 Register Src = I.getOperand(1).getReg();
2747
2748 if (MRI->getType(Dst) == LLT::scalar(32) &&
2749 MRI->getType(Src) == LLT::scalar(16)) {
2750 if (isExtractHiElt(*MRI, Src, Src)) {
2751 MachineBasicBlock *BB = I.getParent();
2752 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2753 .addUse(Src);
2754 I.eraseFromParent();
2755 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2756 }
2757 }
2758
2759 return false;
2760}
2761
bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
  // Only manually handle the f64 SGPR case.
  //
  // FIXME: This is a workaround for 2.5 different tablegen problems. Because
  // the bit ops theoretically have a second result due to the implicit def of
  // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
  // that is easy by disabling the check. The result works, but uses a
  // nonsensical sreg32orlds_and_sreg_1 regclass.
  //
  // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
  // the variadic REG_SEQUENCE operands.

  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
      MRI->getType(Dst) != LLT::scalar(64))
    return false;

  // Look through a G_FABS feeding this fneg so fneg(fabs(x)) becomes a
  // single sign-bit set (OR) instead of a clear followed by a toggle.
  Register Src = MI.getOperand(1).getReg();
  MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
  if (Fabs)
    Src = Fabs->getOperand(1).getReg();

  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
    return false;

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

  // Split the 64-bit source; only the high half (which holds the sign bit)
  // needs to be modified.
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
      .addReg(Src, 0, AMDGPU::sub0);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
      .addReg(Src, 0, AMDGPU::sub1);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
      .addImm(0x80000000);

  // Set or toggle sign bit.
  unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
  BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
      .addReg(HiReg)
      .addReg(ConstReg)
      .setOperandDead(3); // Dead scc
  // Recombine the untouched low half with the modified high half.
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
      .addReg(LoReg)
      .addImm(AMDGPU::sub0)
      .addReg(OpReg)
      .addImm(AMDGPU::sub1);
  MI.eraseFromParent();
  return true;
}
2817
2818// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2819bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2820 Register Dst = MI.getOperand(0).getReg();
2821 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2822 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2823 MRI->getType(Dst) != LLT::scalar(64))
2824 return false;
2825
2826 Register Src = MI.getOperand(1).getReg();
2827 MachineBasicBlock *BB = MI.getParent();
2828 const DebugLoc &DL = MI.getDebugLoc();
2829 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2830 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2831 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2832 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2833
2834 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2835 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2836 return false;
2837
2838 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2839 .addReg(Src, 0, AMDGPU::sub0);
2840 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2841 .addReg(Src, 0, AMDGPU::sub1);
2842 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2843 .addImm(0x7fffffff);
2844
2845 // Clear sign bit.
2846 // TODO: Should this used S_BITSET0_*?
2847 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2848 .addReg(HiReg)
2849 .addReg(ConstReg)
2850 .setOperandDead(3); // Dead scc
2851 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2852 .addReg(LoReg)
2853 .addImm(AMDGPU::sub0)
2854 .addReg(OpReg)
2855 .addImm(AMDGPU::sub1);
2856
2857 MI.eraseFromParent();
2858 return true;
2859}
2860
2861static bool isConstant(const MachineInstr &MI) {
2862 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2863}
2864
2865void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2866 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2867
2868 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2869 const MachineInstr *PtrMI =
2870 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2871
2872 assert(PtrMI);
2873
2874 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2875 return;
2876
2877 GEPInfo GEPInfo;
2878
2879 for (unsigned i = 1; i != 3; ++i) {
2880 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2881 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2882 assert(OpDef);
2883 if (i == 2 && isConstant(*OpDef)) {
2884 // TODO: Could handle constant base + variable offset, but a combine
2885 // probably should have commuted it.
2886 assert(GEPInfo.Imm == 0);
2887 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2888 continue;
2889 }
2890 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2891 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2892 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2893 else
2894 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2895 }
2896
2897 AddrInfo.push_back(GEPInfo);
2898 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2899}
2900
2901bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2902 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2903}
2904
bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
  // Report whether this memory access is provably wave-uniform (eligible for
  // scalar memory selection). Requires exactly one memory operand.
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const Value *Ptr = MMO->getValue();

  // UndefValue means this is a load of a kernel input. These are uniform.
  // Sometimes LDS instructions have constant pointers.
  // If Ptr is null, then that means this mem operand contains a
  // PseudoSourceValue like GOT.
  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

    // NOTE(review): the guarding condition for this return is not visible in
    // this rendering — confirm against upstream before relying on it.
    return true;

  // G_PREFETCH counts as uniform only when its address operand is on the
  // SGPR bank.
  if (MI.getOpcode() == AMDGPU::G_PREFETCH)
    return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
           AMDGPU::SGPRRegBankID;

  // Frontends can annotate known-uniform accesses with !amdgpu.uniform.
  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}
2930
2931bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2932 for (const GEPInfo &GEPInfo : AddrInfo) {
2933 if (!GEPInfo.VgprParts.empty())
2934 return true;
2935 }
2936 return false;
2937}
2938
void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
  // Ensure M0 is initialized (to all-ones) before a DS access on subtargets
  // that require it.
  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
  unsigned AS = PtrTy.getAddressSpace();
      // NOTE(review): the first half of this condition is not visible in this
      // rendering — presumably it tests AS against the LDS address space;
      // confirm against upstream.
      STI.ldsRequiresM0Init()) {
    MachineBasicBlock *BB = I.getParent();

    // If DS instructions require M0 initialization, insert it before selecting.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(-1);
  }
}
2951
bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
    MachineInstr &I) const {
  // Set up M0 first (needed if the access may touch LDS), then defer the
  // actual opcode choice to the TableGen-generated selector.
  initM0(I);
  return selectImpl(I, *CoverageInfo);
}
2957
2959 if (Reg.isPhysical())
2960 return false;
2961
2962 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2963 const unsigned Opcode = MI.getOpcode();
2964
2965 if (Opcode == AMDGPU::COPY)
2966 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2967
2968 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2969 Opcode == AMDGPU::G_XOR)
2970 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2971 isVCmpResult(MI.getOperand(2).getReg(), MRI);
2972
2973 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
2974 return GI->is(Intrinsic::amdgcn_class);
2975
2976 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2977}
2978
2979bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2980 MachineBasicBlock *BB = I.getParent();
2981 MachineOperand &CondOp = I.getOperand(0);
2982 Register CondReg = CondOp.getReg();
2983 const DebugLoc &DL = I.getDebugLoc();
2984
2985 unsigned BrOpcode;
2986 Register CondPhysReg;
2987 const TargetRegisterClass *ConstrainRC;
2988
2989 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2990 // whether the branch is uniform when selecting the instruction. In
2991 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2992 // RegBankSelect knows what it's doing if the branch condition is scc, even
2993 // though it currently does not.
2994 if (!isVCC(CondReg, *MRI)) {
2995 if (MRI->getType(CondReg) != LLT::scalar(32))
2996 return false;
2997
2998 CondPhysReg = AMDGPU::SCC;
2999 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3000 ConstrainRC = &AMDGPU::SReg_32RegClass;
3001 } else {
3002 // FIXME: Should scc->vcc copies and with exec?
3003
3004 // Unless the value of CondReg is a result of a V_CMP* instruction then we
3005 // need to insert an and with exec.
3006 if (!isVCmpResult(CondReg, *MRI)) {
3007 const bool Is64 = STI.isWave64();
3008 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3009 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3010
3011 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
3012 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
3013 .addReg(CondReg)
3014 .addReg(Exec)
3015 .setOperandDead(3); // Dead scc
3016 CondReg = TmpReg;
3017 }
3018
3019 CondPhysReg = TRI.getVCC();
3020 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3021 ConstrainRC = TRI.getBoolRC();
3022 }
3023
3024 if (!MRI->getRegClassOrNull(CondReg))
3025 MRI->setRegClass(CondReg, ConstrainRC);
3026
3027 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
3028 .addReg(CondReg);
3029 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
3030 .addMBB(I.getOperand(1).getMBB());
3031
3032 I.eraseFromParent();
3033 return true;
3034}
3035
3036bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3037 MachineInstr &I) const {
3038 Register DstReg = I.getOperand(0).getReg();
3039 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3040 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3041 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3042 if (IsVGPR)
3043 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3044
3045 return RBI.constrainGenericRegister(
3046 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3047}
3048
bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
  // Select G_PTRMASK (pointer & mask) to AND instructions, eliding the AND
  // for any 32-bit half of a 64-bit pointer whose mask bits are known-ones.
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  Register MaskReg = I.getOperand(2).getReg();
  LLT Ty = MRI->getType(DstReg);
  LLT MaskTy = MRI->getType(MaskReg);
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
  if (DstRB != SrcRB) // Should only happen for hand written MIR.
    return false;

  // Try to avoid emitting a bit operation when we only need to touch half of
  // the 64-bit pointer.
  APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64);
  const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
  const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);

  const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
  const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;

  if (!IsVGPR && Ty.getSizeInBits() == 64 &&
      !CanCopyLow32 && !CanCopyHi32) {
    // Scalar 64-bit case where both halves are really needed: a single
    // S_AND_B64 covers the whole pointer.
    auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
      .addReg(SrcReg)
      .addReg(MaskReg)
      .setOperandDead(3); // Dead scc
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  }

  unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
  const TargetRegisterClass &RegRC
    = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;

  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
  const TargetRegisterClass *MaskRC =
      TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
    return false;

  if (Ty.getSizeInBits() == 32) {
    // 32-bit pointers take a single AND on the appropriate bank.
    assert(MaskTy.getSizeInBits() == 32 &&
           "ptrmask should have been narrowed during legalize");

    auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
      .addReg(SrcReg)
      .addReg(MaskReg);

    if (!IsVGPR)
      NewOp.setOperandDead(3); // Dead scc
    I.eraseFromParent();
    return true;
  }

  Register HiReg = MRI->createVirtualRegister(&RegRC);
  Register LoReg = MRI->createVirtualRegister(&RegRC);

  // Extract the subregisters from the source pointer.
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
    .addReg(SrcReg, 0, AMDGPU::sub0);
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
    .addReg(SrcReg, 0, AMDGPU::sub1);

  Register MaskedLo, MaskedHi;

  if (CanCopyLow32) {
    // If all the bits in the low half are 1, we only need a copy for it.
    MaskedLo = LoReg;
  } else {
    // Extract the mask subregister and apply the and.
    Register MaskLo = MRI->createVirtualRegister(&RegRC);
    MaskedLo = MRI->createVirtualRegister(&RegRC);

    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
      .addReg(MaskReg, 0, AMDGPU::sub0);
    BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
      .addReg(LoReg)
      .addReg(MaskLo);
  }

  if (CanCopyHi32) {
    // If all the bits in the high half are 1, we only need a copy for it.
    MaskedHi = HiReg;
  } else {
    Register MaskHi = MRI->createVirtualRegister(&RegRC);
    MaskedHi = MRI->createVirtualRegister(&RegRC);

    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
      .addReg(MaskReg, 0, AMDGPU::sub1);
    BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
      .addReg(HiReg)
      .addReg(MaskHi);
  }

  // Reassemble the (possibly partially masked) halves into the result.
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(MaskedLo)
    .addImm(AMDGPU::sub0)
    .addReg(MaskedHi)
    .addImm(AMDGPU::sub1);
  I.eraseFromParent();
  return true;
}
3160
3161/// Return the register to use for the index value, and the subregister to use
3162/// for the indirectly accessed register.
3163static std::pair<Register, unsigned>
3165 const TargetRegisterClass *SuperRC, Register IdxReg,
3166 unsigned EltSize, GISelKnownBits &KnownBits) {
3167 Register IdxBaseReg;
3168 int Offset;
3169
3170 std::tie(IdxBaseReg, Offset) =
3172 if (IdxBaseReg == AMDGPU::NoRegister) {
3173 // This will happen if the index is a known constant. This should ordinarily
3174 // be legalized out, but handle it as a register just in case.
3175 assert(Offset == 0);
3176 IdxBaseReg = IdxReg;
3177 }
3178
3179 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3180
3181 // Skip out of bounds offsets, or else we would end up using an undefined
3182 // register.
3183 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3184 return std::pair(IdxReg, SubRegs[0]);
3185 return std::pair(IdxBaseReg, SubRegs[Offset]);
3186}
3187
bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
    MachineInstr &MI) const {
  // Select a dynamic (variable-index) vector extract using M0-relative moves
  // (S_MOVRELS / V_MOVRELS) or GPR index mode when available.
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register IdxReg = MI.getOperand(2).getReg();

  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);

  // The index must be scalar. If it wasn't RegBankSelect should have moved this
  // into a waterfall loop.
  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
  if (!SrcRC || !DstRC)
    return false;
  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const bool Is64 = DstTy.getSizeInBits() == 64;

  // Fold any known-constant part of the index into the subregister choice.
  unsigned SubReg;
  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
      *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB);

  if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
    // Scalar source: M0 selects the element for S_MOVRELS.
    if (DstTy.getSizeInBits() != 32 && !Is64)
      return false;

    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);

    unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
    // The implicit use keeps the whole source vector alive.
    BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
      .addReg(SrcReg, 0, SubReg)
      .addReg(SrcReg, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  // Vector source: only 32-bit elements are handled here.
  if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
    return false;

  if (!STI.useVGPRIndexMode()) {
    // Classic path: M0 drives V_MOVRELS.
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
      .addReg(SrcReg, 0, SubReg)
      .addReg(SrcReg, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  // GPR index mode: use the indirect-read pseudo.
  const MCInstrDesc &GPRIDXDesc =
      TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
  BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
      .addReg(SrcReg)
      .addReg(IdxReg)
      .addImm(SubReg);

  MI.eraseFromParent();
  return true;
}
3263
// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
    MachineInstr &MI) const {
  // Select a dynamic (variable-index) vector insert using M0-relative
  // register writes or GPR index mode when available.
  Register DstReg = MI.getOperand(0).getReg();
  Register VecReg = MI.getOperand(1).getReg();
  Register ValReg = MI.getOperand(2).getReg();
  Register IdxReg = MI.getOperand(3).getReg();

  LLT VecTy = MRI->getType(DstReg);
  LLT ValTy = MRI->getType(ValReg);
  unsigned VecSize = VecTy.getSizeInBits();
  unsigned ValSize = ValTy.getSizeInBits();

  const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
  const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);

  assert(VecTy.getElementType() == ValTy);

  // The index must be scalar. If it wasn't RegBankSelect should have moved this
  // into a waterfall loop.
  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  const TargetRegisterClass *VecRC =
      TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
  const TargetRegisterClass *ValRC =
      TRI.getRegClassForTypeOnBank(ValTy, *ValRB);

  if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
      !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
      !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  // Only 32-bit elements are supported for VGPR vectors.
  if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
    return false;

  // Fold any known-constant part of the index into the subregister choice.
  unsigned SubReg;
  std::tie(IdxReg, SubReg) =
      computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB);

  const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
                         STI.useVGPRIndexMode();

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  if (!IndexMode) {
    // Classic path: M0 selects the written lane for the movrel pseudo.
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);

    const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
        VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
    BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
        .addReg(VecReg)
        .addReg(ValReg)
        .addImm(SubReg);
    MI.eraseFromParent();
    return true;
  }

  // GPR index mode: use the indirect-write pseudo.
  const MCInstrDesc &GPRIDXDesc =
      TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
  BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
      .addReg(VecReg)
      .addReg(ValReg)
      .addReg(IdxReg)
      .addImm(SubReg);

  MI.eraseFromParent();
  return true;
}
3337
bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
  // Select raw/struct buffer-load-to-LDS intrinsics. The LDS destination
  // address travels in M0; the opcode depends on the per-lane transfer size
  // and on which of vindex/voffset are actually present.
  unsigned Opc;
  unsigned Size = MI.getOperand(3).getImm();

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == 9;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(4).getReg();
    OpOffset = 1;
  }

  // A voffset that is a known constant zero can be dropped, selecting a
  // non-OFFEN form.
  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  std::optional<ValueAndVReg> MaybeVOffset =
  const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();

  switch (Size) {
  default:
    return false;
  case 1:
    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
    break;
  case 2:
    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
    break;
  case 4:
    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
    break;
  case 12:
    // 96/128-bit transfers need dedicated hardware support.
    if (!Subtarget->hasLDSLoadB96_B128())
      return false;

    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
    break;
  case 16:
    if (!Subtarget->hasLDSLoadB96_B128())
      return false;

    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
    break;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  // M0 holds the LDS base address the loaded data is written to.
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .add(MI.getOperand(2));

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));

  if (HasVIndex && HasVOffset) {
    // BOTHEN forms take vindex and voffset packed into a 64-bit VGPR pair.
    Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
    BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
      .addReg(VIndex)
      .addImm(AMDGPU::sub0)
      .addReg(VOffset)
      .addImm(AMDGPU::sub1);

    MIB.addReg(IdxReg);
  } else if (HasVIndex) {
    MIB.addReg(VIndex);
  } else if (HasVOffset) {
    MIB.addReg(VOffset);
  }

  MIB.add(MI.getOperand(1)); // rsrc
  MIB.add(MI.getOperand(5 + OpOffset)); // soffset
  MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
  // Cache policy and swizzle bits are encoded differently from GFX12 on.
  bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
  unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
  MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
                                : AMDGPU::CPol::ALL_pregfx12)); // cpol
  MIB.addImm(
      Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
          ? 1
          : 0); // swz

  // This instruction both loads from the buffer and stores to LDS, so it
  // gets both a load and a store memory operand.
  MachineMemOperand *LoadMMO = *MI.memoperands_begin();
  MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
  LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
  MachinePointerInfo StorePtrI = LoadPtrI;
  StorePtrI.V = nullptr;

  auto F = LoadMMO->getFlags() &
  LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
                                     Size, LoadMMO->getBaseAlign());

  MachineMemOperand *StoreMMO =
                             sizeof(int32_t), LoadMMO->getBaseAlign());

  MIB.setMemRefs({LoadMMO, StoreMMO});

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
3453
3454/// Match a zero extend from a 32-bit value to 64-bits.
3456 Register ZExtSrc;
3457 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3458 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3459
3460 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3461 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3462 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3463 return Register();
3464
3465 assert(Def->getNumOperands() == 3 &&
3466 MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3467 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3468 return Def->getOperand(1).getReg();
3469 }
3470
3471 return Register();
3472}
3473
bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
  // Select the global-load-to-LDS intrinsic. The LDS destination address
  // travels in M0; the opcode depends on the per-lane transfer size.
  unsigned Opc;
  unsigned Size = MI.getOperand(3).getImm();

  switch (Size) {
  default:
    return false;
  case 1:
    Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
    break;
  case 2:
    Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
    break;
  case 4:
    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
    break;
  case 12:
    // 96/128-bit transfers need dedicated hardware support.
    if (!Subtarget->hasLDSLoadB96_B128())
      return false;
    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
    break;
  case 16:
    if (!Subtarget->hasLDSLoadB96_B128())
      return false;
    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
    break;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  // M0 holds the LDS base address the loaded data is written to.
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .add(MI.getOperand(2));

  Register Addr = MI.getOperand(1).getReg();
  Register VOffset;
  // Try to split SAddr and VOffset. Global and LDS pointers share the same
  // immediate offset, so we cannot use a regular SelectGlobalSAddr().
  if (!isSGPR(Addr)) {
    auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
    if (isSGPR(AddrDef->Reg)) {
      Addr = AddrDef->Reg;
    } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
      // An (SGPR base) + (zero-extended 32-bit offset) address can use the
      // SADDR form with the offset in a VGPR.
      Register SAddr =
          getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
      if (isSGPR(SAddr)) {
        Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
        if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
          Addr = SAddr;
          VOffset = Off;
        }
      }
    }
  }

  if (isSGPR(Addr)) {
    Opc = AMDGPU::getGlobalSaddrOp(Opc);
    // The SADDR form still requires a VGPR offset operand; materialize a
    // zero if no offset was split off.
    if (!VOffset) {
      VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
        .addImm(0);
    }
  }

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
    .addReg(Addr);

  if (isSGPR(Addr))
    MIB.addReg(VOffset);

  MIB.add(MI.getOperand(4)) // offset
     .add(MI.getOperand(5)); // cpol

  // This instruction both loads from global memory and stores to LDS, so it
  // gets both a load and a store memory operand.
  MachineMemOperand *LoadMMO = *MI.memoperands_begin();
  MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
  LoadPtrI.Offset = MI.getOperand(4).getImm();
  MachinePointerInfo StorePtrI = LoadPtrI;
  auto F = LoadMMO->getFlags() &
  LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
                                     Size, LoadMMO->getBaseAlign());
  MachineMemOperand *StoreMMO =
                             sizeof(int32_t), Align(4));

  MIB.setMemRefs({LoadMMO, StoreMMO});

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
3565
3566bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
3567 MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3568 MI.removeOperand(1);
3569 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3570 return true;
3571}
3572
// FIXME: This should be removed and let the patterns select. We just need the
// AGPR/VGPR combination versions.
bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
  // Map each sparse-MFMA (smfmac) intrinsic to its _e64 machine opcode, then
  // morph the intrinsic call into that instruction in place.
  unsigned Opc;
  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
    Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
    Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
    Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
    Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
    break;
  default:
    llvm_unreachable("unhandled smfmac intrinsic");
  }

  // The machine instruction expects the accumulator input at the end of the
  // operand list: save it, drop it and the intrinsic ID, then re-append it.
  auto VDst_In = MI.getOperand(4);

  MI.setDesc(TII.get(Opc));
  MI.removeOperand(4); // VDst_In
  MI.removeOperand(1); // Intrinsic ID
  MI.addOperand(VDst_In); // Readd VDst_In to the end
  MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
  return true;
}
3675
3676bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3677 MachineInstr &MI, Intrinsic::ID IntrID) const {
3678 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3679 !Subtarget->hasPermlane16Swap())
3680 return false;
3681 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3682 !Subtarget->hasPermlane32Swap())
3683 return false;
3684
3685 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3686 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3687 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3688
3689 MI.removeOperand(2);
3690 MI.setDesc(TII.get(Opcode));
3691 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3692
3693 MachineOperand &FI = MI.getOperand(4);
3695
3696 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3697}
3698
3699bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3700 Register DstReg = MI.getOperand(0).getReg();
3701 Register SrcReg = MI.getOperand(1).getReg();
3702 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3703 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3704 MachineBasicBlock *MBB = MI.getParent();
3705 const DebugLoc &DL = MI.getDebugLoc();
3706
3707 if (IsVALU) {
3708 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3709 .addImm(Subtarget->getWavefrontSizeLog2())
3710 .addReg(SrcReg);
3711 } else {
3712 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3713 .addReg(SrcReg)
3714 .addImm(Subtarget->getWavefrontSizeLog2())
3715 .setOperandDead(3); // Dead scc
3716 }
3717
3718 const TargetRegisterClass &RC =
3719 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3720 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3721 return false;
3722
3723 MI.eraseFromParent();
3724 return true;
3725}
3726
3727// Match BITOP3 operation and return a number of matched instructions plus
3728// truth table.
3729static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
3731 const MachineRegisterInfo &MRI) {
3732 unsigned NumOpcodes = 0;
3733 uint8_t LHSBits, RHSBits;
3734
3735 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
3736 // Define truth table given Src0, Src1, Src2 bits permutations:
3737 // 0 0 0
3738 // 0 0 1
3739 // 0 1 0
3740 // 0 1 1
3741 // 1 0 0
3742 // 1 0 1
3743 // 1 1 0
3744 // 1 1 1
3745 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
3746
3747 if (mi_match(Op, MRI, m_AllOnesInt())) {
3748 Bits = 0xff;
3749 return true;
3750 }
3751 if (mi_match(Op, MRI, m_ZeroInt())) {
3752 Bits = 0;
3753 return true;
3754 }
3755
3756 for (unsigned I = 0; I < Src.size(); ++I) {
3757 // Try to find existing reused operand
3758 if (Src[I] == Op) {
3759 Bits = SrcBits[I];
3760 return true;
3761 }
3762 // Try to replace parent operator
3763 if (Src[I] == R) {
3764 Bits = SrcBits[I];
3765 Src[I] = Op;
3766 return true;
3767 }
3768 }
3769
3770 if (Src.size() == 3) {
3771 // No room left for operands. Try one last time, there can be a 'not' of
3772 // one of our source operands. In this case we can compute the bits
3773 // without growing Src vector.
3774 Register LHS;
3775 if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
3777 for (unsigned I = 0; I < Src.size(); ++I) {
3778 if (Src[I] == LHS) {
3779 Bits = ~SrcBits[I];
3780 return true;
3781 }
3782 }
3783 }
3784
3785 return false;
3786 }
3787
3788 Bits = SrcBits[Src.size()];
3789 Src.push_back(Op);
3790 return true;
3791 };
3792
3793 MachineInstr *MI = MRI.getVRegDef(R);
3794 switch (MI->getOpcode()) {
3795 case TargetOpcode::G_AND:
3796 case TargetOpcode::G_OR:
3797 case TargetOpcode::G_XOR: {
3798 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
3799 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
3800
3801 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
3802 if (!getOperandBits(LHS, LHSBits) ||
3803 !getOperandBits(RHS, RHSBits)) {
3804 Src = Backup;
3805 return std::make_pair(0, 0);
3806 }
3807
3808 // Recursion is naturally limited by the size of the operand vector.
3809 auto Op = BitOp3_Op(LHS, Src, MRI);
3810 if (Op.first) {
3811 NumOpcodes += Op.first;
3812 LHSBits = Op.second;
3813 }
3814
3815 Op = BitOp3_Op(RHS, Src, MRI);
3816 if (Op.first) {
3817 NumOpcodes += Op.first;
3818 RHSBits = Op.second;
3819 }
3820 break;
3821 }
3822 default:
3823 return std::make_pair(0, 0);
3824 }
3825
3826 uint8_t TTbl;
3827 switch (MI->getOpcode()) {
3828 case TargetOpcode::G_AND:
3829 TTbl = LHSBits & RHSBits;
3830 break;
3831 case TargetOpcode::G_OR:
3832 TTbl = LHSBits | RHSBits;
3833 break;
3834 case TargetOpcode::G_XOR:
3835 TTbl = LHSBits ^ RHSBits;
3836 break;
3837 default:
3838 break;
3839 }
3840
3841 return std::make_pair(NumOpcodes + 1, TTbl);
3842}
3843
3844bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
3845 if (!Subtarget->hasBitOp3Insts())
3846 return false;
3847
3848 Register DstReg = MI.getOperand(0).getReg();
3849 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3850 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3851 if (!IsVALU)
3852 return false;
3853
3855 uint8_t TTbl;
3856 unsigned NumOpcodes;
3857
3858 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
3859
3860 // Src.empty() case can happen if all operands are all zero or all ones.
3861 // Normally it shall be optimized out before reaching this.
3862 if (NumOpcodes < 2 || Src.empty())
3863 return false;
3864
3865 const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
3866 if (NumOpcodes == 2 && IsB32) {
3867 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
3868 // asm more readable. This cannot be modeled with AddedComplexity because
3869 // selector does not know how many operations did we match.
3870 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
3871 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
3872 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
3873 return false;
3874 } else if (NumOpcodes < 4) {
3875 // For a uniform case threshold should be higher to account for moves
3876 // between VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be
3877 // in SGPRs and a readtfirstlane after.
3878 return false;
3879 }
3880
3881 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
3882 unsigned CBL = STI.getConstantBusLimit(Opc);
3883 MachineBasicBlock *MBB = MI.getParent();
3884 const DebugLoc &DL = MI.getDebugLoc();
3885
3886 for (unsigned I = 0; I < Src.size(); ++I) {
3887 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
3888 if (RB->getID() != AMDGPU::SGPRRegBankID)
3889 continue;
3890 if (CBL > 0) {
3891 --CBL;
3892 continue;
3893 }
3894 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3895 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
3896 .addReg(Src[I]);
3897 Src[I] = NewReg;
3898 }
3899
3900 // Last operand can be ignored, turning a ternary operation into a binary.
3901 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
3902 // 'c' with 'a' here without changing the answer. In some pathological
3903 // cases it should be possible to get an operation with a single operand
3904 // too if optimizer would not catch it.
3905 while (Src.size() < 3)
3906 Src.push_back(Src[0]);
3907
3908 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
3909 if (!IsB32)
3910 MIB.addImm(0); // src_mod0
3911 MIB.addReg(Src[0]);
3912 if (!IsB32)
3913 MIB.addImm(0); // src_mod1
3914 MIB.addReg(Src[1]);
3915 if (!IsB32)
3916 MIB.addImm(0); // src_mod2
3917 MIB.addReg(Src[2])
3918 .addImm(TTbl);
3919 if (!IsB32)
3920 MIB.addImm(0); // op_sel
3921
3922 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3923 MI.eraseFromParent();
3924
3925 return true;
3926}
3927
3928bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
3929 Register SrcReg = MI.getOperand(0).getReg();
3930 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
3931 return false;
3932
3933 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
3934 Register SP =
3936 Register WaveAddr = getWaveAddress(DefMI);
3937 MachineBasicBlock *MBB = MI.getParent();
3938 const DebugLoc &DL = MI.getDebugLoc();
3939
3940 if (!WaveAddr) {
3941 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3942 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
3943 .addReg(SrcReg)
3944 .addImm(Subtarget->getWavefrontSizeLog2())
3945 .setOperandDead(3); // Dead scc
3946 }
3947
3948 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
3949 .addReg(WaveAddr);
3950
3951 MI.eraseFromParent();
3952 return true;
3953}
3954
3956
3957 if (!I.isPreISelOpcode()) {
3958 if (I.isCopy())
3959 return selectCOPY(I);
3960 return true;
3961 }
3962
3963 switch (I.getOpcode()) {
3964 case TargetOpcode::G_AND:
3965 case TargetOpcode::G_OR:
3966 case TargetOpcode::G_XOR:
3967 if (selectBITOP3(I))
3968 return true;
3969 if (selectImpl(I, *CoverageInfo))
3970 return true;
3971 return selectG_AND_OR_XOR(I);
3972 case TargetOpcode::G_ADD:
3973 case TargetOpcode::G_SUB:
3974 case TargetOpcode::G_PTR_ADD:
3975 if (selectImpl(I, *CoverageInfo))
3976 return true;
3977 return selectG_ADD_SUB(I);
3978 case TargetOpcode::G_UADDO:
3979 case TargetOpcode::G_USUBO:
3980 case TargetOpcode::G_UADDE:
3981 case TargetOpcode::G_USUBE:
3982 return selectG_UADDO_USUBO_UADDE_USUBE(I);
3983 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3984 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3985 return selectG_AMDGPU_MAD_64_32(I);
3986 case TargetOpcode::G_INTTOPTR:
3987 case TargetOpcode::G_BITCAST:
3988 case TargetOpcode::G_PTRTOINT:
3989 case TargetOpcode::G_FREEZE:
3990 return selectCOPY(I);
3991 case TargetOpcode::G_FNEG:
3992 if (selectImpl(I, *CoverageInfo))
3993 return true;
3994 return selectG_FNEG(I);
3995 case TargetOpcode::G_FABS:
3996 if (selectImpl(I, *CoverageInfo))
3997 return true;
3998 return selectG_FABS(I);
3999 case TargetOpcode::G_EXTRACT:
4000 return selectG_EXTRACT(I);
4001 case TargetOpcode::G_MERGE_VALUES:
4002 case TargetOpcode::G_CONCAT_VECTORS:
4003 return selectG_MERGE_VALUES(I);
4004 case TargetOpcode::G_UNMERGE_VALUES:
4005 return selectG_UNMERGE_VALUES(I);
4006 case TargetOpcode::G_BUILD_VECTOR:
4007 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4008 return selectG_BUILD_VECTOR(I);
4009 case TargetOpcode::G_IMPLICIT_DEF:
4010 return selectG_IMPLICIT_DEF(I);
4011 case TargetOpcode::G_INSERT:
4012 return selectG_INSERT(I);
4013 case TargetOpcode::G_INTRINSIC:
4014 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4015 return selectG_INTRINSIC(I);
4016 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4017 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4018 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4019 case TargetOpcode::G_ICMP:
4020 case TargetOpcode::G_FCMP:
4021 if (selectG_ICMP_or_FCMP(I))
4022 return true;
4023 return selectImpl(I, *CoverageInfo);
4024 case TargetOpcode::G_LOAD:
4025 case TargetOpcode::G_ZEXTLOAD:
4026 case TargetOpcode::G_SEXTLOAD:
4027 case TargetOpcode::G_STORE:
4028 case TargetOpcode::G_ATOMIC_CMPXCHG:
4029 case TargetOpcode::G_ATOMICRMW_XCHG:
4030 case TargetOpcode::G_ATOMICRMW_ADD:
4031 case TargetOpcode::G_ATOMICRMW_SUB:
4032 case TargetOpcode::G_ATOMICRMW_AND:
4033 case TargetOpcode::G_ATOMICRMW_OR:
4034 case TargetOpcode::G_ATOMICRMW_XOR:
4035 case TargetOpcode::G_ATOMICRMW_MIN:
4036 case TargetOpcode::G_ATOMICRMW_MAX:
4037 case TargetOpcode::G_ATOMICRMW_UMIN:
4038 case TargetOpcode::G_ATOMICRMW_UMAX:
4039 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4040 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4041 case TargetOpcode::G_ATOMICRMW_FADD:
4042 case TargetOpcode::G_ATOMICRMW_FMIN:
4043 case TargetOpcode::G_ATOMICRMW_FMAX:
4044 return selectG_LOAD_STORE_ATOMICRMW(I);
4045 case TargetOpcode::G_SELECT:
4046 return selectG_SELECT(I);
4047 case TargetOpcode::G_TRUNC:
4048 return selectG_TRUNC(I);
4049 case TargetOpcode::G_SEXT:
4050 case TargetOpcode::G_ZEXT:
4051 case TargetOpcode::G_ANYEXT:
4052 case TargetOpcode::G_SEXT_INREG:
4053 // This is a workaround. For extension from type i1, `selectImpl()` uses
4054 // patterns from TD file and generates an illegal VGPR to SGPR COPY as type
4055 // i1 can only be hold in a SGPR class.
4056 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
4057 selectImpl(I, *CoverageInfo))
4058 return true;
4059 return selectG_SZA_EXT(I);
4060 case TargetOpcode::G_FPEXT:
4061 if (selectG_FPEXT(I))
4062 return true;
4063 return selectImpl(I, *CoverageInfo);
4064 case TargetOpcode::G_BRCOND:
4065 return selectG_BRCOND(I);
4066 case TargetOpcode::G_GLOBAL_VALUE:
4067 return selectG_GLOBAL_VALUE(I);
4068 case TargetOpcode::G_PTRMASK:
4069 return selectG_PTRMASK(I);
4070 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4071 return selectG_EXTRACT_VECTOR_ELT(I);
4072 case TargetOpcode::G_INSERT_VECTOR_ELT:
4073 return selectG_INSERT_VECTOR_ELT(I);
4074 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4075 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4076 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4077 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4078 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4081 assert(Intr && "not an image intrinsic with image pseudo");
4082 return selectImageIntrinsic(I, Intr);
4083 }
4084 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
4085 return selectBVHIntrinsic(I);
4086 case AMDGPU::G_SBFX:
4087 case AMDGPU::G_UBFX:
4088 return selectG_SBFX_UBFX(I);
4089 case AMDGPU::G_SI_CALL:
4090 I.setDesc(TII.get(AMDGPU::SI_CALL));
4091 return true;
4092 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4093 return selectWaveAddress(I);
4094 case AMDGPU::G_STACKRESTORE:
4095 return selectStackRestore(I);
4096 case AMDGPU::G_PHI:
4097 return selectPHI(I);
4098 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4099 return selectCOPY_SCC_VCC(I);
4100 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4101 return selectCOPY_VCC_SCC(I);
4102 case AMDGPU::G_AMDGPU_READANYLANE:
4103 return selectReadAnyLane(I);
4104 case TargetOpcode::G_CONSTANT:
4105 case TargetOpcode::G_FCONSTANT:
4106 default:
4107 return selectImpl(I, *CoverageInfo);
4108 }
4109 return false;
4110}
4111
4113AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4114 return {{
4115 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4116 }};
4117
4118}
4119
4120std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4121 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4122 unsigned Mods = 0;
4123 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4124
4125 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4126 Src = MI->getOperand(1).getReg();
4127 Mods |= SISrcMods::NEG;
4128 MI = getDefIgnoringCopies(Src, *MRI);
4129 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4130 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4131 // denormal mode, but we're implicitly canonicalizing in a source operand.
4132 const ConstantFP *LHS =
4133 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4134 if (LHS && LHS->isZero()) {
4135 Mods |= SISrcMods::NEG;
4136 Src = MI->getOperand(2).getReg();
4137 }
4138 }
4139
4140 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4141 Src = MI->getOperand(1).getReg();
4142 Mods |= SISrcMods::ABS;
4143 }
4144
4145 if (OpSel)
4146 Mods |= SISrcMods::OP_SEL_0;
4147
4148 return std::pair(Src, Mods);
4149}
4150
4151Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4152 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4153 bool ForceVGPR) const {
4154 if ((Mods != 0 || ForceVGPR) &&
4155 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4156
4157 // If we looked through copies to find source modifiers on an SGPR operand,
4158 // we now have an SGPR register source. To avoid potentially violating the
4159 // constant bus restriction, we need to insert a copy to a VGPR.
4160 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4161 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4162 TII.get(AMDGPU::COPY), VGPRSrc)
4163 .addReg(Src);
4164 Src = VGPRSrc;
4165 }
4166
4167 return Src;
4168}
4169
4170///
4171/// This will select either an SGPR or VGPR operand and will save us from
4172/// having to write an extra tablegen pattern.
4174AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4175 return {{
4176 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4177 }};
4178}
4179
4181AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4182 Register Src;
4183 unsigned Mods;
4184 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4185
4186 return {{
4187 [=](MachineInstrBuilder &MIB) {
4188 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4189 },
4190 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4191 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4192 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4193 }};
4194}
4195
4197AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4198 Register Src;
4199 unsigned Mods;
4200 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4201 /*IsCanonicalizing=*/true,
4202 /*AllowAbs=*/false);
4203
4204 return {{
4205 [=](MachineInstrBuilder &MIB) {
4206 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4207 },
4208 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4209 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4210 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4211 }};
4212}
4213
4215AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4216 return {{
4217 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4218 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4219 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4220 }};
4221}
4222
4224AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4225 Register Src;
4226 unsigned Mods;
4227 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4228
4229 return {{
4230 [=](MachineInstrBuilder &MIB) {
4231 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4232 },
4233 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4234 }};
4235}
4236
4238AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4239 MachineOperand &Root) const {
4240 Register Src;
4241 unsigned Mods;
4242 std::tie(Src, Mods) =
4243 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4244
4245 return {{
4246 [=](MachineInstrBuilder &MIB) {
4247 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4248 },
4249 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4250 }};
4251}
4252
4254AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4255 Register Src;
4256 unsigned Mods;
4257 std::tie(Src, Mods) =
4258 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4259 /*AllowAbs=*/false);
4260
4261 return {{
4262 [=](MachineInstrBuilder &MIB) {
4263 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4264 },
4265 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4266 }};
4267}
4268
4270AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4271 Register Reg = Root.getReg();
4272 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4273 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4274 return {};
4275 return {{
4276 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4277 }};
4278}
4279
4280std::pair<Register, unsigned>
4281AMDGPUInstructionSelector::selectVOP3PModsImpl(
4282 Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
4283 unsigned Mods = 0;
4284 MachineInstr *MI = MRI.getVRegDef(Src);
4285
4286 if (MI->getOpcode() == AMDGPU::G_FNEG &&
4287 // It's possible to see an f32 fneg here, but unlikely.
4288 // TODO: Treat f32 fneg as only high bit.
4289 MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
4291 Src = MI->getOperand(1).getReg();
4292 MI = MRI.getVRegDef(Src);
4293 }
4294
4295 // TODO: Handle G_FSUB 0 as fneg
4296
4297 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
4298 (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
4299
4300 // Packed instructions do not have abs modifiers.
4301 Mods |= SISrcMods::OP_SEL_1;
4302
4303 return std::pair(Src, Mods);
4304}
4305
4307AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
4309 = Root.getParent()->getParent()->getParent()->getRegInfo();
4310
4311 Register Src;
4312 unsigned Mods;
4313 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
4314
4315 return {{
4316 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4317 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4318 }};
4319}
4320
4322AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
4324 = Root.getParent()->getParent()->getParent()->getRegInfo();
4325
4326 Register Src;
4327 unsigned Mods;
4328 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
4329
4330 return {{
4331 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4332 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4333 }};
4334}
4335
4337AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
4338 // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
4339 // Value is in Imm operand as i1 sign extended to int64_t.
4340 // 1(-1) promotes packed values to signed, 0 treats them as unsigned.
4341 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4342 "expected i1 value");
4343 unsigned Mods = SISrcMods::OP_SEL_1;
4344 if (Root.getImm() == -1)
4345 Mods ^= SISrcMods::NEG;
4346 return {{
4347 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4348 }};
4349}
4350
4352AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
4353 MachineOperand &Root) const {
4354 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4355 "expected i1 value");
4356 unsigned Mods = SISrcMods::OP_SEL_1;
4357 if (Root.getImm() != 0)
4358 Mods |= SISrcMods::OP_SEL_0;
4359
4360 return {{
4361 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4362 }};
4363}
4364
4366 MachineInstr *InsertPt,
4368 const TargetRegisterClass *DstRegClass;
4369 switch (Elts.size()) {
4370 case 8:
4371 DstRegClass = &AMDGPU::VReg_256RegClass;
4372 break;
4373 case 4:
4374 DstRegClass = &AMDGPU::VReg_128RegClass;
4375 break;
4376 case 2:
4377 DstRegClass = &AMDGPU::VReg_64RegClass;
4378 break;
4379 default:
4380 llvm_unreachable("unhandled Reg sequence size");
4381 }
4382
4383 MachineIRBuilder B(*InsertPt);
4384 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
4385 .addDef(MRI.createVirtualRegister(DstRegClass));
4386 for (unsigned i = 0; i < Elts.size(); ++i) {
4387 MIB.addReg(Elts[i]);
4389 }
4390 return MIB->getOperand(0).getReg();
4391}
4392
4393static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
4395 MachineInstr *InsertPt,
4397 if (ModOpcode == TargetOpcode::G_FNEG) {
4398 Mods |= SISrcMods::NEG;
4399 // Check if all elements also have abs modifier
4400 SmallVector<Register, 8> NegAbsElts;
4401 for (auto El : Elts) {
4402 Register FabsSrc;
4403 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
4404 break;
4405 NegAbsElts.push_back(FabsSrc);
4406 }
4407 if (Elts.size() != NegAbsElts.size()) {
4408 // Neg
4409 Src = buildRegSequence(Elts, InsertPt, MRI);
4410 } else {
4411 // Neg and Abs
4412 Mods |= SISrcMods::NEG_HI;
4413 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
4414 }
4415 } else {
4416 assert(ModOpcode == TargetOpcode::G_FABS);
4417 // Abs
4418 Mods |= SISrcMods::NEG_HI;
4419 Src = buildRegSequence(Elts, InsertPt, MRI);
4420 }
4421}
4422
4424AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
4425 Register Src = Root.getReg();
4426 unsigned Mods = SISrcMods::OP_SEL_1;
4428
4429 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
4430 assert(BV->getNumSources() > 0);
4431 // Based on first element decide which mod we match, neg or abs
4432 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
4433 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
4434 ? AMDGPU::G_FNEG
4435 : AMDGPU::G_FABS;
4436 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
4437 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
4438 if (ElF32->getOpcode() != ModOpcode)
4439 break;
4440 EltsF32.push_back(ElF32->getOperand(1).getReg());
4441 }
4442
4443 // All elements had ModOpcode modifier
4444 if (BV->getNumSources() == EltsF32.size()) {
4445 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
4446 *MRI);
4447 }
4448 }
4449
4450 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4451 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4452}
4453
4455AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
4456 Register Src = Root.getReg();
4457 unsigned Mods = SISrcMods::OP_SEL_1;
4458 SmallVector<Register, 8> EltsV2F16;
4459
4460 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4461 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4462 Register FNegSrc;
4463 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
4464 break;
4465 EltsV2F16.push_back(FNegSrc);
4466 }
4467
4468 // All elements had ModOpcode modifier
4469 if (CV->getNumSources() == EltsV2F16.size()) {
4470 Mods |= SISrcMods::NEG;
4471 Mods |= SISrcMods::NEG_HI;
4472 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
4473 }
4474 }
4475
4476 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4477 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4478}
4479
4481AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
4482 Register Src = Root.getReg();
4483 unsigned Mods = SISrcMods::OP_SEL_1;
4484 SmallVector<Register, 8> EltsV2F16;
4485
4486 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4487 assert(CV->getNumSources() > 0);
4488 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
4489 // Based on first element decide which mod we match, neg or abs
4490 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
4491 ? AMDGPU::G_FNEG
4492 : AMDGPU::G_FABS;
4493
4494 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4495 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
4496 if (ElV2F16->getOpcode() != ModOpcode)
4497 break;
4498 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
4499 }
4500
4501 // All elements had ModOpcode modifier
4502 if (CV->getNumSources() == EltsV2F16.size()) {
4503 MachineIRBuilder B(*Root.getParent());
4504 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
4505 *MRI);
4506 }
4507 }
4508
4509 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4510 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4511}
4512
4514AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
4515 std::optional<FPValueAndVReg> FPValReg;
4516 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
4517 if (TII.isInlineConstant(FPValReg->Value)) {
4518 return {{[=](MachineInstrBuilder &MIB) {
4519 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
4520 }}};
4521 }
4522 // Non-inlineable splat floats should not fall-through for integer immediate
4523 // checks.
4524 return {};
4525 }
4526
4527 APInt ICst;
4528 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
4529 if (TII.isInlineConstant(ICst)) {
4530 return {
4531 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
4532 }
4533 }
4534
4535 return {};
4536}
4537
4539AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
4540 Register Src =
4541 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4542 unsigned Key = 0;
4543
4544 Register ShiftSrc;
4545 std::optional<ValueAndVReg> ShiftAmt;
4546 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4547 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4548 ShiftAmt->Value.getZExtValue() % 8 == 0) {
4549 Key = ShiftAmt->Value.getZExtValue() / 8;
4550 Src = ShiftSrc;
4551 }
4552
4553 return {{
4554 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4555 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4556 }};
4557}
4558
4560AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
4561
4562 Register Src =
4563 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4564 unsigned Key = 0;
4565
4566 Register ShiftSrc;
4567 std::optional<ValueAndVReg> ShiftAmt;
4568 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4569 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4570 ShiftAmt->Value.getZExtValue() == 16) {
4571 Src = ShiftSrc;
4572 Key = 1;
4573 }
4574
4575 return {{
4576 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4577 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4578 }};
4579}
4580
4582AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
4583 Register Src;
4584 unsigned Mods;
4585 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4586
4587 // FIXME: Handle op_sel
4588 return {{
4589 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4590 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4591 }};
4592}
4593
4595AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
4596 Register Src;
4597 unsigned Mods;
4598 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4599 /*IsCanonicalizing=*/true,
4600 /*AllowAbs=*/false,
4601 /*OpSel=*/false);
4602
4603 return {{
4604 [=](MachineInstrBuilder &MIB) {
4605 MIB.addReg(
4606 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4607 },
4608 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4609 }};
4610}
4611
4613AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
4614 Register Src;
4615 unsigned Mods;
4616 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4617 /*IsCanonicalizing=*/true,
4618 /*AllowAbs=*/false,
4619 /*OpSel=*/true);
4620
4621 return {{
4622 [=](MachineInstrBuilder &MIB) {
4623 MIB.addReg(
4624 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4625 },
4626 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4627 }};
4628}
4629
// Common SMRD addressing-mode matcher. Which of the _IMM / _SGPR /
// _SGPR_IMM forms is requested is encoded by which of \p SOffset and
// \p Offset are non-null; returns true and fills the requested outputs on a
// successful match.
bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
                                                 Register &Base,
                                                 Register *SOffset,
                                                 int64_t *Offset) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
  // then we can select all ptr + 32-bit offsets.
  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*MI, *MRI, AddrInfo);

  if (AddrInfo.empty())
    return false;

  const GEPInfo &GEPI = AddrInfo[0];
  std::optional<int64_t> EncodedImm;

  if (SOffset && Offset) {
    // _SGPR_IMM form: needs an outer (base + imm) whose base is itself
    // (sgpr + zext(s32 sgpr)).
    EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
                                              /*HasSOffset=*/true);
    if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
        AddrInfo.size() > 1) {
      const GEPInfo &GEPI2 = AddrInfo[1];
      if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
        if (Register OffsetReg =
                matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
          Base = GEPI2.SgprParts[0];
          *SOffset = OffsetReg;
          *Offset = *EncodedImm;
          if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
            return true;

          // For unbuffered smem loads, it is illegal for the Immediate Offset
          // to be negative if the resulting (Offset + (M0 or SOffset or zero)
          // is negative. Handle the case where the Immediate Offset + SOffset
          // is negative.
          auto SKnown = KB->getKnownBits(*SOffset);
          if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
            return false;

          return true;
        }
      }
    }
    return false;
  }

  EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
                                            /*HasSOffset=*/false);
  // _IMM form: the constant must be encodable in the instruction.
  if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
    Base = GEPI.SgprParts[0];
    *Offset = *EncodedImm;
    return true;
  }

  // SGPR offset is unsigned.
  if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
      GEPI.Imm != 0) {
    // If we make it this far we have a load with an 32-bit immediate offset.
    // It is OK to select this using a sgpr offset, because we have already
    // failed trying to select this load into one of the _IMM variants since
    // the _IMM Patterns are considered before the _SGPR patterns.
    Base = GEPI.SgprParts[0];
    *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
        .addImm(GEPI.Imm);
    return true;
  }

  // NOTE(review): the guard only checks size() != 0 but SgprParts[1] is read
  // below — presumably only size == 2 reaches here; confirm against
  // getAddrModeInfo.
  if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
    if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
      Base = GEPI.SgprParts[0];
      *SOffset = OffsetReg;
      return true;
    }
  }

  return false;
}
4710
// Match the SMRD _IMM addressing form: sgpr base + encodable immediate.
AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
  Register Base;
  int64_t Offset;
  if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
    return std::nullopt;

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
}
4721
// Match the SMRD form taking a 32-bit literal offset (CI-style encoding).
AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);

  // Only a single SGPR base plus constant is supported here.
  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return std::nullopt;

  const GEPInfo &GEPInfo = AddrInfo[0];
  Register PtrReg = GEPInfo.SgprParts[0];
  std::optional<int64_t> EncodedImm =
      AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
  if (!EncodedImm)
    return std::nullopt;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
  }};
}
4742
// Match the SMRD _SGPR addressing form: sgpr base + sgpr offset.
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
  Register Base, SOffset;
  if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
    return std::nullopt;

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
           [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
}
4752
// Match the SMRD _SGPR_IMM form: sgpr base + sgpr offset + immediate.
AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
  Register Base, SOffset;
  int64_t Offset;
  if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
    return std::nullopt;

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
           [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
}
4764
4765std::pair<Register, int>
4766AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
4767 uint64_t FlatVariant) const {
4768 MachineInstr *MI = Root.getParent();
4769
4770 auto Default = std::pair(Root.getReg(), 0);
4771
4772 if (!STI.hasFlatInstOffsets())
4773 return Default;
4774
4775 Register PtrBase;
4776 int64_t ConstOffset;
4777 std::tie(PtrBase, ConstOffset) =
4778 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4779
4780 if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
4781 !isFlatScratchBaseLegal(Root.getReg())))
4782 return Default;
4783
4784 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4785 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
4786 return Default;
4787
4788 return std::pair(PtrBase, ConstOffset);
4789}
4790
// Render (base, offset) for plain FLAT addressing.
AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
    }};
}
4800
// Render (base, offset) for global FLAT addressing.
AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
    }};
}
4810
// Render (base, offset) for scratch FLAT addressing.
AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
    }};
}
4820
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
  Register Addr = Root.getReg();
  Register PtrBase;
  int64_t ConstOffset;
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.
  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);

  if (ConstOffset != 0) {
    if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
      Addr = PtrBase;
      ImmOffset = ConstOffset;
    } else {
      auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
      if (isSGPR(PtrBaseDef->Reg)) {
        if (ConstOffset > 0) {
          // Offset is too large.
          //
          // saddr + large_offset -> saddr +
          //                         (voffset = large_offset & ~MaxOffset) +
          //                         (large_offset & MaxOffset);
          int64_t SplitImmOffset, RemainderOffset;
          std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(

          if (isUInt<32>(RemainderOffset)) {
            MachineInstr *MI = Root.getParent();
            MachineBasicBlock *MBB = MI->getParent();
            Register HighBits =
                MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

            BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
                    HighBits)
                .addImm(RemainderOffset);

            return {{
                [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
                [=](MachineInstrBuilder &MIB) {
                  MIB.addReg(HighBits);
                }, // voffset
                [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
            }};
          }
        }

        // We are adding a 64 bit SGPR and a constant. If constant bus limit
        // is 1 we would need to perform 1 or 2 extra moves for each half of
        // the constant and it is better to do a scalar add and then issue a
        // single VALU instruction to materialize zero. Otherwise it is less
        // instructions to perform VALU adds with immediates or inline literals.
        unsigned NumLiterals =
            !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
            !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
        if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
          return std::nullopt;
      }
    }
  }

  // Match the variable offset.
  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
  if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
    // Look through the SGPR->VGPR copy.
    Register SAddr =
        getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);

    if (isSGPR(SAddr)) {
      Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();

      // It's possible voffset is an SGPR here, but the copy to VGPR will be
      // inserted later.
      if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
        return {{[=](MachineInstrBuilder &MIB) { // saddr
                   MIB.addReg(SAddr);
                 },
                 [=](MachineInstrBuilder &MIB) { // voffset
                   MIB.addReg(VOffset);
                 },
                 [=](MachineInstrBuilder &MIB) { // offset
                   MIB.addImm(ImmOffset);
                 }}};
      }
    }
  }

  // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
  // drop this.
  if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
      AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
    return std::nullopt;

  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
  // moves required to copy a 64-bit SGPR to VGPR.
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
      .addImm(0);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
      [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); },      // voffset
      [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }     // offset
  }};
}
4932
// Match the scratch SADDR form: SGPR base (or frame index) + imm offset.
AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
  Register Addr = Root.getReg();
  Register PtrBase;
  int64_t ConstOffset;
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.
  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);

  if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
    Addr = PtrBase;
    ImmOffset = ConstOffset;
  }

  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
  // A frame index can be used directly as saddr.
  if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
    int FI = AddrDef->MI->getOperand(1).getIndex();
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
        [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
    }};
  }

  Register SAddr = AddrDef->Reg;

  // (fi + sgpr): fold by materializing the sum with a scalar add.
  if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
    Register LHS = AddrDef->MI->getOperand(1).getReg();
    Register RHS = AddrDef->MI->getOperand(2).getReg();
    auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
    auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);

    if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
        isSGPR(RHSDef->Reg)) {
      int FI = LHSDef->MI->getOperand(1).getIndex();
      MachineInstr &I = *Root.getParent();
      MachineBasicBlock *BB = I.getParent();
      const DebugLoc &DL = I.getDebugLoc();
      SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

      BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
          .addFrameIndex(FI)
          .addReg(RHSDef->Reg)
          .setOperandDead(3); // Dead scc
    }
  }

  if (!isSGPR(SAddr))
    return std::nullopt;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
      [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
  }};
}
4991
4992// Check whether the flat scratch SVS swizzle bug affects this access.
4993bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4994 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
4995 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4996 return false;
4997
4998 // The bug affects the swizzling of SVS accesses if there is any carry out
4999 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
5000 // voffset to (soffset + inst_offset).
5001 auto VKnown = KB->getKnownBits(VAddr);
5002 auto SKnown = KnownBits::add(KB->getKnownBits(SAddr),
5003 KnownBits::makeConstant(APInt(32, ImmOffset)));
5004 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
5005 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
5006 return (VMax & 3) + (SMax & 3) >= 4;
5007}
5008
// Match the scratch SVS form: SGPR (or frame index) saddr + VGPR vaddr +
// imm offset.
AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
  Register Addr = Root.getReg();
  Register PtrBase;
  int64_t ConstOffset;
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.
  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);

  Register OrigAddr = Addr;
  if (ConstOffset != 0 &&
      TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
    Addr = PtrBase;
    ImmOffset = ConstOffset;
  }

  // The remaining address must be (ptr_add saddr, vgpr).
  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
  if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
    return std::nullopt;

  Register RHS = AddrDef->MI->getOperand(2).getReg();
  if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
    return std::nullopt;

  Register LHS = AddrDef->MI->getOperand(1).getReg();
  auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);

  // Legality check differs depending on whether an immediate was peeled off.
  if (OrigAddr != Addr) {
    if (!isFlatScratchBaseLegalSVImm(OrigAddr))
      return std::nullopt;
  } else {
    if (!isFlatScratchBaseLegalSV(OrigAddr))
      return std::nullopt;
  }

  if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
    return std::nullopt;

  // A frame index base can be emitted directly as saddr.
  if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
    int FI = LHSDef->MI->getOperand(1).getIndex();
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
        [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
        [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
    }};
  }

  if (!isSGPR(LHS))
    return std::nullopt;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
      [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
      [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
  }};
}
5067
// Match MUBUF scratch OFFEN addressing: rsrc + vaddr + soffset + imm offset.
AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  int64_t Offset = 0;
  // Pure constant address: split into a VGPR holding the high bits and an
  // encodable immediate remainder.
  if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
    Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // TODO: Should this be inside the render function? The iterator seems to
    // move.
    const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
            HighBits)
        .addImm(Offset & ~MaxOffset);

    return {{[=](MachineInstrBuilder &MIB) { // rsrc
               MIB.addReg(Info->getScratchRSrcReg());
             },
             [=](MachineInstrBuilder &MIB) { // vaddr
               MIB.addReg(HighBits);
             },
             [=](MachineInstrBuilder &MIB) { // soffset
               // Use constant zero for soffset and rely on eliminateFrameIndex
               // to choose the appropriate frame register if need be.
               MIB.addImm(0);
             },
             [=](MachineInstrBuilder &MIB) { // offset
               MIB.addImm(Offset & MaxOffset);
             }}};
  }

  assert(Offset == 0 || Offset == -1);

  // Try to fold a frame index directly into the MUBUF vaddr field, and any
  // offsets.
  std::optional<int> FI;
  Register VAddr = Root.getReg();

  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  Register PtrBase;
  int64_t ConstOffset;
  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
  if (ConstOffset != 0) {
    if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
                         KB->signBitIsZero(PtrBase))) {
      const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
      if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
        FI = PtrBaseDef->getOperand(1).getIndex();
      else
        VAddr = PtrBase;
      Offset = ConstOffset;
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
    FI = RootDef->getOperand(1).getIndex();
  }

  return {{[=](MachineInstrBuilder &MIB) { // rsrc
             MIB.addReg(Info->getScratchRSrcReg());
           },
           [=](MachineInstrBuilder &MIB) { // vaddr
             if (FI)
               MIB.addFrameIndex(*FI);
             else
               MIB.addReg(VAddr);
           },
           [=](MachineInstrBuilder &MIB) { // soffset
             // Use constant zero for soffset and rely on eliminateFrameIndex
             // to choose the appropriate frame register if need be.
             MIB.addImm(0);
           },
           [=](MachineInstrBuilder &MIB) { // offset
             MIB.addImm(Offset);
           }}};
}
5147
// Return true if (Base + Offset) can be encoded in a DS instruction's
// unsigned 16-bit offset field.
bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
                                                int64_t Offset) const {
  if (!isUInt<16>(Offset))
    return false;

    return true;

  // On Southern Islands instruction with a negative base value and an offset
  // don't seem to work.
  return KB->signBitIsZero(Base);
}
5160
// Return true if a pair of DS offsets (Offset0, Offset1) are both
// Size-aligned and fit the scaled 8-bit offset fields of read2/write2.
bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
                                                 int64_t Offset1,
                                                 unsigned Size) const {
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;
  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
    return false;

    return true;

  // On Southern Islands instruction with a negative base value and an offset
  // don't seem to work.
  return KB->signBitIsZero(Base);
}
5176
// Return whether the operation has NoUnsignedWrap property.
// G_OR counts because disjoint bits cannot produce a carry.
  return Addr->getOpcode() == TargetOpcode::G_OR ||
         (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
          Addr->getFlag(MachineInstr::NoUWrap));
}
5183
// Check that the base address of flat scratch load/store in the form of `base +
// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
// requirement). We always treat the first operand as the base address here.
bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
  MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);

  // nuw addition guarantees the sum does not wrap, so it is legal as-is.
  if (isNoUnsignedWrap(AddrMI))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (STI.hasSignedScratchOffsets())
    return true;

  Register LHS = AddrMI->getOperand(1).getReg();
  Register RHS = AddrMI->getOperand(2).getReg();

  if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
    std::optional<ValueAndVReg> RhsValReg =
    // If the immediate offset is negative and within certain range, the base
    // address cannot also be negative. If the base is also negative, the sum
    // would be either negative or much larger than the valid range of scratch
    // memory a thread can access.
    if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
        RhsValReg->Value.getSExtValue() > -0x40000000)
      return true;
  }

  // Otherwise the base must be provably non-negative.
  return KB->signBitIsZero(LHS);
}
5215
5216// Check address value in SGPR/VGPR are legal for flat scratch in the form
5217// of: SGPR + VGPR.
5218bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
5219 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5220
5221 if (isNoUnsignedWrap(AddrMI))
5222 return true;
5223
5224 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5225 // values.
5226 if (STI.hasSignedScratchOffsets())
5227 return true;
5228
5229 Register LHS = AddrMI->getOperand(1).getReg();
5230 Register RHS = AddrMI->getOperand(2).getReg();
5231 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
5232}
5233
// Check address value in SGPR/VGPR are legal for flat scratch in the form
// of: SGPR + VGPR + Imm.
bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
    Register Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (STI.hasSignedScratchOffsets())
    return true;

  MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
  Register Base = AddrMI->getOperand(1).getReg();
  std::optional<DefinitionAndSourceRegister> BaseDef =
  std::optional<ValueAndVReg> RHSOffset =
  assert(RHSOffset);

  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  if (isNoUnsignedWrap(BaseDef->MI) &&
      (isNoUnsignedWrap(AddrMI) ||
       (RHSOffset->Value.getSExtValue() < 0 &&
        RHSOffset->Value.getSExtValue() > -0x40000000)))
    return true;

  // Otherwise both addends of the inner sum must be provably non-negative.
  Register LHS = BaseDef->MI->getOperand(1).getReg();
  Register RHS = BaseDef->MI->getOperand(2).getReg();
  return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
}
5265
5266bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
5267 unsigned ShAmtBits) const {
5268 assert(MI.getOpcode() == TargetOpcode::G_AND);
5269
5270 std::optional<APInt> RHS =
5271 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
5272 if (!RHS)
5273 return false;
5274
5275 if (RHS->countr_one() >= ShAmtBits)
5276 return true;
5277
5278 const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg());
5279 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
5280}
5281
// Match MUBUF scratch OFFSET addressing: rsrc + soffset + imm offset.
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
    MachineOperand &Root) const {
  Register Reg = Root.getReg();

  std::optional<DefinitionAndSourceRegister> Def =
      getDefSrcRegIgnoringCopies(Reg, *MRI);
  assert(Def && "this shouldn't be an optional result");
  Reg = Def->Reg;

  // A wave address can be used directly as soffset with a zero offset.
  if (Register WaveBase = getWaveAddress(Def->MI)) {
    return {{
        [=](MachineInstrBuilder &MIB) { // rsrc
          MIB.addReg(Info->getScratchRSrcReg());
        },
        [=](MachineInstrBuilder &MIB) { // soffset
          MIB.addReg(WaveBase);
        },
        [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
    }};
  }

  int64_t Offset = 0;

  // FIXME: Copy check is a hack
  // (wave_address + imm): soffset = wave base, offset = imm.
  if (mi_match(Reg, *MRI,
               m_GPtrAdd(m_Reg(BasePtr),
    if (!TII.isLegalMUBUFImmOffset(Offset))
      return {};
    MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
    Register WaveBase = getWaveAddress(BasePtrDef);
    if (!WaveBase)
      return {};

    return {{
        [=](MachineInstrBuilder &MIB) { // rsrc
          MIB.addReg(Info->getScratchRSrcReg());
        },
        [=](MachineInstrBuilder &MIB) { // soffset
          MIB.addReg(WaveBase);
        },
        [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
    }};
  }

  // Plain constant address with an encodable offset.
  if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(Info->getScratchRSrcReg());
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
  }};
}
5344
5345std::pair<Register, unsigned>
5346AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
5347 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5348 int64_t ConstAddr = 0;
5349
5350 Register PtrBase;
5351 int64_t Offset;
5352 std::tie(PtrBase, Offset) =
5353 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5354
5355 if (Offset) {
5356 if (isDSOffsetLegal(PtrBase, Offset)) {
5357 // (add n0, c0)
5358 return std::pair(PtrBase, Offset);
5359 }
5360 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
5361 // TODO
5362
5363
5364 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
5365 // TODO
5366
5367 }
5368
5369 return std::pair(Root.getReg(), 0);
5370}
5371
// Render (addr, offset) operands for a single-address DS instruction.
AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
    }};
}
5382
// ds_read2/write2 b32 addressing: two 4-byte elements.
AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
  return selectDSReadWrite2(Root, 4);
}
5387
// ds_read2/write2 b64 addressing: two 8-byte elements.
AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
  return selectDSReadWrite2(Root, 8);
}
5392
// Render (addr, offset0, offset1) for a ds_read2/ds_write2 of element \p Size;
// the two offsets are consecutive element indices.
AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
                                              unsigned Size) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
    }};
}
5405
5406std::pair<Register, unsigned>
5407AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
5408 unsigned Size) const {
5409 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5410 int64_t ConstAddr = 0;
5411
5412 Register PtrBase;
5413 int64_t Offset;
5414 std::tie(PtrBase, Offset) =
5415 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5416
5417 if (Offset) {
5418 int64_t OffsetValue0 = Offset;
5419 int64_t OffsetValue1 = Offset + Size;
5420 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
5421 // (add n0, c0)
5422 return std::pair(PtrBase, OffsetValue0 / Size);
5423 }
5424 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
5425 // TODO
5426
5427 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
5428 // TODO
5429
5430 }
5431
5432 return std::pair(Root.getReg(), 0);
5433}
5434
/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
/// the base value with the constant offset. There may be intervening copies
/// between \p Root and the identified constant. Returns \p Root, 0 if this does
/// not match the pattern.
std::pair<Register, int64_t>
AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
  Register Root, const MachineRegisterInfo &MRI) const {
  MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
  if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return {Root, 0};

  MachineOperand &RHS = RootI->getOperand(2);
  std::optional<ValueAndVReg> MaybeOffset =
  if (!MaybeOffset)
    return {Root, 0};
  return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
}
5453
  // Render a constant-zero immediate operand.
  MIB.addImm(0);
}
5457
/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
/// BasePtr is not valid, a null base pointer will be used.
///
/// The descriptor is assembled as sub0_sub1 = base pointer (or 0) and
/// sub2_sub3 = the given FormatLo/FormatHi words.
                          uint32_t FormatLo, uint32_t FormatHi,
                          Register BasePtr) {
  Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc2)
    .addImm(FormatLo);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc3)
    .addImm(FormatHi);

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrcHi)
    .addReg(RSrc2)
    .addImm(AMDGPU::sub0)
    .addReg(RSrc3)
    .addImm(AMDGPU::sub1);

  Register RSrcLo = BasePtr;
  if (!BasePtr) {
    // No base pointer: substitute a materialized 64-bit zero.
    RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)
      .addDef(RSrcLo)
      .addImm(0);
  }

  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrc)
    .addReg(RSrcLo)
    .addImm(AMDGPU::sub0_sub1)
    .addReg(RSrcHi)
    .addImm(AMDGPU::sub2_sub3);

  return RSrc;
}
5502
                                // Build an addr64 RSRC: zero low format word,
                                // default high word, given base pointer.
                                const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
}
5511
                               // Build an offset-mode RSRC: all-ones low format
                               // word, default high word, given base pointer.
                               const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
}
5520
5521AMDGPUInstructionSelector::MUBUFAddressData
5522AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
5523 MUBUFAddressData Data;
5524 Data.N0 = Src;
5525
5526 Register PtrBase;
5527 int64_t Offset;
5528
5529 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
5530 if (isUInt<32>(Offset)) {
5531 Data.N0 = PtrBase;
5532 Data.Offset = Offset;
5533 }
5534
5535 if (MachineInstr *InputAdd
5536 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
5537 Data.N2 = InputAdd->getOperand(1).getReg();
5538 Data.N3 = InputAdd->getOperand(2).getReg();
5539
5540 // FIXME: Need to fix extra SGPR->VGPRcopies inserted
5541 // FIXME: Don't know this was defined by operand 0
5542 //
5543 // TODO: Remove this when we have copy folding optimizations after
5544 // RegBankSelect.
5545 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
5546 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
5547 }
5548
5549 return Data;
5550}
5551
5552/// Return if the addr64 mubuf mode should be used for the given address.
5553bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
5554 // (ptr_add N2, N3) -> addr64, or
5555 // (ptr_add (ptr_add N2, N3), C1) -> addr64
5556 if (Addr.N2)
5557 return true;
5558
5559 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
5560 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
5561}
5562
5563/// Split an immediate offset \p ImmOffset depending on whether it fits in the
5564/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
5565/// component.
5566void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
5567 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
5568 if (TII.isLegalMUBUFImmOffset(ImmOffset))
5569 return;
5570
5571 // Illegal offset, store it in soffset.
5572 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5573 B.buildInstr(AMDGPU::S_MOV_B32)
5574 .addDef(SOffset)
5575 .addImm(ImmOffset);
5576 ImmOffset = 0;
5577}
5578
// Match MUBUF addr64 addressing for \p Root: choose which addends feed the
// SRD base vs. the 64-bit vaddr, build the resource descriptor, and split
// any unencodable immediate into soffset.
bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
  MachineOperand &Root, Register &VAddr, Register &RSrcReg,
  Register &SOffset, int64_t &Offset) const {
  // FIXME: Predicates should stop this from reaching here.
  // addr64 bit was removed for volcanic islands.
  if (!STI.hasAddr64() || STI.useFlatForGlobal())
    return false;

  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (!shouldUseAddr64(AddrData))
    return false;

  Register N0 = AddrData.N0;
  Register N2 = AddrData.N2;
  Register N3 = AddrData.N3;
  Offset = AddrData.Offset;

  // Base pointer for the SRD.
  Register SRDPtr;

  if (N2) {
    if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
      assert(N3);
      if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the default resource from a 0 address.
        VAddr = N0;
      } else {
        // N3 is uniform: it becomes the SRD base, N2 the vaddr.
        SRDPtr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      SRDPtr = N2;
      VAddr = N3;
    }
  } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
    // Use the default null pointer in the resource
    VAddr = N0;
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    SRDPtr = N0;
  }

  MachineIRBuilder B(*Root.getParent());
  RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}
5629
5630bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
5631 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
5632 int64_t &Offset) const {
5633
5634 // FIXME: Pattern should not reach here.
5635 if (STI.useFlatForGlobal())
5636 return false;
5637
5638 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5639 if (shouldUseAddr64(AddrData))
5640 return false;
5641
5642 // N0 -> offset, or
5643 // (N0 + C1) -> offset
5644 Register SRDPtr = AddrData.N0;
5645 Offset = AddrData.Offset;
5646
5647 // TODO: Look through extensions for 32-bit soffset.
5648 MachineIRBuilder B(*Root.getParent());
5649
5650 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
5651 splitIllegalMUBUFOffset(B, SOffset, Offset);
5652 return true;
5653}
5654
5656AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
5657 Register VAddr;
5658 Register RSrcReg;
5659 Register SOffset;
5660 int64_t Offset = 0;
5661
5662 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
5663 return {};
5664
5665 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
5666 // pattern.
5667 return {{
5668 [=](MachineInstrBuilder &MIB) { // rsrc
5669 MIB.addReg(RSrcReg);
5670 },
5671 [=](MachineInstrBuilder &MIB) { // vaddr
5672 MIB.addReg(VAddr);
5673 },
5674 [=](MachineInstrBuilder &MIB) { // soffset
5675 if (SOffset)
5676 MIB.addReg(SOffset);
5677 else if (STI.hasRestrictedSOffset())
5678 MIB.addReg(AMDGPU::SGPR_NULL);
5679 else
5680 MIB.addImm(0);
5681 },
5682 [=](MachineInstrBuilder &MIB) { // offset
5683 MIB.addImm(Offset);
5684 },
5685 addZeroImm, // cpol
5686 addZeroImm, // tfe
5687 addZeroImm // swz
5688 }};
5689}
5690
5692AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
5693 Register RSrcReg;
5694 Register SOffset;
5695 int64_t Offset = 0;
5696
5697 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
5698 return {};
5699
5700 return {{
5701 [=](MachineInstrBuilder &MIB) { // rsrc
5702 MIB.addReg(RSrcReg);
5703 },
5704 [=](MachineInstrBuilder &MIB) { // soffset
5705 if (SOffset)
5706 MIB.addReg(SOffset);
5707 else if (STI.hasRestrictedSOffset())
5708 MIB.addReg(AMDGPU::SGPR_NULL);
5709 else
5710 MIB.addImm(0);
5711 },
5712 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
5713 addZeroImm, // cpol
5714 addZeroImm, // tfe
5715 addZeroImm, // swz
5716 }};
5717}
5718
5720AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
5721
5722 Register SOffset = Root.getReg();
5723
5724 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
5725 SOffset = AMDGPU::SGPR_NULL;
5726
5727 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
5728}
5729
5730/// Get an immediate that must be 32-bits, and treated as zero extended.
5731static std::optional<uint64_t>
5733 // getIConstantVRegVal sexts any values, so see if that matters.
5734 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
5735 if (!OffsetVal || !isInt<32>(*OffsetVal))
5736 return std::nullopt;
5737 return Lo_32(*OffsetVal);
5738}
5739
5741AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
5742 std::optional<uint64_t> OffsetVal =
5743 Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
5744 if (!OffsetVal)
5745 return {};
5746
5747 std::optional<int64_t> EncodedImm =
5748 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
5749 if (!EncodedImm)
5750 return {};
5751
5752 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5753}
5754
5756AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
5758
5759 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5760 if (!OffsetVal)
5761 return {};
5762
5763 std::optional<int64_t> EncodedImm =
5765 if (!EncodedImm)
5766 return {};
5767
5768 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5769}
5770
5772AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
5773 // Match the (soffset + offset) pair as a 32-bit register base and
5774 // an immediate offset.
5775 Register SOffset;
5776 unsigned Offset;
5777 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
5778 *MRI, Root.getReg(), KB, /*CheckNUW*/ true);
5779 if (!SOffset)
5780 return std::nullopt;
5781
5782 std::optional<int64_t> EncodedOffset =
5783 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
5784 if (!EncodedOffset)
5785 return std::nullopt;
5786
5787 assert(MRI->getType(SOffset) == LLT::scalar(32));
5788 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5789 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
5790}
5791
/// Match source modifiers for a mad-mix style operand. Returns the (possibly
/// stripped) source register and the accumulated SISrcMods bits; \p Matched
/// is set to true only when an fpext from f16 was folded.
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
                                                     bool &Matched) const {
  Matched = false;

  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  // Look for an f16 -> f32 extend we can fold into the mix instruction.
  if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
    assert(MRI->getType(Src) == LLT::scalar(16));

    // Only change Src if src modifier could be gained. In such cases new Src
    // could be sgpr but this does not violate constant bus restriction for
    // instruction that is being selected.
    Src = stripBitCast(Src, *MRI);

    const auto CheckAbsNeg = [&]() {
      // Be careful about folding modifiers if we already have an abs. fneg is
      // applied last, so we don't want to apply an earlier fneg.
      if ((Mods & SISrcMods::ABS) == 0) {
        unsigned ModsTmp;
        std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);

        // A neg on the inner value toggles any outer neg.
        if ((ModsTmp & SISrcMods::NEG) != 0)
          Mods ^= SISrcMods::NEG;

        if ((ModsTmp & SISrcMods::ABS) != 0)
          Mods |= SISrcMods::ABS;
      }
    };

    CheckAbsNeg();

    // op_sel/op_sel_hi decide the source type and source.
    // If the source's op_sel_hi is set, it indicates to do a conversion from
    // fp16. If the sources's op_sel is set, it picks the high half of the
    // source register.

    Mods |= SISrcMods::OP_SEL_1;

    // Folding the high-half extract may expose further modifiers to fold.
    if (isExtractHiElt(*MRI, Src, Src)) {
      Mods |= SISrcMods::OP_SEL_0;
      CheckAbsNeg();
    }

    Matched = true;
  }

  return {Src, Mods};
}
5843
5845AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
5846 MachineOperand &Root) const {
5847 Register Src;
5848 unsigned Mods;
5849 bool Matched;
5850 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5851 if (!Matched)
5852 return {};
5853
5854 return {{
5855 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5856 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5857 }};
5858}
5859
5861AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
5862 Register Src;
5863 unsigned Mods;
5864 bool Matched;
5865 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5866
5867 return {{
5868 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5869 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5870 }};
5871}
5872
5873bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
5874 MachineInstr &I, Intrinsic::ID IntrID) const {
5875 MachineBasicBlock *MBB = I.getParent();
5876 const DebugLoc &DL = I.getDebugLoc();
5877 Register CCReg = I.getOperand(0).getReg();
5878
5879 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
5880 .addImm(I.getOperand(2).getImm());
5881
5882 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5883
5884 I.eraseFromParent();
5885 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5886 *MRI);
5887}
5888
5889bool AMDGPUInstructionSelector::selectSGetBarrierState(
5890 MachineInstr &I, Intrinsic::ID IntrID) const {
5891 MachineBasicBlock *MBB = I.getParent();
5892 const DebugLoc &DL = I.getDebugLoc();
5893 MachineOperand BarOp = I.getOperand(2);
5894 std::optional<int64_t> BarValImm =
5895 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
5896
5897 if (!BarValImm) {
5898 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
5899 .addReg(BarOp.getReg());
5900 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5901 }
5903 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
5904 : AMDGPU::S_GET_BARRIER_STATE_M0;
5905 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5906
5907 auto DstReg = I.getOperand(0).getReg();
5908 const TargetRegisterClass *DstRC =
5909 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
5910 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
5911 return false;
5912 MIB.addDef(DstReg);
5913 if (BarValImm) {
5914 MIB.addImm(*BarValImm);
5915 }
5916 I.eraseFromParent();
5917 return true;
5918}
5919
5920unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
5921 if (HasInlineConst) {
5922 switch (IntrID) {
5923 default:
5924 llvm_unreachable("not a named barrier op");
5925 case Intrinsic::amdgcn_s_barrier_join:
5926 return AMDGPU::S_BARRIER_JOIN_IMM;
5927 case Intrinsic::amdgcn_s_get_named_barrier_state:
5928 return AMDGPU::S_GET_BARRIER_STATE_IMM;
5929 };
5930 } else {
5931 switch (IntrID) {
5932 default:
5933 llvm_unreachable("not a named barrier op");
5934 case Intrinsic::amdgcn_s_barrier_join:
5935 return AMDGPU::S_BARRIER_JOIN_M0;
5936 case Intrinsic::amdgcn_s_get_named_barrier_state:
5937 return AMDGPU::S_GET_BARRIER_STATE_M0;
5938 };
5939 }
5940}
5941
5942bool AMDGPUInstructionSelector::selectNamedBarrierInit(
5943 MachineInstr &I, Intrinsic::ID IntrID) const {
5944 MachineBasicBlock *MBB = I.getParent();
5945 const DebugLoc &DL = I.getDebugLoc();
5946 MachineOperand BarOp = I.getOperand(1);
5947 MachineOperand CntOp = I.getOperand(2);
5948
5949 // BarID = (BarOp >> 4) & 0x3F
5950 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5951 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
5952 .add(BarOp)
5953 .addImm(4u)
5954 .setOperandDead(3); // Dead scc
5955
5956 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5957 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
5958 .addReg(TmpReg0)
5959 .addImm(0x3F)
5960 .setOperandDead(3); // Dead scc
5961
5962 // MO = ((CntOp & 0x3F) << shAmt) | BarID
5963 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5964 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
5965 .add(CntOp)
5966 .addImm(0x3F)
5967 .setOperandDead(3); // Dead scc
5968
5969 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5970 constexpr unsigned ShAmt = 16;
5971 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
5972 .addReg(TmpReg2)
5973 .addImm(ShAmt)
5974 .setOperandDead(3); // Dead scc
5975
5976 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5977 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
5978 .addReg(TmpReg1)
5979 .addReg(TmpReg3)
5980 .setOperandDead(3); // Dead scc;
5981
5982 auto CopyMIB =
5983 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
5984 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5985
5986 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
5987 ? AMDGPU::S_BARRIER_INIT_M0
5988 : AMDGPU::S_BARRIER_SIGNAL_M0;
5990 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5991
5992 I.eraseFromParent();
5993 return true;
5994}
5995
5996bool AMDGPUInstructionSelector::selectNamedBarrierInst(
5997 MachineInstr &I, Intrinsic::ID IntrID) const {
5998 MachineBasicBlock *MBB = I.getParent();
5999 const DebugLoc &DL = I.getDebugLoc();
6000 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
6001 ? I.getOperand(2)
6002 : I.getOperand(1);
6003 std::optional<int64_t> BarValImm =
6004 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
6005
6006 if (!BarValImm) {
6007 // BarID = (BarOp >> 4) & 0x3F
6008 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6009 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
6010 .addReg(BarOp.getReg())
6011 .addImm(4u)
6012 .setOperandDead(3); // Dead scc;
6013
6014 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6015 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
6016 .addReg(TmpReg0)
6017 .addImm(0x3F)
6018 .setOperandDead(3); // Dead scc;
6019
6020 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
6021 .addReg(TmpReg1);
6022 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6023 }
6024
6026 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
6027 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6028
6029 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
6030 auto DstReg = I.getOperand(0).getReg();
6031 const TargetRegisterClass *DstRC =
6032 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
6033 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
6034 return false;
6035 MIB.addDef(DstReg);
6036 }
6037
6038 if (BarValImm) {
6039 auto BarId = ((*BarValImm) >> 4) & 0x3F;
6040 MIB.addImm(BarId);
6041 }
6042
6043 I.eraseFromParent();
6044 return true;
6045}
6046
6047void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
6048 const MachineInstr &MI,
6049 int OpIdx) const {
6050 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6051 "Expected G_CONSTANT");
6052 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
6053}
6054
6055void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
6056 const MachineInstr &MI,
6057 int OpIdx) const {
6058 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6059 "Expected G_CONSTANT");
6060 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
6061}
6062
6063void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
6064 const MachineInstr &MI,
6065 int OpIdx) const {
6066 const MachineOperand &Op = MI.getOperand(1);
6067 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
6068 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
6069}
6070
6071void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
6072 const MachineInstr &MI,
6073 int OpIdx) const {
6074 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6075 "Expected G_CONSTANT");
6076 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
6077}
6078
6079/// This only really exists to satisfy DAG type checking machinery, so is a
6080/// no-op here.
6081void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
6082 const MachineInstr &MI,
6083 int OpIdx) const {
6084 const MachineOperand &Op = MI.getOperand(OpIdx);
6085 int64_t Imm;
6086 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
6087 MIB.addImm(Imm);
6088 else
6089 MIB.addImm(Op.getImm());
6090}
6091
6092void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
6093 const MachineInstr &MI,
6094 int OpIdx) const {
6095 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
6096}
6097
6098void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
6099 const MachineInstr &MI,
6100 int OpIdx) const {
6101 assert(OpIdx >= 0 && "expected to match an immediate operand");
6102 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6103}
6104
6105void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
6106 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6107 assert(OpIdx >= 0 && "expected to match an immediate operand");
6108 MIB.addImm(
6109 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6110}
6111
6112void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
6113 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6114 assert(OpIdx >= 0 && "expected to match an immediate operand");
6115 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
6117 : (int64_t)SISrcMods::DST_OP_SEL);
6118}
6119
6120void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
6121 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6122 assert(OpIdx >= 0 && "expected to match an immediate operand");
6123 MIB.addImm(
6124 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6125}
6126
6127void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
6128 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6129 assert(OpIdx >= 0 && "expected to match an immediate operand");
6130 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
6131 ? (int64_t)(SISrcMods::OP_SEL_0)
6132 : 0);
6133}
6134
6135void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
6136 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6137 assert(OpIdx >= 0 && "expected to match an immediate operand");
6138 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
6139 : 0);
6140}
6141
6142void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
6143 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6144 assert(OpIdx >= 0 && "expected to match an immediate operand");
6145 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
6146 : 0);
6147}
6148
6149void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
6150 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6151 assert(OpIdx >= 0 && "expected to match an immediate operand");
6152 MIB.addImm(
6153 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6154}
6155
6156void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
6157 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6158 assert(OpIdx >= 0 && "expected to match an immediate operand");
6159 MIB.addImm(
6160 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::DST_OP_SEL : 0);
6161}
6162
6163void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
6164 const MachineInstr &MI,
6165 int OpIdx) const {
6166 assert(OpIdx >= 0 && "expected to match an immediate operand");
6167 MIB.addImm(MI.getOperand(OpIdx).getImm() &
6170}
6171
6172void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
6173 const MachineInstr &MI,
6174 int OpIdx) const {
6175 assert(OpIdx >= 0 && "expected to match an immediate operand");
6176 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
6179 MIB.addImm(Swizzle);
6180}
6181
6182void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
6183 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6184 assert(OpIdx >= 0 && "expected to match an immediate operand");
6185 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
6188 MIB.addImm(Cpol | AMDGPU::CPol::GLC);
6189}
6190
6191void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
6192 const MachineInstr &MI,
6193 int OpIdx) const {
6194 MIB.addFrameIndex(MI.getOperand(1).getIndex());
6195}
6196
6197void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
6198 const MachineInstr &MI,
6199 int OpIdx) const {
6200 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
6201 int ExpVal = APF.getExactLog2Abs();
6202 assert(ExpVal != INT_MIN);
6203 MIB.addImm(ExpVal);
6204}
6205
6206void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
6207 const MachineInstr &MI,
6208 int OpIdx) const {
6209 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
6210 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
6211 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
6212 // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
6213 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
6214}
6215
6216/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
6217void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
6218 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6219 unsigned Val = MI.getOperand(OpIdx).getImm();
6220 unsigned New = 0;
6221 if (Val & 0x1)
6223 if (Val & 0x2)
6225 MIB.addImm(New);
6226}
6227
// Returns whether \p Imm can be used as an inline constant operand; forwards
// to SIInstrInfo::isInlineConstant.
bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
  return TII.isInlineConstant(Imm);
}
6231
// Floating-point overload of the above; forwards to
// SIInstrInfo::isInlineConstant.
bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
  return TII.isInlineConstant(Imm);
}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned Intr
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID)
#define GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
static Register getWaveAddress(const MachineInstr *Def)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static bool shouldUseAndMask(unsigned Size, unsigned &Mask)
static std::pair< unsigned, uint8_t > BitOp3_Op(Register R, SmallVectorImpl< Register > &Src, const MachineRegisterInfo &MRI)
static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI, MachineBasicBlock *MBB)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static std::pair< Register, unsigned > computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, Register IdxReg, unsigned EltSize, GISelKnownBits &KnownBits)
Return the register to use for the index value, and the subregister to use for the indirectly accesse...
static void addZeroImm(MachineInstrBuilder &MIB)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
static bool isConstant(const MachineInstr &MI)
static Register buildRegSequence(SmallVectorImpl< Register > &Elts, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, uint32_t FormatLo, uint32_t FormatHi, Register BasePtr)
Return a resource descriptor for use with an arbitrary 64-bit pointer.
static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg)
Match a zero extend from a 32-bit value to 64-bits.
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64)
static Register stripCopy(Register Reg, MachineRegisterInfo &MRI)
static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI)
static std::optional< uint64_t > getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI)
Get an immediate that must be 32-bits, and treated as zero extended.
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size, const GCNSubtarget &ST)
static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI)
static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, SmallVectorImpl< Register > &Elts, Register &Src, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
This file declares the targeting of the InstructionSelector class for AMDGPU.
static const LLT S1
AMDGPU Register Bank Select
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
uint64_t Size
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
unsigned const TargetRegisterInfo * TRI
#define P(N)
static std::vector< std::pair< int, unsigned > > Swizzle(std::vector< std::pair< int, unsigned > > Src, R600InstrInfo::BankSwizzle Swz)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
Value * RHS
Value * LHS
AMDGPUInstructionSelector(const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, const AMDGPUTargetMachine &TM)
static const char * getName()
bool select(MachineInstr &I) override
Select the (possibly generic) instruction I to only use target-specific opcodes.
void setupMF(MachineFunction &MF, GISelKnownBits *KB, CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) override
Setup per-MF executor state.
const RegisterBank & getRegBankFromRegClass(const TargetRegisterClass &RC, LLT) const override
Get a register bank that covers RC.
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
unsigned getWavefrontSizeLog2() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
LLVM_READONLY int getExactLog2Abs() const
Definition: APFloat.h:1489
Class for arbitrary precision integers.
Definition: APInt.h:78
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:986
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1635
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:676
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition: InstrTypes.h:690
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:679
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition: InstrTypes.h:688
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:677
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:678
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:697
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition: InstrTypes.h:687
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:681
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:684
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition: InstrTypes.h:685
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:680
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:682
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:701
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:689
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition: InstrTypes.h:686
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition: InstrTypes.h:675
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
bool isFPPredicate() const
Definition: InstrTypes.h:780
bool isIntPredicate() const
Definition: InstrTypes.h:781
ConstantFP - Floating Point Values [float, double].
Definition: Constants.h:271
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:163
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:157
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
Represents a G_BUILD_VECTOR.
bool useVGPRIndexMode() const
bool hasPermlane32Swap() const
bool hasScalarCompareEq64() const
int getLDSBankCount() const
Definition: GCNSubtarget.h:350
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:478
bool unsafeDSOffsetFoldingEnabled() const
Definition: GCNSubtarget.h:482
bool hasBitOp3Insts() const
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:641
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasMADIntraFwdBug() const
bool privateMemoryResourceIsRangeChecked() const
Definition: GCNSubtarget.h:563
bool hasSignedScratchOffsets() const
bool hasRestrictedSOffset() const
const SITargetLowering * getTargetLowering() const override
Definition: GCNSubtarget.h:287
bool ldsRequiresM0Init() const
Return if most LDS instructions have an m0 use that require m0 to be initialized.
Definition: GCNSubtarget.h:716
bool hasSPackHL() const
Return true if the target has the S_PACK_HL_B32_B16 instruction.
bool hasG16() const
bool hasPermlane16Swap() const
bool hasFlatScratchSVSSwizzleBug() const
bool hasGWS() const
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:541
Generation getGeneration() const
Definition: GCNSubtarget.h:327
bool hasSplitBarriers() const
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:746
bool hasGWSSemaReleaseAll() const
Definition: GCNSubtarget.h:730
bool hasAddr64() const
Definition: GCNSubtarget.h:391
bool isWave64() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:738
bool hasSALUFloatInsts() const
bool hasPartialNSAEncoding() const
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
Represents a G_CONCAT_VECTORS.
std::optional< SmallVector< std::function< void(MachineInstrBuilder &)>, 4 > > ComplexRendererFns
virtual void setupMF(MachineFunction &mf, GISelKnownBits *kb, CodeGenCoverage *covinfo=nullptr, ProfileSummaryInfo *psi=nullptr, BlockFrequencyInfo *bfi=nullptr)
Setup per-MF executor state.
APInt getKnownOnes(Register R)
KnownBits getKnownBits(Register R)
bool signBitIsZero(Register Op)
APInt getKnownZeroes(Register R)
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:657
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
constexpr bool isValid() const
Definition: LowLevelType.h:145
constexpr bool isVector() const
Definition: LowLevelType.h:148
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:190
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
Definition: LowLevelType.h:277
constexpr unsigned getAddressSpace() const
Definition: LowLevelType.h:270
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
Metadata node.
Definition: Metadata.h:1073
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
void setReturnAddressIsTaken(bool s)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
Definition: MachineInstr.h:71
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:577
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:349
unsigned getNumOperands() const
Retuns the total number of operands.
Definition: MachineInstr.h:580
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:501
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:587
A description of a memory reference used in the backend.
unsigned getAddrSpace() const
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
const Value * getValue() const
Return the base address of the memory access.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
const ConstantInt * getCImm() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isImplicit() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
ArrayRef< int > getShuffleMask() const
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
static MachineOperand CreateImm(int64_t Val)
bool isEarlyClobber() const
Register getReg() const
getReg - Returns the register number.
bool isInternalRead() const
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Analysis providing profile information.
static const TargetRegisterClass * constrainGenericRegister(Register Reg, const TargetRegisterClass &RC, MachineRegisterInfo &MRI)
Constrain the (possibly generic) virtual register Reg to RC.
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
TypeSize getSizeInBits(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const
Get the size in bits of Reg.
This class implements the register bank concept.
Definition: RegisterBank.h:28
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:45
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCRegister getReturnAddressReg(const MachineFunction &MF) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
const TargetRegisterClass * getRegClassForSizeOnBank(unsigned Size, const RegisterBank &Bank) const
const TargetRegisterClass * getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const override
const TargetRegisterClass * getRegClassForTypeOnBank(LLT Ty, const RegisterBank &Bank) const
const TargetRegisterClass * getBoolRC() const
MCRegister getExec() const
const TargetRegisterClass * getWaveMaskRegClass() const
static bool isSGPRClass(const TargetRegisterClass *RC)
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
unsigned getID() const
Return the register class ID number.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:404
static IntegerType * getInt32Ty(LLVMContext &C)
LLVM Value Representation.
Definition: Value.h:74
Value(Type *Ty, unsigned scid)
Definition: Value.cpp:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
Key
PAL metadata keys.
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX10Plus(const MCSubtargetInfo &STI)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelKnownBits *KnownBits=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
IndexMode
ARM Index Modes.
Definition: ARMBaseInfo.h:177
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:756
operand_type_match m_Reg()
GCstAndRegMatch m_GCst(std::optional< ValueAndVReg > &ValReg)
UnaryOp_match< SrcTy, TargetOpcode::COPY > m_Copy(SrcTy &&Src)
SpecificConstantMatch m_SpecificICst(int64_t RequestedValue)
Matches a constant equal to RequestedValue.
UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_XOR, true > m_GXor(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_FPEXT > m_GFPExt(const SrcTy &Src)
ConstantMatch< APInt > m_ICst(APInt &Cst)
SpecificConstantMatch m_AllOnesInt()
BinaryOp_match< LHS, RHS, TargetOpcode::G_OR, true > m_GOr(const LHS &L, const RHS &R)
ICstOrSplatMatch< APInt > m_ICstOrSplat(APInt &Cst)
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
BinaryOp_match< LHS, RHS, TargetOpcode::G_PTR_ADD, false > m_GPtrAdd(const LHS &L, const RHS &R)
Or< Preds... > m_any_of(Preds &&... preds)
BinaryOp_match< LHS, RHS, TargetOpcode::G_AND, true > m_GAnd(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_BITCAST > m_GBitcast(const SrcTy &Src)
UnaryOp_match< SrcTy, TargetOpcode::G_FNEG > m_GFNeg(const SrcTy &Src)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
UnaryOp_match< SrcTy, TargetOpcode::G_FABS > m_GFabs(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_TRUNC > m_GTrunc(const SrcTy &Src)
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
Reg
All possible values of the reg field in the ModR/M byte.
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition: Utils.cpp:910
@ Offset
Definition: DWP.cpp:480
Register constrainOperandRegClass(const MachineFunction &MF, const TargetRegisterInfo &TRI, MachineRegisterInfo &MRI, const TargetInstrInfo &TII, const RegisterBankInfo &RBI, MachineInstr &InsertPt, const TargetRegisterClass &RegClass, MachineOperand &RegMO)
Constrain the Register operand OpIdx, so that it is now constrained to the TargetRegisterClass passed...
Definition: Utils.cpp:56
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
Definition: Utils.cpp:645
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition: Utils.cpp:459
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition: Utils.cpp:294
bool constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition: Utils.cpp:155
MachineInstr * getDefIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, folding away any trivial copies.
Definition: Utils.cpp:486
std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT fits in int64_t returns it.
Definition: Utils.cpp:314
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:155
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
std::optional< ValueAndVReg > getAnyConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true, bool LookThroughAnyExt=false)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT or G_FCONST...
Definition: Utils.cpp:439
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:160
unsigned getUndefRegState(bool B)
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ Add
Sum of integers.
DWARFExpression::Operation Op
@ DS_Error
std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition: Utils.cpp:433
std::optional< DefinitionAndSourceRegister > getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, and underlying value Register folding away any copies.
Definition: Utils.cpp:467
Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
Definition: Utils.cpp:493
@ Default
The result values are uniform if and only if all operands are uniform.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:293
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition: KnownBits.h:336
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.