AMDGPUISelDAGToDAG.cpp
1//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Defines an instruction selector for the AMDGPU target.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUISelDAGToDAG.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUSubtarget.h"
18#include "AMDGPUTargetMachine.h"
21#include "R600RegisterInfo.h"
22#include "SIISelLowering.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
32
33#ifdef EXPENSIVE_CHECKS
34#include "llvm/Analysis/LoopInfo.h"
35#include "llvm/IR/Dominators.h"
36#endif
37
38#define DEBUG_TYPE "amdgpu-isel"
39
40using namespace llvm;
41
42//===----------------------------------------------------------------------===//
43// Instruction Selector Implementation
44//===----------------------------------------------------------------------===//
45
46namespace {
47static SDValue stripBitcast(SDValue Val) {
48 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
49}
50
51// Figure out if this is really an extract of the high 16-bits of a dword.
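// For example, both (extract_vector_elt (v2i16 %v), 1) and
// (i16 (trunc (srl (i32 (bitcast %v)), 16))) are treated as the high half of %v.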
52static bool isExtractHiElt(SDValue In, SDValue &Out) {
53 In = stripBitcast(In);
54
55 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
56 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
57 if (!Idx->isOne())
58 return false;
59 Out = In.getOperand(0);
60 return true;
61 }
62 }
63
64 if (In.getOpcode() != ISD::TRUNCATE)
65 return false;
66
67 SDValue Srl = In.getOperand(0);
68 if (Srl.getOpcode() == ISD::SRL) {
69 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
70 if (ShiftAmt->getZExtValue() == 16) {
71 Out = stripBitcast(Srl.getOperand(0));
72 return true;
73 }
74 }
75 }
76
77 return false;
78}
79
80static SDValue createVOP3PSrc32FromLo16(SDValue Lo, SDValue Src,
81 llvm::SelectionDAG *CurDAG,
82 const GCNSubtarget *Subtarget) {
83 if (!Subtarget->useRealTrue16Insts()) {
84 return Lo;
85 }
86
87 SDValue NewSrc;
88 SDLoc SL(Lo);
89
90 if (Lo->isDivergent()) {
91 SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
92 SL, Lo.getValueType()),
93 0);
94 const SDValue Ops[] = {
95 CurDAG->getTargetConstant(AMDGPU::VGPR_32RegClassID, SL, MVT::i32), Lo,
96 CurDAG->getTargetConstant(AMDGPU::lo16, SL, MVT::i16), Undef,
97 CurDAG->getTargetConstant(AMDGPU::hi16, SL, MVT::i16)};
98
99 NewSrc = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
100 Src.getValueType(), Ops),
101 0);
102 } else {
103 // The S_MOV is needed since the Lo could still be a VGPR16.
104 // With S_MOV, isel inserts a "sgpr32 = copy vgpr16" and we rely on
105 // the fixvgpr2sgprcopy pass to legalize it.
106 NewSrc = SDValue(
107 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, Src.getValueType(), Lo),
108 0);
109 }
110
111 return NewSrc;
112}
113
114// Look through operations that obscure what is otherwise just a use of the
115// low 16-bits of the same register.
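// For example, (i16 (trunc (i32 (bitcast (v2f16 %v))))) and
// (extract_vector_elt %v, 0) both reduce to %v here.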
116static SDValue stripExtractLoElt(SDValue In) {
117 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
118 SDValue Idx = In.getOperand(1);
119 if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
120 return In.getOperand(0);
121 }
122
123 if (In.getOpcode() == ISD::TRUNCATE) {
124 SDValue Src = In.getOperand(0);
125 if (Src.getValueType().getSizeInBits() == 32)
126 return stripBitcast(Src);
127 }
128
129 return In;
130}
131
132} // end anonymous namespace
133
135 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
136 false)
138INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
140#ifdef EXPENSIVE_CHECKS
143#endif
145 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
146 false)
147
148/// This pass converts a legalized DAG into an AMDGPU-specific
149/// DAG, ready for instruction scheduling.
150FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
151 CodeGenOptLevel OptLevel) {
152 return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
153}
154
155AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
156 CodeGenOptLevel OptLevel)
157 : SelectionDAGISel(TM, OptLevel) {}
158
159bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
160 Subtarget = &MF.getSubtarget<GCNSubtarget>();
161 Subtarget->checkSubtargetFeatures(MF.getFunction());
162 Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
163 return SelectionDAGISel::runOnMachineFunction(MF);
164}
165
166bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
167 // XXX - only need to list legal operations.
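 // Returning true means a 16-bit result produced by Opc is known to write
 // zeros into bits 31:16 of its 32-bit destination register on this subtarget.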
168 switch (Opc) {
169 case ISD::FADD:
170 case ISD::FSUB:
171 case ISD::FMUL:
172 case ISD::FDIV:
173 case ISD::FREM:
175 case ISD::UINT_TO_FP:
176 case ISD::SINT_TO_FP:
177 case ISD::FABS:
178 // Fabs is lowered to a bit operation, but it's an and which will clear the
179 // high bits anyway.
180 case ISD::FSQRT:
181 case ISD::FSIN:
182 case ISD::FCOS:
183 case ISD::FPOWI:
184 case ISD::FPOW:
185 case ISD::FLOG:
186 case ISD::FLOG2:
187 case ISD::FLOG10:
188 case ISD::FEXP:
189 case ISD::FEXP2:
190 case ISD::FCEIL:
191 case ISD::FTRUNC:
192 case ISD::FRINT:
193 case ISD::FNEARBYINT:
194 case ISD::FROUNDEVEN:
195 case ISD::FROUND:
196 case ISD::FFLOOR:
197 case ISD::FMINNUM:
198 case ISD::FMAXNUM:
199 case ISD::FLDEXP:
200 case AMDGPUISD::FRACT:
201 case AMDGPUISD::CLAMP:
204 case AMDGPUISD::FMIN3:
205 case AMDGPUISD::FMAX3:
206 case AMDGPUISD::FMED3:
208 case AMDGPUISD::RCP:
209 case AMDGPUISD::RSQ:
211 // On gfx10, all 16-bit instructions preserve the high bits.
212 return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
213 case ISD::FP_ROUND:
214 // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
215 // high bits on gfx9.
216 // TODO: If we had the source node we could see if the source was fma/mad
218 case ISD::FMA:
219 case ISD::FMAD:
222 default:
223 // fcopysign, select and others may be lowered to 32-bit bit operations
224 // which don't zero the high bits.
225 return false;
226 }
227}
228
229bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
230#ifdef EXPENSIVE_CHECKS
231 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
232 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
233 for (auto &L : LI->getLoopsInPreorder()) {
234 assert(L->isLCSSAForm(DT));
235 }
236#endif
237 return SelectionDAGISelLegacy::runOnMachineFunction(MF);
238}
239
243#ifdef EXPENSIVE_CHECKS
246#endif
248}
249
250bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
251 assert(Subtarget->d16PreservesUnusedBits());
252 MVT VT = N->getValueType(0).getSimpleVT();
253 if (VT != MVT::v2i16 && VT != MVT::v2f16)
254 return false;
255
256 SDValue Lo = N->getOperand(0);
257 SDValue Hi = N->getOperand(1);
258
259 LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
260
261 // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
262 // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
263 // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
264
265 // Need to check for possible indirect dependencies on the other half of the
266 // vector to avoid introducing a cycle.
267 if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
268 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
269
270 SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
271 SDValue Ops[] = {
272 LdHi->getChain(), LdHi->getBasePtr(), TiedIn
273 };
274
275 unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
276 if (LdHi->getMemoryVT() == MVT::i8) {
277 LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
278 AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
279 } else {
280 assert(LdHi->getMemoryVT() == MVT::i16);
281 }
282
283 SDValue NewLoadHi =
284 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
285 Ops, LdHi->getMemoryVT(),
286 LdHi->getMemOperand());
287
288 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
289 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
290 return true;
291 }
292
293 // build_vector (load ptr), hi -> load_d16_lo ptr, hi
294 // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
295 // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
296 LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
297 if (LdLo && Lo.hasOneUse()) {
298 SDValue TiedIn = getHi16Elt(Hi);
299 if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
300 return false;
301
302 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
303 unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
304 if (LdLo->getMemoryVT() == MVT::i8) {
305 LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
306 AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
307 } else {
308 assert(LdLo->getMemoryVT() == MVT::i16);
309 }
310
311 TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
312
313 SDValue Ops[] = {
314 LdLo->getChain(), LdLo->getBasePtr(), TiedIn
315 };
316
317 SDValue NewLoadLo =
318 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
319 Ops, LdLo->getMemoryVT(),
320 LdLo->getMemOperand());
321
322 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
323 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
324 return true;
325 }
326
327 return false;
328}
329
330void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
331 if (!Subtarget->d16PreservesUnusedBits())
332 return;
333
334 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
335
336 bool MadeChange = false;
337 while (Position != CurDAG->allnodes_begin()) {
338 SDNode *N = &*--Position;
339 if (N->use_empty())
340 continue;
341
342 switch (N->getOpcode()) {
343 case ISD::BUILD_VECTOR:
344 // TODO: Match load d16 from shl (extload:i16), 16
345 MadeChange |= matchLoadD16FromBuildVector(N);
346 break;
347 default:
348 break;
349 }
350 }
351
352 if (MadeChange) {
353 CurDAG->RemoveDeadNodes();
354 LLVM_DEBUG(dbgs() << "After PreProcess:\n";
355 CurDAG->dump(););
356 }
357}
358
359bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
360 if (N->isUndef())
361 return true;
362
363 const SIInstrInfo *TII = Subtarget->getInstrInfo();
364 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
365 return TII->isInlineConstant(C->getAPIntValue());
366
367 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
368 return TII->isInlineConstant(C->getValueAPF());
369
370 return false;
371}
372
373/// Determine the register class for \p OpNo
374/// \returns The register class of the virtual register that will be used for
375/// the given operand number \p OpNo or NULL if the register class cannot be
376/// determined.
377const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
378 unsigned OpNo) const {
379 if (!N->isMachineOpcode()) {
380 if (N->getOpcode() == ISD::CopyToReg) {
381 Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
382 if (Reg.isVirtual()) {
383 MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
384 return MRI.getRegClass(Reg);
385 }
386
387 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
388 return TRI->getPhysRegBaseClass(Reg);
389 }
390
391 return nullptr;
392 }
393
394 switch (N->getMachineOpcode()) {
395 default: {
396 const MCInstrDesc &Desc =
397 Subtarget->getInstrInfo()->get(N->getMachineOpcode());
398 unsigned OpIdx = Desc.getNumDefs() + OpNo;
399 if (OpIdx >= Desc.getNumOperands())
400 return nullptr;
401 int RegClass = Desc.operands()[OpIdx].RegClass;
402 if (RegClass == -1)
403 return nullptr;
404
405 return Subtarget->getRegisterInfo()->getRegClass(RegClass);
406 }
407 case AMDGPU::REG_SEQUENCE: {
408 unsigned RCID = N->getConstantOperandVal(0);
409 const TargetRegisterClass *SuperRC =
410 Subtarget->getRegisterInfo()->getRegClass(RCID);
411
412 SDValue SubRegOp = N->getOperand(OpNo + 1);
413 unsigned SubRegIdx = SubRegOp->getAsZExtVal();
414 return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
415 SubRegIdx);
416 }
417 }
418}
419
420SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
421 SDValue Glue) const {
422 SmallVector <SDValue, 8> Ops;
423 Ops.push_back(NewChain); // Replace the chain.
424 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
425 Ops.push_back(N->getOperand(i));
426
427 Ops.push_back(Glue);
428 return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
429}
430
431SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
432 const SITargetLowering& Lowering =
433 *static_cast<const SITargetLowering*>(getTargetLowering());
434
435 assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
436
437 SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
438 return glueCopyToOp(N, M0, M0.getValue(1));
439}
440
441SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
442 unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
443 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
444 if (Subtarget->ldsRequiresM0Init())
445 return glueCopyToM0(
446 N, CurDAG->getSignedTargetConstant(-1, SDLoc(N), MVT::i32));
447 } else if (AS == AMDGPUAS::REGION_ADDRESS) {
448 MachineFunction &MF = CurDAG->getMachineFunction();
449 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
450 return
451 glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
452 }
453 return N;
454}
455
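// Materialize a 64-bit scalar immediate as two S_MOV_B32 halves combined with
// a REG_SEQUENCE; e.g. 0x0000000100000002 becomes S_MOV_B32 2 into sub0 and
// S_MOV_B32 1 into sub1.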
456MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
457 EVT VT) const {
458 SDNode *Lo = CurDAG->getMachineNode(
459 AMDGPU::S_MOV_B32, DL, MVT::i32,
460 CurDAG->getTargetConstant(Lo_32(Imm), DL, MVT::i32));
461 SDNode *Hi = CurDAG->getMachineNode(
462 AMDGPU::S_MOV_B32, DL, MVT::i32,
463 CurDAG->getTargetConstant(Hi_32(Imm), DL, MVT::i32));
464 const SDValue Ops[] = {
465 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
466 SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
467 SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
468
469 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
470}
471
472void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
473 EVT VT = N->getValueType(0);
474 unsigned NumVectorElts = VT.getVectorNumElements();
475 EVT EltVT = VT.getVectorElementType();
476 SDLoc DL(N);
477 SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
478
479 if (NumVectorElts == 1) {
480 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
481 RegClass);
482 return;
483 }
484
485 bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
486 if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 &&
488 uint64_t C = 0;
489 bool AllConst = true;
490 unsigned EltSize = EltVT.getSizeInBits();
491 for (unsigned I = 0; I < NumVectorElts; ++I) {
492 SDValue Op = N->getOperand(I);
493 if (Op.isUndef()) {
494 AllConst = false;
495 break;
496 }
497 uint64_t Val;
498 if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Op)) {
499 Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
500 } else
501 Val = cast<ConstantSDNode>(Op)->getZExtValue();
502 C |= Val << (EltSize * I);
503 }
504 if (AllConst) {
505 SDValue CV = CurDAG->getTargetConstant(C, DL, MVT::i64);
506 MachineSDNode *Copy =
507 CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, VT, CV);
508 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, VT, SDValue(Copy, 0),
509 RegClass);
510 return;
511 }
512 }
513
514 assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
515 "supported yet");
516 // 32 = Max Num Vector Elements
517 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
518 // 1 = Vector Register Class
519 SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
520
521 RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
522 bool IsRegSeq = true;
523 unsigned NOps = N->getNumOperands();
524 for (unsigned i = 0; i < NOps; i++) {
525 // XXX: Why is this here?
526 if (isa<RegisterSDNode>(N->getOperand(i))) {
527 IsRegSeq = false;
528 break;
529 }
530 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
531 : R600RegisterInfo::getSubRegFromChannel(i);
532 RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
533 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
534 }
535 if (NOps != NumVectorElts) {
536 // Fill in the missing undef elements if this was a scalar_to_vector.
537 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
538 MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
539 DL, EltVT);
540 for (unsigned i = NOps; i < NumVectorElts; ++i) {
541 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
542 : R600RegisterInfo::getSubRegFromChannel(i);
543 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
544 RegSeqArgs[1 + (2 * i) + 1] =
545 CurDAG->getTargetConstant(Sub, DL, MVT::i32);
546 }
547 }
548
549 if (!IsRegSeq)
550 SelectCode(N);
551 CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
552}
553
555 EVT VT = N->getValueType(0);
556 EVT EltVT = VT.getVectorElementType();
557
558 // TODO: Handle 16-bit element vectors with even aligned masks.
559 if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
560 VT.getVectorNumElements() != 2) {
561 SelectCode(N);
562 return;
563 }
564
565 auto *SVN = cast<ShuffleVectorSDNode>(N);
566
567 SDValue Src0 = SVN->getOperand(0);
568 SDValue Src1 = SVN->getOperand(1);
569 ArrayRef<int> Mask = SVN->getMask();
570 SDLoc DL(N);
571
572 assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
573 Mask[0] < 4 && Mask[1] < 4);
574
575 SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
576 SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
577 unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
578 unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
579
580 if (Mask[0] < 0) {
581 Src0SubReg = Src1SubReg;
582 MachineSDNode *ImpDef =
583 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
584 VSrc0 = SDValue(ImpDef, 0);
585 }
586
587 if (Mask[1] < 0) {
588 Src1SubReg = Src0SubReg;
589 MachineSDNode *ImpDef =
590 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
591 VSrc1 = SDValue(ImpDef, 0);
592 }
593
594 // SGPR case needs to lower to copies.
595 //
596 // Also use subregister extract when we can directly blend the registers with
597 // a simple subregister copy.
598 //
599 // TODO: Maybe we should fold this out earlier
600 if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
601 Src1SubReg == AMDGPU::sub0) {
602 // The low element of the result always comes from src0.
603 // The high element of the result always comes from src1.
604 // op_sel selects the high half of src0.
605 // op_sel_hi selects the high half of src1.
606
607 unsigned Src0OpSel =
608 Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
609 unsigned Src1OpSel =
610 Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
611
612 // Enable op_sel_hi to avoid printing it. This should have no effect on the
613 // result.
614 Src0OpSel |= SISrcMods::OP_SEL_1;
615 Src1OpSel |= SISrcMods::OP_SEL_1;
616
617 SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32);
618 SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32);
619 SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32);
620
621 CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(),
622 {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
623 ZeroMods, // clamp
624 ZeroMods, // op_sel
625 ZeroMods, // op_sel_hi
626 ZeroMods, // neg_lo
627 ZeroMods}); // neg_hi
628 return;
629 }
630
631 SDValue ResultElt0 =
632 CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0);
633 SDValue ResultElt1 =
634 CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1);
635
636 const SDValue Ops[] = {
637 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
638 ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
639 ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
640 CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops);
641}
642
643void AMDGPUDAGToDAGISel::Select(SDNode *N) {
644 unsigned int Opc = N->getOpcode();
645 if (N->isMachineOpcode()) {
646 N->setNodeId(-1);
647 return; // Already selected.
648 }
649
650 // isa<MemSDNode> almost works but is slightly too permissive for some DS
651 // intrinsics.
652 if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
653 N = glueCopyToM0LDSInit(N);
654 SelectCode(N);
655 return;
656 }
657
658 switch (Opc) {
659 default:
660 break;
661 // We are selecting i64 ADD here instead of custom lower it during
662 // DAG legalization, so we can fold some i64 ADDs used for address
663 // calculation into the LOAD and STORE instructions.
664 case ISD::ADDC:
665 case ISD::ADDE:
666 case ISD::SUBC:
667 case ISD::SUBE: {
668 if (N->getValueType(0) != MVT::i64)
669 break;
670
671 SelectADD_SUB_I64(N);
672 return;
673 }
674 case ISD::UADDO_CARRY:
675 case ISD::USUBO_CARRY:
676 if (N->getValueType(0) != MVT::i32)
677 break;
678
679 SelectAddcSubb(N);
680 return;
681 case ISD::UADDO:
682 case ISD::USUBO: {
683 SelectUADDO_USUBO(N);
684 return;
685 }
686 case AMDGPUISD::FMUL_W_CHAIN: {
687 SelectFMUL_W_CHAIN(N);
688 return;
689 }
690 case AMDGPUISD::FMA_W_CHAIN: {
691 SelectFMA_W_CHAIN(N);
692 return;
693 }
694
695 case ISD::SCALAR_TO_VECTOR:
696 case ISD::BUILD_VECTOR: {
697 EVT VT = N->getValueType(0);
698 unsigned NumVectorElts = VT.getVectorNumElements();
699 if (VT.getScalarSizeInBits() == 16) {
700 if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
701 if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
702 ReplaceNode(N, Packed);
703 return;
704 }
705 }
706
707 break;
708 }
709
710 assert(VT.getVectorElementType().bitsEq(MVT::i32));
711 unsigned RegClassID =
712 SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
713 SelectBuildVector(N, RegClassID);
714 return;
715 }
718 return;
719 case ISD::BUILD_PAIR: {
720 SDValue RC, SubReg0, SubReg1;
721 SDLoc DL(N);
722 if (N->getValueType(0) == MVT::i128) {
723 RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
724 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
725 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
726 } else if (N->getValueType(0) == MVT::i64) {
727 RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
728 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
729 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
730 } else {
731 llvm_unreachable("Unhandled value type for BUILD_PAIR");
732 }
733 const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
734 N->getOperand(1), SubReg1 };
735 ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
736 N->getValueType(0), Ops));
737 return;
738 }
739
740 case ISD::Constant:
741 case ISD::ConstantFP: {
742 if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N) ||
743 Subtarget->has64BitLiterals())
744 break;
745
746 uint64_t Imm;
747 if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
748 Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
749 if (AMDGPU::isValid32BitLiteral(Imm, true))
750 break;
751 } else {
752 ConstantSDNode *C = cast<ConstantSDNode>(N);
753 Imm = C->getZExtValue();
754 if (AMDGPU::isValid32BitLiteral(Imm, false))
755 break;
756 }
757
758 SDLoc DL(N);
759 ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
760 return;
761 }
762 case AMDGPUISD::BFE_I32:
763 case AMDGPUISD::BFE_U32: {
764 // There is a scalar version available, but unlike the vector version which
765 // has a separate operand for the offset and width, the scalar version packs
766 // the width and offset into a single operand. Try to move to the scalar
767 // version if the offsets are constant, so that we can try to keep extended
768 // loads of kernel arguments in SGPRs.
769
770 // TODO: Technically we could try to pattern match scalar bitshifts of
771 // dynamic values, but it's probably not useful.
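 // For example, a BFE with offset 8 and width 16 of a uniform value can be
 // selected as S_BFE_U32 with the packed operand (16 << 16) | 8.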
772 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
773 if (!Offset)
774 break;
775
776 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
777 if (!Width)
778 break;
779
780 bool Signed = Opc == AMDGPUISD::BFE_I32;
781
782 uint32_t OffsetVal = Offset->getZExtValue();
783 uint32_t WidthVal = Width->getZExtValue();
784
785 ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
786 WidthVal));
787 return;
788 }
789 case AMDGPUISD::DIV_SCALE: {
790 SelectDIV_SCALE(N);
791 return;
792 }
793 case AMDGPUISD::MAD_I64_I32:
794 case AMDGPUISD::MAD_U64_U32: {
795 SelectMAD_64_32(N);
796 return;
797 }
798 case ISD::SMUL_LOHI:
799 case ISD::UMUL_LOHI:
800 return SelectMUL_LOHI(N);
801 case ISD::CopyToReg: {
802 const SITargetLowering& Lowering =
803 *static_cast<const SITargetLowering*>(getTargetLowering());
804 N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
805 break;
806 }
807 case ISD::AND:
808 case ISD::SRL:
809 case ISD::SRA:
810 case ISD::SIGN_EXTEND_INREG:
811 if (N->getValueType(0) != MVT::i32)
812 break;
813
814 SelectS_BFE(N);
815 return;
816 case ISD::BRCOND:
817 SelectBRCOND(N);
818 return;
819 case ISD::FP_EXTEND:
820 SelectFP_EXTEND(N);
821 return;
827 // Hack around using a legal type if f16 is illegal.
828 if (N->getValueType(0) == MVT::i32) {
829 MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
830 N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
831 { N->getOperand(0), N->getOperand(1) });
832 SelectCode(N);
833 return;
834 }
835
836 break;
837 }
838 case ISD::INTRINSIC_W_CHAIN: {
839 SelectINTRINSIC_W_CHAIN(N);
840 return;
841 }
842 case ISD::INTRINSIC_WO_CHAIN: {
843 SelectINTRINSIC_WO_CHAIN(N);
844 return;
845 }
846 case ISD::INTRINSIC_VOID: {
847 SelectINTRINSIC_VOID(N);
848 return;
849 }
850 case AMDGPUISD::WAVE_ADDRESS: {
851 SelectWAVE_ADDRESS(N);
852 return;
853 }
854 case ISD::STACKRESTORE: {
855 SelectSTACKRESTORE(N);
856 return;
857 }
858 }
859
860 SelectCode(N);
861}
862
863bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
864 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
865 const Instruction *Term = BB->getTerminator();
866 return Term->getMetadata("amdgpu.uniform") ||
867 Term->getMetadata("structurizecfg.uniform");
868}
869
870bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
871 unsigned ShAmtBits) const {
872 assert(N->getOpcode() == ISD::AND);
873
874 const APInt &RHS = N->getConstantOperandAPInt(1);
875 if (RHS.countr_one() >= ShAmtBits)
876 return true;
877
878 const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
879 return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
880}
881
882static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
883 SDValue &N0, SDValue &N1) {
884 if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
885 Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
886 // As we split 64-bit `or` earlier, it's a complicated pattern to match, i.e.
887 // (i64 (bitcast (v2i32 (build_vector
888 // (or (extract_vector_elt V, 0), OFFSET),
889 // (extract_vector_elt V, 1)))))
890 SDValue Lo = Addr.getOperand(0).getOperand(0);
891 if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
892 SDValue BaseLo = Lo.getOperand(0);
893 SDValue BaseHi = Addr.getOperand(0).getOperand(1);
894 // Check that split base (Lo and Hi) are extracted from the same one.
895 if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
896 BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
897 BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
898 // Lo is statically extracted from index 0.
899 isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
900 BaseLo.getConstantOperandVal(1) == 0 &&
901 // Hi is statically extracted from index 1.
902 isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
903 BaseHi.getConstantOperandVal(1) == 1) {
904 N0 = BaseLo.getOperand(0).getOperand(0);
905 N1 = Lo.getOperand(1);
906 return true;
907 }
908 }
909 }
910 return false;
911}
912
913bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
914 SDValue &RHS) const {
915 if (CurDAG->isBaseWithConstantOffset(Addr)) {
916 LHS = Addr.getOperand(0);
917 RHS = Addr.getOperand(1);
918 return true;
919 }
920
921 if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
922 assert(LHS && RHS && isa<ConstantSDNode>(RHS));
923 return true;
924 }
925
926 return false;
927}
928
929StringRef AMDGPUDAGToDAGISelLegacy::getPassName() const {
930 return "AMDGPU DAG->DAG Pattern Instruction Selection";
931}
932
933AMDGPUISelDAGToDAGPass::AMDGPUISelDAGToDAGPass(GCNTargetMachine &TM)
934 : SelectionDAGISelPass(
935 std::make_unique<AMDGPUDAGToDAGISel>(TM, TM.getOptLevel())) {}
936
937PreservedAnalyses
938AMDGPUISelDAGToDAGPass::run(MachineFunction &MF,
939 MachineFunctionAnalysisManager &MFAM) {
940#ifdef EXPENSIVE_CHECKS
941 auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
942 .getManager();
943 auto &F = MF.getFunction();
944 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
945 LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
946 for (auto &L : LI.getLoopsInPreorder())
947 assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
948#endif
949 return SelectionDAGISelPass::run(MF, MFAM);
950}
951
952//===----------------------------------------------------------------------===//
953// Complex Patterns
954//===----------------------------------------------------------------------===//
955
956bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
957 SDValue &Offset) {
958 return false;
959}
960
961bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
962 SDValue &Offset) {
963 ConstantSDNode *C;
964 SDLoc DL(Addr);
965
966 if ((C = dyn_cast<ConstantSDNode>(Addr))) {
967 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
968 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
969 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
970 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
971 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
972 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
973 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
974 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
975 Base = Addr.getOperand(0);
976 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
977 } else {
978 Base = Addr;
979 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
980 }
981
982 return true;
983}
984
985SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
986 const SDLoc &DL) const {
987 SDNode *Mov = CurDAG->getMachineNode(
988 AMDGPU::S_MOV_B32, DL, MVT::i32,
989 CurDAG->getTargetConstant(Val, DL, MVT::i32));
990 return SDValue(Mov, 0);
991}
992
993// FIXME: Should only handle uaddo_carry/usubo_carry
994void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
995 SDLoc DL(N);
996 SDValue LHS = N->getOperand(0);
997 SDValue RHS = N->getOperand(1);
998
999 unsigned Opcode = N->getOpcode();
1000 bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
1001 bool ProduceCarry =
1002 ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
1003 bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;
1004
1005 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1006 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1007
1008 SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1009 DL, MVT::i32, LHS, Sub0);
1010 SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1011 DL, MVT::i32, LHS, Sub1);
1012
1013 SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1014 DL, MVT::i32, RHS, Sub0);
1015 SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1016 DL, MVT::i32, RHS, Sub1);
1017
1018 SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
1019
1020 static const unsigned OpcMap[2][2][2] = {
1021 {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
1022 {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
1023 {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
1024 {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
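 // Indexed as OpcMap[consumes carry-in][result is divergent][is add].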
1025
1026 unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
1027 unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
1028
1029 SDNode *AddLo;
1030 if (!ConsumeCarry) {
1031 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
1032 AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
1033 } else {
1034 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
1035 AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
1036 }
1037 SDValue AddHiArgs[] = {
1038 SDValue(Hi0, 0),
1039 SDValue(Hi1, 0),
1040 SDValue(AddLo, 1)
1041 };
1042 SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);
1043
1044 SDValue RegSequenceArgs[] = {
1045 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
1046 SDValue(AddLo,0),
1047 Sub0,
1048 SDValue(AddHi,0),
1049 Sub1,
1050 };
1051 SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1052 MVT::i64, RegSequenceArgs);
1053
1054 if (ProduceCarry) {
1055 // Replace the carry-use
1056 ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
1057 }
1058
1059 // Replace the remaining uses.
1060 ReplaceNode(N, RegSequence);
1061}
1062
1063void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
1064 SDValue LHS = N->getOperand(0);
1065 SDValue RHS = N->getOperand(1);
1066 SDValue CI = N->getOperand(2);
1067
1068 if (N->isDivergent()) {
1069 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
1070 : AMDGPU::V_SUBB_U32_e64;
1071 CurDAG->SelectNodeTo(
1072 N, Opc, N->getVTList(),
1073 {LHS, RHS, CI,
1074 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1075 } else {
1076 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
1077 : AMDGPU::S_SUB_CO_PSEUDO;
1078 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
1079 }
1080}
1081
1082void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
1083 // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
1084 // carry out despite the _i32 name. These were renamed in VI to _U32.
1085 // FIXME: We should probably rename the opcodes here.
1086 bool IsAdd = N->getOpcode() == ISD::UADDO;
1087 bool IsVALU = N->isDivergent();
1088
1089 for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
1090 ++UI)
1091 if (UI.getUse().getResNo() == 1) {
1092 if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
1093 (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
1094 IsVALU = true;
1095 break;
1096 }
1097 }
1098
1099 if (IsVALU) {
1100 unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
1101
1102 CurDAG->SelectNodeTo(
1103 N, Opc, N->getVTList(),
1104 {N->getOperand(0), N->getOperand(1),
1105 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1106 } else {
1107 unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
1108 : AMDGPU::S_USUBO_PSEUDO;
1109
1110 CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
1111 {N->getOperand(0), N->getOperand(1)});
1112 }
1113}
1114
1115void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
1116 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
1117 SDValue Ops[10];
1118
1119 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
1120 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1121 SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
1122 Ops[8] = N->getOperand(0);
1123 Ops[9] = N->getOperand(4);
1124
1125 // If there are no source modifiers, prefer fmac over fma because it can use
1126 // the smaller VOP2 encoding.
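 // The _e64 form is still created here; it can typically be shrunk to the
 // VOP2 encoding later when the operands allow it.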
1127 bool UseFMAC = Subtarget->hasDLInsts() &&
1128 cast<ConstantSDNode>(Ops[0])->isZero() &&
1129 cast<ConstantSDNode>(Ops[2])->isZero() &&
1130 cast<ConstantSDNode>(Ops[4])->isZero();
1131 unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
1132 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
1133}
1134
1135void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
1136 // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
1137 SDValue Ops[8];
1138
1139 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
1140 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1141 Ops[6] = N->getOperand(0);
1142 Ops[7] = N->getOperand(3);
1143
1144 CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
1145}
1146
1147// We need to handle this here because tablegen doesn't support matching
1148// instructions with multiple outputs.
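// V_DIV_SCALE_F32/F64 define both the scaled result and VCC, so the node is
// selected by hand instead of through a TableGen pattern.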
1149void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
1150 EVT VT = N->getValueType(0);
1151
1152 assert(VT == MVT::f32 || VT == MVT::f64);
1153
1154 unsigned Opc
1155 = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
1156
1157 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
1158 // omod
1159 SDValue Ops[8];
1160 SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
1161 SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
1162 SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
1163 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1164}
1165
1166// We need to handle this here because tablegen doesn't support matching
1167// instructions with multiple outputs.
1168void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
1169 SDLoc SL(N);
1170 bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1171 unsigned Opc;
1172 bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(1);
1173 if (Subtarget->hasMADIntraFwdBug())
1174 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1175 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1176 else if (UseNoCarry)
1177 Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
1178 else
1179 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1180
1181 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1182 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1183 Clamp };
1184
1185 if (UseNoCarry) {
1186 MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops);
1187 ReplaceUses(SDValue(N, 0), SDValue(Mad, 0));
1188 CurDAG->RemoveDeadNode(N);
1189 return;
1190 }
1191
1192 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1193}
1194
1195// We need to handle this here because tablegen doesn't support matching
1196// instructions with multiple outputs.
1197void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1198 SDLoc SL(N);
1199 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1200 SDVTList VTList;
1201 unsigned Opc;
1202 if (Subtarget->hasMadU64U32NoCarry()) {
1203 VTList = CurDAG->getVTList(MVT::i64);
1204 Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
1205 } else {
1206 VTList = CurDAG->getVTList(MVT::i64, MVT::i1);
1207 if (Subtarget->hasMADIntraFwdBug()) {
1208 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1209 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1210 } else {
1211 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1212 }
1213 }
1214
1215 SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
1216 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1217 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
1218 SDNode *Mad = CurDAG->getMachineNode(Opc, SL, VTList, Ops);
1219 if (!SDValue(N, 0).use_empty()) {
1220 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
1221 SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1222 MVT::i32, SDValue(Mad, 0), Sub0);
1223 ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
1224 }
1225 if (!SDValue(N, 1).use_empty()) {
1226 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
1227 SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1228 MVT::i32, SDValue(Mad, 0), Sub1);
1229 ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
1230 }
1231 CurDAG->RemoveDeadNode(N);
1232}
1233
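// A DS immediate offset must fit in 16 unsigned bits, and unless the subtarget
// guarantees a usable offset (or unsafe folding is enabled) the base must be
// known non-negative; see the note below about Southern Islands.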
1234bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1235 if (!isUInt<16>(Offset))
1236 return false;
1237
1238 if (!Base || Subtarget->hasUsableDSOffset() ||
1239 Subtarget->unsafeDSOffsetFoldingEnabled())
1240 return true;
1241
1242 // On Southern Islands, instructions with a negative base value and an offset
1243 // don't seem to work.
1244 return CurDAG->SignBitIsZero(Base);
1245}
1246
1247bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1248 SDValue &Offset) const {
1249 SDLoc DL(Addr);
1250 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1251 SDValue N0 = Addr.getOperand(0);
1252 SDValue N1 = Addr.getOperand(1);
1253 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1254 if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1255 // (add n0, c0)
1256 Base = N0;
1257 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1258 return true;
1259 }
1260 } else if (Addr.getOpcode() == ISD::SUB) {
1261 // sub C, x -> add (sub 0, x), C
1262 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1263 int64_t ByteOffset = C->getSExtValue();
1264 if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1265 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1266
1267 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1268 // the known bits in isDSOffsetLegal. We need to emit the selected node
1269 // here, so this is thrown away.
1270 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1271 Zero, Addr.getOperand(1));
1272
1273 if (isDSOffsetLegal(Sub, ByteOffset)) {
1274 SmallVector<SDValue, 3> Opnds;
1275 Opnds.push_back(Zero);
1276 Opnds.push_back(Addr.getOperand(1));
1277
1278 // FIXME: Select to VOP3 version for with-carry.
1279 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1280 if (Subtarget->hasAddNoCarry()) {
1281 SubOp = AMDGPU::V_SUB_U32_e64;
1282 Opnds.push_back(
1283 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1284 }
1285
1286 MachineSDNode *MachineSub =
1287 CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1288
1289 Base = SDValue(MachineSub, 0);
1290 Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1291 return true;
1292 }
1293 }
1294 }
1295 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1296 // If we have a constant address, prefer to put the constant into the
1297 // offset. This can save moves to load the constant address since multiple
1298 // operations can share the zero base address register, and enables merging
1299 // into read2 / write2 instructions.
1300
1301 SDLoc DL(Addr);
1302
1303 if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1304 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1305 MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1306 DL, MVT::i32, Zero);
1307 Base = SDValue(MovZero, 0);
1308 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1309 return true;
1310 }
1311 }
1312
1313 // default case
1314 Base = Addr;
1315 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1316 return true;
1317}
1318
1319bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1320 unsigned Offset1,
1321 unsigned Size) const {
1322 if (Offset0 % Size != 0 || Offset1 % Size != 0)
1323 return false;
1324 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1325 return false;
1326
1327 if (!Base || Subtarget->hasUsableDSOffset() ||
1328 Subtarget->unsafeDSOffsetFoldingEnabled())
1329 return true;
1330
1331 // On Southern Islands, instructions with a negative base value and an offset
1332 // don't seem to work.
1333 return CurDAG->SignBitIsZero(Base);
1334}
1335
1336// Return whether the operation has NoUnsignedWrap property.
1337static bool isNoUnsignedWrap(SDValue Addr) {
1338 return (Addr.getOpcode() == ISD::ADD &&
1339 Addr->getFlags().hasNoUnsignedWrap()) ||
1340 Addr->getOpcode() == ISD::OR;
1341}
1342
1343// Check that the base address of flat scratch load/store in the form of `base +
1344// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
1345// requirement). We always treat the first operand as the base address here.
1346bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
1347 if (isNoUnsignedWrap(Addr))
1348 return true;
1349
1350 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1351 // values.
1352 if (Subtarget->hasSignedScratchOffsets())
1353 return true;
1354
1355 auto LHS = Addr.getOperand(0);
1356 auto RHS = Addr.getOperand(1);
1357
1358 // If the immediate offset is negative and within certain range, the base
1359 // address cannot also be negative. If the base is also negative, the sum
1360 // would be either negative or much larger than the valid range of scratch
1361 // memory a thread can access.
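 // For example, with an immediate offset of -4096 any in-range final address
 // requires a base of at least 4096, so the base's sign bit must already be
 // clear and it can be treated as unsigned.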
1362 ConstantSDNode *ImmOp = nullptr;
1363 if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
1364 if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
1365 return true;
1366 }
1367
1368 return CurDAG->SignBitIsZero(LHS);
1369}
1370
1371// Check that the address values in SGPR/VGPR are legal for flat scratch in
1372// the form: SGPR + VGPR.
1373bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1374 if (isNoUnsignedWrap(Addr))
1375 return true;
1376
1377 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1378 // values.
1379 if (Subtarget->hasSignedScratchOffsets())
1380 return true;
1381
1382 auto LHS = Addr.getOperand(0);
1383 auto RHS = Addr.getOperand(1);
1384 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1385}
1386
1387// Check that the address values in SGPR/VGPR are legal for flat scratch in
1388// the form: SGPR + VGPR + Imm.
1389bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
1390 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1391 // values.
1392 if (AMDGPU::isGFX12Plus(*Subtarget))
1393 return true;
1394
1395 auto Base = Addr.getOperand(0);
1396 auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
1397 // If the immediate offset is negative and within certain range, the base
1398 // address cannot also be negative. If the base is also negative, the sum
1399 // would be either negative or much larger than the valid range of scratch
1400 // memory a thread can access.
1401 if (isNoUnsignedWrap(Base) &&
1402 (isNoUnsignedWrap(Addr) ||
1403 (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
1404 return true;
1405
1406 auto LHS = Base.getOperand(0);
1407 auto RHS = Base.getOperand(1);
1408 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1409}
1410
1411// TODO: If offset is too big, put low 16-bit into offset.
1412bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1413 SDValue &Offset0,
1414 SDValue &Offset1) const {
1415 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
1416}
1417
1418bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1419 SDValue &Offset0,
1420 SDValue &Offset1) const {
1421 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
1422}
1423
1424bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1425 SDValue &Offset0, SDValue &Offset1,
1426 unsigned Size) const {
1427 SDLoc DL(Addr);
1428
1429 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1430 SDValue N0 = Addr.getOperand(0);
1431 SDValue N1 = Addr.getOperand(1);
1432 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1433 unsigned OffsetValue0 = C1->getZExtValue();
1434 unsigned OffsetValue1 = OffsetValue0 + Size;
1435
1436 // (add n0, c0)
1437 if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1438 Base = N0;
1439 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1440 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1441 return true;
1442 }
1443 } else if (Addr.getOpcode() == ISD::SUB) {
1444 // sub C, x -> add (sub 0, x), C
1445 if (const ConstantSDNode *C =
1446 dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1447 unsigned OffsetValue0 = C->getZExtValue();
1448 unsigned OffsetValue1 = OffsetValue0 + Size;
1449
1450 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1451 SDLoc DL(Addr);
1452 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1453
1454 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1455 // the known bits in isDSOffsetLegal. We need to emit the selected node
1456 // here, so this is thrown away.
1457 SDValue Sub =
1458 CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1459
1460 if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
1461 SmallVector<SDValue, 3> Opnds;
1462 Opnds.push_back(Zero);
1463 Opnds.push_back(Addr.getOperand(1));
1464 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1465 if (Subtarget->hasAddNoCarry()) {
1466 SubOp = AMDGPU::V_SUB_U32_e64;
1467 Opnds.push_back(
1468 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1469 }
1470
1471 MachineSDNode *MachineSub = CurDAG->getMachineNode(
1472 SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1473
1474 Base = SDValue(MachineSub, 0);
1475 Offset0 =
1476 CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1477 Offset1 =
1478 CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1479 return true;
1480 }
1481 }
1482 }
1483 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1484 unsigned OffsetValue0 = CAddr->getZExtValue();
1485 unsigned OffsetValue1 = OffsetValue0 + Size;
1486
1487 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1488 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1489 MachineSDNode *MovZero =
1490 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1491 Base = SDValue(MovZero, 0);
1492 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1493 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1494 return true;
1495 }
1496 }
1497
1498 // default case
1499
1500 Base = Addr;
1501 Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
1502 Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
1503 return true;
1504}
1505
1506bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
1507 SDValue &SOffset, SDValue &Offset,
1508 SDValue &Offen, SDValue &Idxen,
1509 SDValue &Addr64) const {
1510 // Subtarget prefers to use flat instruction
1511 // FIXME: This should be a pattern predicate and not reach here
1512 if (Subtarget->useFlatForGlobal())
1513 return false;
1514
1515 SDLoc DL(Addr);
1516
1517 Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1518 Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1519 Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
1520 SOffset = Subtarget->hasRestrictedSOffset()
1521 ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
1522 : CurDAG->getTargetConstant(0, DL, MVT::i32);
1523
1524 ConstantSDNode *C1 = nullptr;
1525 SDValue N0 = Addr;
1526 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1527 C1 = cast<ConstantSDNode>(Addr.getOperand(1));
1528 if (isUInt<32>(C1->getZExtValue()))
1529 N0 = Addr.getOperand(0);
1530 else
1531 C1 = nullptr;
1532 }
1533
1534 if (N0.getOpcode() == ISD::ADD) {
1535 // (add N2, N3) -> addr64, or
1536 // (add (add N2, N3), C1) -> addr64
1537 SDValue N2 = N0.getOperand(0);
1538 SDValue N3 = N0.getOperand(1);
1539 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1540
1541 if (N2->isDivergent()) {
1542 if (N3->isDivergent()) {
1543 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1544 // addr64, and construct the resource from a 0 address.
1545 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1546 VAddr = N0;
1547 } else {
1548 // N2 is divergent, N3 is not.
1549 Ptr = N3;
1550 VAddr = N2;
1551 }
1552 } else {
1553 // N2 is not divergent.
1554 Ptr = N2;
1555 VAddr = N3;
1556 }
1557 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1558 } else if (N0->isDivergent()) {
1559 // N0 is divergent. Use it as the addr64, and construct the resource from a
1560 // 0 address.
1561 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1562 VAddr = N0;
1563 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1564 } else {
1565 // N0 -> offset, or
1566 // (N0 + C1) -> offset
1567 VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1568 Ptr = N0;
1569 }
1570
1571 if (!C1) {
1572 // No offset.
1573 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1574 return true;
1575 }
1576
1577 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1578 if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
1579 // Legal offset for instruction.
1580 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
1581 return true;
1582 }
1583
1584 // Illegal offset, store it in soffset.
1585 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1586 SOffset =
1587 SDValue(CurDAG->getMachineNode(
1588 AMDGPU::S_MOV_B32, DL, MVT::i32,
1589 CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
1590 0);
1591 return true;
1592}
1593
1594bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1595 SDValue &VAddr, SDValue &SOffset,
1596 SDValue &Offset) const {
1597 SDValue Ptr, Offen, Idxen, Addr64;
1598
1599 // addr64 bit was removed for volcanic islands.
1600 // FIXME: This should be a pattern predicate and not reach here
1601 if (!Subtarget->hasAddr64())
1602 return false;
1603
1604 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1605 return false;
1606
1607 ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1608 if (C->getSExtValue()) {
1609 SDLoc DL(Addr);
1610
1611 const SITargetLowering& Lowering =
1612 *static_cast<const SITargetLowering*>(getTargetLowering());
1613
1614 SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1615 return true;
1616 }
1617
1618 return false;
1619}
1620
1621std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1622 SDLoc DL(N);
1623
1624 auto *FI = dyn_cast<FrameIndexSDNode>(N);
1625 SDValue TFI =
1626 FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1627
1628 // We rebase the base address into an absolute stack address and hence
1629 // use constant 0 for soffset. This value must be retained until
1630 // frame elimination and eliminateFrameIndex will choose the appropriate
1631 // frame register if need be.
1632 return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1633}
1634
1635bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1636 SDValue Addr, SDValue &Rsrc,
1637 SDValue &VAddr, SDValue &SOffset,
1638 SDValue &ImmOffset) const {
1639
1640 SDLoc DL(Addr);
1641 MachineFunction &MF = CurDAG->getMachineFunction();
1642 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1643
1644 Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1645
1646 if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1647 int64_t Imm = CAddr->getSExtValue();
1648 const int64_t NullPtr =
1649 AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
1650 // Don't fold null pointer.
1651 if (Imm != NullPtr) {
1652 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
1653 SDValue HighBits =
1654 CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
1655 MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1656 AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1657 VAddr = SDValue(MovHighBits, 0);
1658
1659 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1660 ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
1661 return true;
1662 }
1663 }
1664
1665 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1666 // (add n0, c1)
1667
1668 SDValue N0 = Addr.getOperand(0);
1669 uint64_t C1 = Addr.getConstantOperandVal(1);
1670
1671 // Offsets in vaddr must be positive if range checking is enabled.
1672 //
1673 // The total computation of vaddr + soffset + offset must not overflow. If
1674 // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1675 // overflowing.
1676 //
1677 // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1678 // always perform a range check. If a negative vaddr base index was used,
1679 // this would fail the range check. The overall address computation would
1680 // compute a valid address, but this doesn't happen due to the range
1681 // check. For out-of-bounds MUBUF loads, a 0 is returned.
1682 //
1683 // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1684 // MUBUF vaddr, but not on older subtargets which can only do this if the
1685 // sign bit is known 0.
1686 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1687 if (TII->isLegalMUBUFImmOffset(C1) &&
1688 (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1689 CurDAG->SignBitIsZero(N0))) {
1690 std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1691 ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
1692 return true;
1693 }
1694 }
1695
1696 // (node)
1697 std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1698 ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1699 return true;
1700}
1701
1702static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1703 if (Val.getOpcode() != ISD::CopyFromReg)
1704 return false;
1705 auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
1706 if (!Reg.isPhysical())
1707 return false;
1708 const auto *RC = TRI.getPhysRegBaseClass(Reg);
1709 return RC && TRI.isSGPRClass(RC);
1710}
1711
1712bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1713 SDValue Addr,
1714 SDValue &SRsrc,
1715 SDValue &SOffset,
1716 SDValue &Offset) const {
1717 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
1718 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1719 MachineFunction &MF = CurDAG->getMachineFunction();
1720 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1721 SDLoc DL(Addr);
1722
1723 // CopyFromReg <sgpr>
1724 if (IsCopyFromSGPR(*TRI, Addr)) {
1725 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1726 SOffset = Addr;
1727 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1728 return true;
1729 }
1730
1731 ConstantSDNode *CAddr;
1732 if (Addr.getOpcode() == ISD::ADD) {
1733 // Add (CopyFromReg <sgpr>) <constant>
1734 CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1735 if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
1736 return false;
1737 if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
1738 return false;
1739
1740 SOffset = Addr.getOperand(0);
1741 } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
1742 TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
1743 // <constant>
1744 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1745 } else {
1746 return false;
1747 }
1748
1749 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1750
1751 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
1752 return true;
1753}
1754
1755bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1756 SDValue &SOffset, SDValue &Offset
1757 ) const {
1758 SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1759 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1760
1761 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1762 return false;
1763
1764 if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1765 !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1766 !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1767 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1768 maskTrailingOnes<uint64_t>(32); // Size
1769 SDLoc DL(Addr);
1770
1771 const SITargetLowering& Lowering =
1772 *static_cast<const SITargetLowering*>(getTargetLowering());
1773
1774 SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1775 return true;
1776 }
1777 return false;
1778}
1779
1780bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1781 SDValue &SOffset) const {
1782 if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
1783 SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
1784 return true;
1785 }
1786
1787 SOffset = ByteOffsetNode;
1788 return true;
1789}
1790
1791// Find a load or store from corresponding pattern root.
1792// Roots may be build_vector, bitconvert or their combinations.
1794static MemSDNode *findMemSDNode(SDNode *N) {
1795 if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1796 return MN;
1797 assert(isa<BuildVectorSDNode>(N));
1798 for (SDValue V : N->op_values())
1799 if (MemSDNode *MN =
1800 dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
1801 return MN;
1802 llvm_unreachable("cannot find MemSDNode in the pattern!");
1803}
1804
1805bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1806 SDValue &VAddr, SDValue &Offset,
1807 uint64_t FlatVariant) const {
1808 int64_t OffsetVal = 0;
1809
1810 unsigned AS = findMemSDNode(N)->getAddressSpace();
1811
1812 bool CanHaveFlatSegmentOffsetBug =
1813 Subtarget->hasFlatSegmentOffsetBug() &&
1814 FlatVariant == SIInstrFlags::FLAT &&
1815 (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1816
1817 if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1818 SDValue N0, N1;
1819 if (isBaseWithConstantOffset64(Addr, N0, N1) &&
1820 (FlatVariant != SIInstrFlags::FlatScratch ||
1821 isFlatScratchBaseLegal(Addr))) {
1822 int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1823
1824 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1825 if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1826 Addr = N0;
1827 OffsetVal = COffsetVal;
1828 } else {
1829 // If the offset doesn't fit, put the low bits into the offset field and
1830 // add the rest.
1831 //
1832 // For a FLAT instruction the hardware decides whether to access
1833 // global/scratch/shared memory based on the high bits of vaddr,
1834 // ignoring the offset field, so we have to ensure that when we add
1835 // remainder to vaddr it still points into the same underlying object.
1836 // The easiest way to do that is to make sure that we split the offset
1837 // into two pieces that are both >= 0 or both <= 0.
1838
1839 SDLoc DL(N);
1840 uint64_t RemainderOffset;
1841
1842 std::tie(OffsetVal, RemainderOffset) =
1843 TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1844
1845 SDValue AddOffsetLo =
1846 getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1847 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1848
1849 if (Addr.getValueType().getSizeInBits() == 32) {
1850 SmallVector<SDValue, 3> Opnds;
1851 Opnds.push_back(N0);
1852 Opnds.push_back(AddOffsetLo);
1853 unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1854 if (Subtarget->hasAddNoCarry()) {
1855 AddOp = AMDGPU::V_ADD_U32_e64;
1856 Opnds.push_back(Clamp);
1857 }
1858 Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1859 } else {
1860 // TODO: Should this try to use a scalar add pseudo if the base address
1861 // is uniform and saddr is usable?
1862 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1863 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1864
1865 SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1866 DL, MVT::i32, N0, Sub0);
1867 SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1868 DL, MVT::i32, N0, Sub1);
1869
1870 SDValue AddOffsetHi =
1871 getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1872
1873 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1874
1875 SDNode *Add =
1876 CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1877 {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1878
1879 SDNode *Addc = CurDAG->getMachineNode(
1880 AMDGPU::V_ADDC_U32_e64, DL, VTs,
1881 {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1882
1883 SDValue RegSequenceArgs[] = {
1884 CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
1885 SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1886
1887 Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1888 MVT::i64, RegSequenceArgs),
1889 0);
1890 }
1891 }
1892 }
1893 }
1894
1895 VAddr = Addr;
1896 Offset = CurDAG->getSignedTargetConstant(OffsetVal, SDLoc(), MVT::i32);
1897 return true;
1898}
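The comment above requires the two split pieces to share a sign so that adding the remainder to vaddr stays inside the same underlying object. A minimal arithmetic sketch of such a split, assuming a hypothetical signed immediate field of NumBits bits; TII->splitFlatOffset is the authoritative implementation and may differ in detail:

#include <cstdint>
#include <utility>

// Returns {imm, remainder} with imm + remainder == Off, |imm| encodable in
// a (NumBits)-bit signed field, and both pieces carrying the sign of Off.
static std::pair<int64_t, int64_t> splitFlatOffsetSketch(int64_t Off,
                                                         unsigned NumBits) {
  int64_t MaxImm = (int64_t(1) << (NumBits - 1)) - 1;
  int64_t Imm = Off % (MaxImm + 1); // C++ '%' truncates toward zero.
  return {Imm, Off - Imm};
}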
1899
1900bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1901 SDValue &VAddr,
1902 SDValue &Offset) const {
1903 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
1904}
1905
1906bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1907 SDValue &VAddr,
1908 SDValue &Offset) const {
1909 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
1910}
1911
1912bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1913 SDValue &VAddr,
1914 SDValue &Offset) const {
1915 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1916 SIInstrFlags::FlatScratch);
1917}
1918
1919// If this matches *_extend i32:x, return x.
1920// Otherwise, if the value is already i32, return it.
1921static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned,
1922 const SelectionDAG *DAG) {
1923 if (Op.getValueType() == MVT::i32)
1924 return Op;
1925
1926 if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) &&
1927 Op.getOpcode() != ISD::ANY_EXTEND &&
1928 !(DAG->SignBitIsZero(Op) &&
1929 Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND)))
1930 return SDValue();
1931
1932 SDValue ExtSrc = Op.getOperand(0);
1933 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1934}
1935
1936// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1937// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset)
1938bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
1939 SDValue &SAddr, SDValue &VOffset,
1940 SDValue &Offset, bool &ScaleOffset,
1941 bool NeedIOffset) const {
1942 int64_t ImmOffset = 0;
1943 ScaleOffset = false;
1944
1945 // Match the immediate offset first, which canonically is moved as low as
1946 // possible.
1947
1948 SDValue LHS, RHS;
1949 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1950 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1951 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1952
1953 if (NeedIOffset &&
1954 TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1955 SIInstrFlags::FlatGlobal)) {
1956 Addr = LHS;
1957 ImmOffset = COffsetVal;
1958 } else if (!LHS->isDivergent()) {
1959 if (COffsetVal > 0) {
1960 SDLoc SL(N);
1961 // saddr + large_offset -> saddr +
1962 // (voffset = large_offset & ~MaxOffset) +
1963 // (large_offset & MaxOffset);
1964 int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal;
1965 if (NeedIOffset) {
1966 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1967 COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
1968 }
1969
1970 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
1971 : isUInt<32>(RemainderOffset)) {
1972 SDNode *VMov = CurDAG->getMachineNode(
1973 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1974 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1975 VOffset = SDValue(VMov, 0);
1976 SAddr = LHS;
1977 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
1978 return true;
1979 }
1980 }
1981
1982 // We are adding a 64 bit SGPR and a constant. If constant bus limit
1983 // is 1 we would need to perform 1 or 2 extra moves for each half of
1984 // the constant and it is better to do a scalar add and then issue a
1985 // single VALU instruction to materialize zero. Otherwise it is less
1986 // instructions to perform VALU adds with immediates or inline literals.
1987 unsigned NumLiterals =
1988 !TII->isInlineConstant(APInt(32, Lo_32(COffsetVal))) +
1989 !TII->isInlineConstant(APInt(32, Hi_32(COffsetVal)));
1990 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
1991 return false;
1992 }
1993 }
1994
1995 // Match the variable offset.
1996 if (Addr.getOpcode() == ISD::ADD) {
1997 LHS = Addr.getOperand(0);
1998
1999 if (!LHS->isDivergent()) {
2000 // add (i64 sgpr), (*_extend (i32 vgpr))
2001 RHS = Addr.getOperand(1);
2002 ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset());
2003 if (SDValue ExtRHS = matchExtFromI32orI32(
2004 RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
2005 SAddr = LHS;
2006 VOffset = ExtRHS;
2007 }
2008 }
2009
2010 RHS = Addr.getOperand(1);
2011 if (!SAddr && !RHS->isDivergent()) {
2012 // add (*_extend (i32 vgpr)), (i64 sgpr)
2013 ScaleOffset = SelectScaleOffset(N, LHS, Subtarget->hasSignedGVSOffset());
2014 if (SDValue ExtLHS = matchExtFromI32orI32(
2015 LHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
2016 SAddr = RHS;
2017 VOffset = ExtLHS;
2018 }
2019 }
2020
2021 if (SAddr) {
2022 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2023 return true;
2024 }
2025 }
2026
2027 if (Subtarget->hasScaleOffset() &&
2028 (Addr.getOpcode() == (Subtarget->hasSignedGVSOffset()
2029 ? AMDGPUISD::MAD_I64_I32
2030 : AMDGPUISD::MAD_U64_U32) ||
2031 (Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 &&
2032 CurDAG->SignBitIsZero(Addr.getOperand(0)))) &&
2033 Addr.getOperand(0)->isDivergent() &&
2034 isa<ConstantSDNode>(Addr.getOperand(1)) &&
2035 !Addr.getOperand(2)->isDivergent()) {
2036 // mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr)
2037 unsigned Size =
2038 (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
2039 ScaleOffset = Addr.getConstantOperandVal(1) == Size;
2040 if (ScaleOffset) {
2041 SAddr = Addr.getOperand(2);
2042 VOffset = Addr.getOperand(0);
2043 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2044 return true;
2045 }
2046 }
2047
2048 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
2049 isa<ConstantSDNode>(Addr))
2050 return false;
2051
2052 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
2053 // moves required to copy a 64-bit SGPR to VGPR.
2054 SAddr = Addr;
2055 SDNode *VMov =
2056 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
2057 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
2058 VOffset = SDValue(VMov, 0);
2059 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2060 return true;
2061}
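The saddr + large_offset rewrite quoted in the comment inside this function splits a positive constant into a part that is materialized into a VGPR and a part that stays in the immediate field. A plain-integer sketch of that decomposition, assuming MaxOffset has the form 2^k - 1 (names are illustrative, not the selector's):

#include <cstdint>

struct GlobalSAddrSplitSketch {
  uint64_t VOffset; // materialized with V_MOV_B32
  uint64_t Imm;     // kept in the instruction's offset field
};

static GlobalSAddrSplitSketch splitLargeOffsetSketch(uint64_t LargeOffset,
                                                     uint64_t MaxOffset) {
  // VOffset + Imm == LargeOffset whenever MaxOffset is 2^k - 1.
  return {LargeOffset & ~MaxOffset, LargeOffset & MaxOffset};
}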
2062
2063bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
2064 SDValue &SAddr, SDValue &VOffset,
2065 SDValue &Offset,
2066 SDValue &CPol) const {
2067 bool ScaleOffset;
2068 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2069 return false;
2070
2071 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2072 SDLoc(), MVT::i32);
2073 return true;
2074}
2075
2076bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
2077 SDValue &SAddr, SDValue &VOffset,
2078 SDValue &Offset,
2079 SDValue &CPol) const {
2080 bool ScaleOffset;
2081 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2082 return false;
2083
2084 // We are assuming CPol is always the last operand of the intrinsic.
2085 auto PassedCPol =
2086 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2087 CPol = CurDAG->getTargetConstant(
2088 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2089 return true;
2090}
2091
2092bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
2093 SDValue &SAddr, SDValue &VOffset,
2094 SDValue &Offset,
2095 SDValue &CPol) const {
2096 bool ScaleOffset;
2097 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2098 return false;
2099
2100 unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC;
2101 CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32);
2102 return true;
2103}
2104
2105bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
2106 SDValue &SAddr,
2107 SDValue &VOffset,
2108 SDValue &CPol) const {
2109 bool ScaleOffset;
2110 SDValue DummyOffset;
2111 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2112 false))
2113 return false;
2114
2115 // We are assuming CPol is always the last operand of the intrinsic.
2116 auto PassedCPol =
2117 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2118 CPol = CurDAG->getTargetConstant(
2119 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2120 return true;
2121}
2122
2123static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
2124 if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
2125 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
2126 } else if (SAddr.getOpcode() == ISD::ADD &&
2127 isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
2128 // Materialize this into a scalar move for scalar address to avoid
2129 // readfirstlane.
2130 auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
2131 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
2132 FI->getValueType(0));
2133 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
2134 MVT::i32, TFI, SAddr.getOperand(1)),
2135 0);
2136 }
2137
2138 return SAddr;
2139}
2140
2141// Match (32-bit SGPR base) + sext(imm offset)
2142bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
2143 SDValue &SAddr,
2144 SDValue &Offset) const {
2145 if (Addr->isDivergent())
2146 return false;
2147
2148 SDLoc DL(Addr);
2149
2150 int64_t COffsetVal = 0;
2151
2152 if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
2153 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
2154 SAddr = Addr.getOperand(0);
2155 } else {
2156 SAddr = Addr;
2157 }
2158
2159 SAddr = SelectSAddrFI(CurDAG, SAddr);
2160
2161 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2162
2163 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2164 SIInstrFlags::FlatScratch)) {
2165 int64_t SplitImmOffset, RemainderOffset;
2166 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2167 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
2168
2169 COffsetVal = SplitImmOffset;
2170
2171 SDValue AddOffset =
2172 SAddr.getOpcode() == ISD::TargetFrameIndex
2173 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
2174 : CurDAG->getSignedTargetConstant(RemainderOffset, DL, MVT::i32);
2175 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
2176 SAddr, AddOffset),
2177 0);
2178 }
2179
2180 Offset = CurDAG->getSignedTargetConstant(COffsetVal, DL, MVT::i32);
2181
2182 return true;
2183}
2184
2185// Check whether the flat scratch SVS swizzle bug affects this access.
2186bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
2187 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
2188 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
2189 return false;
2190
2191 // The bug affects the swizzling of SVS accesses if there is any carry out
2192 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
2193 // voffset to (soffset + inst_offset).
2194 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
2195 KnownBits SKnown =
2196 KnownBits::add(CurDAG->computeKnownBits(SAddr),
2197 KnownBits::makeConstant(APInt(32, ImmOffset,
2198 /*isSigned=*/true)));
2199 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
2200 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
2201 return (VMax & 3) + (SMax & 3) >= 4;
2202}
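The check above conservatively reports the bug whenever a carry out of the two low-order bits cannot be ruled out: it adds the known-bits maxima of voffset and (soffset + inst_offset) and asks whether their low 2-bit fields can sum to 4 or more. The carry condition itself, in isolation (a sketch with invented names):

#include <cstdint>

// True if adding the two values can produce a carry from bit 1 into bit 2.
static bool carriesIntoBit2Sketch(uint64_t VOffMax, uint64_t SOffPlusImmMax) {
  return (VOffMax & 3) + (SOffPlusImmMax & 3) >= 4;
}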
2203
2204bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
2205 SDValue &VAddr, SDValue &SAddr,
2206 SDValue &Offset,
2207 SDValue &CPol) const {
2208 int64_t ImmOffset = 0;
2209
2210 SDValue LHS, RHS;
2211 SDValue OrigAddr = Addr;
2212 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
2213 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
2214 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2215
2216 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2217 SIInstrFlags::FlatScratch)) {
2218 Addr = LHS;
2219 ImmOffset = COffsetVal;
2220 } else if (!LHS->isDivergent() && COffsetVal > 0) {
2221 SDLoc SL(N);
2222 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
2223 // (large_offset & MaxOffset);
2224 int64_t SplitImmOffset, RemainderOffset;
2225 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2226 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
2227
2228 if (isUInt<32>(RemainderOffset)) {
2229 SDNode *VMov = CurDAG->getMachineNode(
2230 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
2231 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
2232 VAddr = SDValue(VMov, 0);
2233 SAddr = LHS;
2234 if (!isFlatScratchBaseLegal(Addr))
2235 return false;
2236 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
2237 return false;
2238 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
2239 CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2240 return true;
2241 }
2242 }
2243 }
2244
2245 if (Addr.getOpcode() != ISD::ADD)
2246 return false;
2247
2248 LHS = Addr.getOperand(0);
2249 RHS = Addr.getOperand(1);
2250
2251 if (!LHS->isDivergent() && RHS->isDivergent()) {
2252 SAddr = LHS;
2253 VAddr = RHS;
2254 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
2255 SAddr = RHS;
2256 VAddr = LHS;
2257 } else {
2258 return false;
2259 }
2260
2261 if (OrigAddr != Addr) {
2262 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
2263 return false;
2264 } else {
2265 if (!isFlatScratchBaseLegalSV(OrigAddr))
2266 return false;
2267 }
2268
2269 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
2270 return false;
2271 SAddr = SelectSAddrFI(CurDAG, SAddr);
2272 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2273
2274 bool ScaleOffset = SelectScaleOffset(N, VAddr, true /* IsSigned */);
2275 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2276 SDLoc(), MVT::i32);
2277 return true;
2278}
2279
2280// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2281// negative if the resulting (Offset + (M0 or SOffset or zero)) is negative.
2282// Handle the case where the Immediate Offset + SOffset is negative.
2283bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2284 bool Imm32Only,
2285 bool IsBuffer,
2286 int64_t ImmOffset) const {
2287 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2288 AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2289 KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2290 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2291 return false;
2292 }
2293
2294 return true;
2295}
2296
2297// Given \p Offset and load node \p N, check if \p Offset is a multiple of
2298// the load byte size. If it is, update \p Offset to the pre-scaled value and
2299// return true.
2300bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset,
2301 bool IsSigned) const {
2302 bool ScaleOffset = false;
2303 if (!Subtarget->hasScaleOffset() || !Offset)
2304 return false;
2305
2306 unsigned Size =
2307 (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
2308
2309 SDValue Off = Offset;
2310 if (SDValue Ext = matchExtFromI32orI32(Offset, IsSigned, CurDAG))
2311 Off = Ext;
2312
2313 if (isPowerOf2_32(Size) && Off.getOpcode() == ISD::SHL) {
2314 if (auto *C = dyn_cast<ConstantSDNode>(Off.getOperand(1)))
2315 ScaleOffset = C->getZExtValue() == Log2_32(Size);
2316 } else if (Offset.getOpcode() == ISD::MUL ||
2317 (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) ||
2318 Offset.getOpcode() == AMDGPUISD::MUL_U24 ||
2319 (Offset.isMachineOpcode() &&
2320 Offset.getMachineOpcode() ==
2321 (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO
2322 : AMDGPU::S_MUL_U64_U32_PSEUDO))) {
2323 if (auto *C = dyn_cast<ConstantSDNode>(Offset.getOperand(1)))
2324 ScaleOffset = C->getZExtValue() == Size;
2325 }
2326
2327 if (ScaleOffset)
2328 Offset = Off.getOperand(0);
2329
2330 return ScaleOffset;
2331}
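Pre-scaling relies on the identity that a byte offset built as index * size (or index << log2(size) for power-of-two sizes) can be replaced by the bare index once the instruction scales the offset by the access size itself. The matcher above only recognizes those explicit SHL/MUL node shapes; the arithmetic it exploits is simply:

#include <cstdint>

// If Offset == Index * Size, the pre-scaled offset is just Index (sketch).
static bool preScaleSketch(uint64_t Offset, uint64_t Size, uint64_t &Index) {
  if (Size == 0 || Offset % Size != 0)
    return false;
  Index = Offset / Size;
  return true;
}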
2332
2333// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
2334// not null) offset. If Imm32Only is true, match only 32-bit immediate
2335// offsets available on CI.
2336bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode,
2337 SDValue *SOffset, SDValue *Offset,
2338 bool Imm32Only, bool IsBuffer,
2339 bool HasSOffset, int64_t ImmOffset,
2340 bool *ScaleOffset) const {
2341 assert((!SOffset || !Offset) &&
2342 "Cannot match both soffset and offset at the same time!");
2343
2344 if (ScaleOffset) {
2345 assert(N && SOffset);
2346
2347 *ScaleOffset = SelectScaleOffset(N, ByteOffsetNode, false /* IsSigned */);
2348 }
2349
2350 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
2351 if (!C) {
2352 if (!SOffset)
2353 return false;
2354
2355 if (ByteOffsetNode.getValueType().isScalarInteger() &&
2356 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
2357 *SOffset = ByteOffsetNode;
2358 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2359 ImmOffset);
2360 }
2361 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2362 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
2363 *SOffset = ByteOffsetNode.getOperand(0);
2364 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2365 ImmOffset);
2366 }
2367 }
2368 return false;
2369 }
2370
2371 SDLoc SL(ByteOffsetNode);
2372
2373 // GFX9 and GFX10 have signed byte immediate offsets. The immediate
2374 // offset for S_BUFFER instructions is unsigned.
2375 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2376 std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
2377 *Subtarget, ByteOffset, IsBuffer, HasSOffset);
2378 if (EncodedOffset && Offset && !Imm32Only) {
2379 *Offset = CurDAG->getSignedTargetConstant(*EncodedOffset, SL, MVT::i32);
2380 return true;
2381 }
2382
2383 // SGPR and literal offsets are unsigned.
2384 if (ByteOffset < 0)
2385 return false;
2386
2387 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2388 if (EncodedOffset && Offset && Imm32Only) {
2389 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2390 return true;
2391 }
2392
2393 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
2394 return false;
2395
2396 if (SOffset) {
2397 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2398 *SOffset = SDValue(
2399 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2400 return true;
2401 }
2402
2403 return false;
2404}
2405
2406SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2407 if (Addr.getValueType() != MVT::i32)
2408 return Addr;
2409
2410 // Zero-extend a 32-bit address.
2411 SDLoc SL(Addr);
2412
2413 const MachineFunction &MF = CurDAG->getMachineFunction();
2414 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2415 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2416 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2417
2418 const SDValue Ops[] = {
2419 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2420 Addr,
2421 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2422 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2423 0),
2424 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2425 };
2426
2427 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2428 Ops), 0);
2429}
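In plain integer terms, the REG_SEQUENCE built above forms the 64-bit base from the per-function high bits in sub1 and the original 32-bit address in sub0; a sketch of the resulting value (not the selector's code):

#include <cstdint>

static uint64_t expand32BitAddressSketch(uint32_t Addr, uint32_t HighBits) {
  return (uint64_t(HighBits) << 32) | Addr; // sub1:sub0
}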
2430
2431// Match a base and an immediate (if Offset is not null) or an SGPR (if
2432// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2433// true, match only 32-bit immediate offsets available on CI.
2434bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr,
2435 SDValue &SBase, SDValue *SOffset,
2436 SDValue *Offset, bool Imm32Only,
2437 bool IsBuffer, bool HasSOffset,
2438 int64_t ImmOffset,
2439 bool *ScaleOffset) const {
2440 if (SOffset && Offset) {
2441 assert(!Imm32Only && !IsBuffer);
2442 SDValue B;
2443
2444 if (!SelectSMRDBaseOffset(N, Addr, B, nullptr, Offset, false, false, true))
2445 return false;
2446
2447 int64_t ImmOff = 0;
2448 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
2449 ImmOff = C->getSExtValue();
2450
2451 return SelectSMRDBaseOffset(N, B, SBase, SOffset, nullptr, false, false,
2452 true, ImmOff, ScaleOffset);
2453 }
2454
2455 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2456 // wraparound, because s_load instructions perform the addition in 64 bits.
2457 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2458 !Addr->getFlags().hasNoUnsignedWrap())
2459 return false;
2460
2461 SDValue N0, N1;
2462 // Extract the base and offset if possible.
2463 if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
2464 N0 = Addr.getOperand(0);
2465 N1 = Addr.getOperand(1);
2466 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2467 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2468 }
2469 if (!N0 || !N1)
2470 return false;
2471
2472 if (SelectSMRDOffset(N, N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2473 ImmOffset, ScaleOffset)) {
2474 SBase = N0;
2475 return true;
2476 }
2477 if (SelectSMRDOffset(N, N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2478 ImmOffset, ScaleOffset)) {
2479 SBase = N1;
2480 return true;
2481 }
2482 return false;
2483}
2484
2485bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase,
2486 SDValue *SOffset, SDValue *Offset,
2487 bool Imm32Only, bool *ScaleOffset) const {
2488 if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only,
2489 /* IsBuffer */ false, /* HasSOffset */ false,
2490 /* ImmOffset */ 0, ScaleOffset)) {
2491 SBase = Expand32BitAddress(SBase);
2492 return true;
2493 }
2494
2495 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2496 SBase = Expand32BitAddress(Addr);
2497 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2498 return true;
2499 }
2500
2501 return false;
2502}
2503
2504bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2505 SDValue &Offset) const {
2506 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2507 &Offset);
2508}
2509
2510bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2511 SDValue &Offset) const {
2513 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2514 &Offset, /* Imm32Only */ true);
2515}
2516
2517bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase,
2518 SDValue &SOffset, SDValue &CPol) const {
2519 bool ScaleOffset;
2520 if (!SelectSMRD(N, Addr, SBase, &SOffset, /* Offset */ nullptr,
2521 /* Imm32Only */ false, &ScaleOffset))
2522 return false;
2523
2524 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2525 SDLoc(N), MVT::i32);
2526 return true;
2527}
2528
2529bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr,
2530 SDValue &SBase, SDValue &SOffset,
2531 SDValue &Offset,
2532 SDValue &CPol) const {
2533 bool ScaleOffset;
2534 if (!SelectSMRD(N, Addr, SBase, &SOffset, &Offset, false, &ScaleOffset))
2535 return false;
2536
2537 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2538 SDLoc(N), MVT::i32);
2539 return true;
2540}
2541
2542bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2543 return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
2544 /* Imm32Only */ false, /* IsBuffer */ true);
2545}
2546
2547bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2548 SDValue &Offset) const {
2550 return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
2551 /* Imm32Only */ true, /* IsBuffer */ true);
2552}
2553
2554bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2555 SDValue &Offset) const {
2556 // Match the (soffset + offset) pair as a 32-bit register base and
2557 // an immediate offset.
2558 return N.getValueType() == MVT::i32 &&
2559 SelectSMRDBaseOffset(/* N */ nullptr, N, /* SBase */ SOffset,
2560 /* SOffset*/ nullptr, &Offset,
2561 /* Imm32Only */ false, /* IsBuffer */ true);
2562}
2563
2564bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2565 SDValue &Base,
2566 SDValue &Offset) const {
2567 SDLoc DL(Index);
2568
2569 if (CurDAG->isBaseWithConstantOffset(Index)) {
2570 SDValue N0 = Index.getOperand(0);
2571 SDValue N1 = Index.getOperand(1);
2572 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2573
2574 // (add n0, c0)
2575 // Don't peel off the offset (c0) if doing so could possibly lead
2576 // the base (n0) to be negative.
2577 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2578 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2579 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2580 Base = N0;
2581 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2582 return true;
2583 }
2584 }
2585
2586 if (isa<ConstantSDNode>(Index))
2587 return false;
2588
2589 Base = Index;
2590 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2591 return true;
2592}
2593
2594SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2595 SDValue Val, uint32_t Offset,
2596 uint32_t Width) {
2597 if (Val->isDivergent()) {
2598 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2599 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2600 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2601
2602 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2603 }
2604 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2605 // Transformation function: pack the offset and width of a BFE into
2606 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2607 // source, bits [5:0] contain the offset and bits [22:16] the width.
2608 uint32_t PackedVal = Offset | (Width << 16);
2609 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2610
2611 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2612}
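The packed scalar operand built above follows the layout described in the comment: offset in bits [5:0], width in bits [22:16]. A small reference of that encoding, with a compile-time check of one value:

#include <cstdint>

constexpr uint32_t packSBFEOperandSketch(uint32_t Offset, uint32_t Width) {
  return (Offset & 0x3f) | ((Width & 0x7f) << 16);
}
static_assert(packSBFEOperandSketch(16, 8) == 0x80010,
              "an 8-bit field starting at bit 16");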
2613
2614void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2615 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2616 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2617 // Predicate: 0 < b <= c < 32
2618
2619 const SDValue &Shl = N->getOperand(0);
2620 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2621 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2622
2623 if (B && C) {
2624 uint32_t BVal = B->getZExtValue();
2625 uint32_t CVal = C->getZExtValue();
2626
2627 if (0 < BVal && BVal <= CVal && CVal < 32) {
2628 bool Signed = N->getOpcode() == ISD::SRA;
2629 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2630 32 - CVal));
2631 return;
2632 }
2633 }
2634 SelectCode(N);
2635}
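A worked instance of the rewrite above: with b = 8 and c = 24, "((a << 8) srl 24)" keeps bits [23:16] of a, which is BFE_U32 a, (c - b) = 16, (32 - c) = 8. A reference model of the unsigned extract, useful only for checking the offset/width arithmetic:

#include <cstdint>

// Reference semantics of BFE_U32 dst, src, offset, width (for width < 32).
static uint32_t bfeU32ReferenceSketch(uint32_t Src, uint32_t Offset,
                                      uint32_t Width) {
  return (Src >> Offset) & ((1u << Width) - 1u);
}
// For any a: ((a << 8) >> 24) == bfeU32ReferenceSketch(a, 16, 8).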
2636
2637void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2638 switch (N->getOpcode()) {
2639 case ISD::AND:
2640 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2641 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2642 // Predicate: isMask(mask)
2643 const SDValue &Srl = N->getOperand(0);
2644 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2645 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2646
2647 if (Shift && Mask) {
2648 uint32_t ShiftVal = Shift->getZExtValue();
2649 uint32_t MaskVal = Mask->getZExtValue();
2650
2651 if (isMask_32(MaskVal)) {
2652 uint32_t WidthVal = llvm::popcount(MaskVal);
2653 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2654 WidthVal));
2655 return;
2656 }
2657 }
2658 }
2659 break;
2660 case ISD::SRL:
2661 if (N->getOperand(0).getOpcode() == ISD::AND) {
2662 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2663 // Predicate: isMask(mask >> b)
2664 const SDValue &And = N->getOperand(0);
2665 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2666 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2667
2668 if (Shift && Mask) {
2669 uint32_t ShiftVal = Shift->getZExtValue();
2670 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2671
2672 if (isMask_32(MaskVal)) {
2673 uint32_t WidthVal = llvm::popcount(MaskVal);
2674 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2675 WidthVal));
2676 return;
2677 }
2678 }
2679 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2680 SelectS_BFEFromShifts(N);
2681 return;
2682 }
2683 break;
2684 case ISD::SRA:
2685 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2686 SelectS_BFEFromShifts(N);
2687 return;
2688 }
2689 break;
2690
2691 case ISD::SIGN_EXTEND_INREG: {
2692 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2693 SDValue Src = N->getOperand(0);
2694 if (Src.getOpcode() != ISD::SRL)
2695 break;
2696
2697 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2698 if (!Amt)
2699 break;
2700
2701 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2702 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2703 Amt->getZExtValue(), Width));
2704 return;
2705 }
2706 }
2707
2708 SelectCode(N);
2709}
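Both mask-based patterns above reduce to the same BFE for a concrete mask: with mask = 0xff and shift b = 8, "(a srl 8) & 0xff" becomes BFE_U32 a, 8, popcount(0xff) = 8, and "((a & 0xff00) srl 8)" becomes BFE_U32 a, 8, popcount(0xff00 >> 8) = 8. Two compile-time checks of that width arithmetic:

#include <bit>

static_assert(std::popcount(0x000000ffu) == 8, "(a srl 8) & 0xff -> width 8");
static_assert(std::popcount(0x0000ff00u >> 8) == 8,
              "((a & 0xff00) srl 8) -> width 8");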
2710
2711bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2712 assert(N->getOpcode() == ISD::BRCOND);
2713 if (!N->hasOneUse())
2714 return false;
2715
2716 SDValue Cond = N->getOperand(1);
2717 if (Cond.getOpcode() == ISD::CopyToReg)
2718 Cond = Cond.getOperand(2);
2719
2720 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2721 return false;
2722
2723 MVT VT = Cond.getOperand(0).getSimpleValueType();
2724 if (VT == MVT::i32)
2725 return true;
2726
2727 if (VT == MVT::i64) {
2728 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2729 return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
2730 Subtarget->hasScalarCompareEq64();
2731 }
2732
2733 if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2734 return true;
2735
2736 return false;
2737}
2738
2739static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2740 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2741 // Special case for amdgcn.ballot:
2742 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2743 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2744 // =>
2745 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2746 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2747 // Cond becomes a i(WaveSize) full mask value.
2748 // Note that ballot doesn't use the SETEQ condition, but it's easy to support
2749 // it here for completeness, so in this case Negate is set true on return.
2750 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2751 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2752 isNullConstant(VCMP.getOperand(1))) {
2753
2754 auto Cond = VCMP.getOperand(0);
2755 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2756 Cond = Cond.getOperand(0);
2757
2758 if (isBoolSGPR(Cond)) {
2759 Negate = VCMP_CC == ISD::SETEQ;
2760 return Cond;
2761 }
2762 }
2763 return SDValue();
2764}
2765
2766void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2767 SDValue Cond = N->getOperand(1);
2768
2769 if (Cond.isUndef()) {
2770 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2771 N->getOperand(2), N->getOperand(0));
2772 return;
2773 }
2774
2775 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2776
2777 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2778 bool AndExec = !UseSCCBr;
2779 bool Negate = false;
2780
2781 if (Cond.getOpcode() == ISD::SETCC &&
2782 Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2783 SDValue VCMP = Cond->getOperand(0);
2784 auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
2785 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2786 isNullConstant(Cond->getOperand(1)) &&
2787 // We may encounter ballot.i64 in wave32 mode on -O0.
2788 VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
2789 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2790 // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2791 // BRCOND i1 %C, %BB
2792 // =>
2793 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2794 // VCC = COPY i(WaveSize) %VCMP
2795 // S_CBRANCH_VCCNZ/VCCZ %BB
2796 Negate = CC == ISD::SETEQ;
2797 bool NegatedBallot = false;
2798 if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
2799 Cond = BallotCond;
2800 UseSCCBr = !BallotCond->isDivergent();
2801 Negate = Negate ^ NegatedBallot;
2802 } else {
2803 // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2804 // selected as V_CMP, but this may change for uniform condition.
2805 Cond = VCMP;
2806 UseSCCBr = false;
2807 }
2808 }
2809 // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of
2810 // V_CMPs resulted from ballot or ballot has uniform condition and SCC is
2811 // used.
2812 AndExec = false;
2813 }
2814
2815 unsigned BrOp =
2816 UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2817 : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2818 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2819 SDLoc SL(N);
2820
2821 if (AndExec) {
2822 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2823 // analyzed what generates the vcc value, so we do not know whether vcc
2824 // bits for disabled lanes are 0. Thus we need to mask out bits for
2825 // disabled lanes.
2826 //
2827 // For the case that we select S_CBRANCH_SCC1 and it gets
2828 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2829 // SIInstrInfo::moveToVALU, which inserts the S_AND.
2830 //
2831 // We could add an analysis of what generates the vcc value here and omit
2832 // the S_AND when it is unnecessary. But it would be better to add a separate
2833 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2834 // catches both cases.
2835 Cond = SDValue(
2836 CurDAG->getMachineNode(
2837 Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
2838 MVT::i1,
2839 CurDAG->getRegister(Subtarget->isWave32() ? AMDGPU::EXEC_LO
2840 : AMDGPU::EXEC,
2841 MVT::i1),
2842 Cond),
2843 0);
2844 }
2845
2846 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2847 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2848 N->getOperand(2), // Basic Block
2849 VCC.getValue(0));
2850}
2851
2852void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2853 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2854 !N->isDivergent()) {
2855 SDValue Src = N->getOperand(0);
2856 if (Src.getValueType() == MVT::f16) {
2857 if (isExtractHiElt(Src, Src)) {
2858 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2859 {Src});
2860 return;
2861 }
2862 }
2863 }
2864
2865 SelectCode(N);
2866}
2867
2868void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2869 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2870 // be copied to an SGPR with readfirstlane.
2871 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2872 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2873
2874 SDValue Chain = N->getOperand(0);
2875 SDValue Ptr = N->getOperand(2);
2876 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2877 MachineMemOperand *MMO = M->getMemOperand();
2878 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2879
2880 SDValue Offset;
2881 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2882 SDValue PtrBase = Ptr.getOperand(0);
2883 SDValue PtrOffset = Ptr.getOperand(1);
2884
2885 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
2886 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2887 N = glueCopyToM0(N, PtrBase);
2888 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2889 }
2890 }
2891
2892 if (!Offset) {
2893 N = glueCopyToM0(N, Ptr);
2894 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2895 }
2896
2897 SDValue Ops[] = {
2898 Offset,
2899 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2900 Chain,
2901 N->getOperand(N->getNumOperands() - 1) // New glue
2902 };
2903
2904 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2905 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2906}
2907
2908// We need to handle this here because tablegen doesn't support matching
2909// instructions with multiple outputs.
2910void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
2911 unsigned Opc;
2912 switch (IntrID) {
2913 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2914 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2915 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2916 break;
2917 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2918 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2919 break;
2920 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2921 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2922 break;
2923 }
2924 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
2925 N->getOperand(5), N->getOperand(0)};
2926
2927 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2928 MachineMemOperand *MMO = M->getMemOperand();
2929 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2930 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2931}
2932
2933static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2934 switch (IntrID) {
2935 case Intrinsic::amdgcn_ds_gws_init:
2936 return AMDGPU::DS_GWS_INIT;
2937 case Intrinsic::amdgcn_ds_gws_barrier:
2938 return AMDGPU::DS_GWS_BARRIER;
2939 case Intrinsic::amdgcn_ds_gws_sema_v:
2940 return AMDGPU::DS_GWS_SEMA_V;
2941 case Intrinsic::amdgcn_ds_gws_sema_br:
2942 return AMDGPU::DS_GWS_SEMA_BR;
2943 case Intrinsic::amdgcn_ds_gws_sema_p:
2944 return AMDGPU::DS_GWS_SEMA_P;
2945 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2946 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2947 default:
2948 llvm_unreachable("not a gws intrinsic");
2949 }
2950}
2951
2952void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2953 if (!Subtarget->hasGWS() ||
2954 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2955 !Subtarget->hasGWSSemaReleaseAll())) {
2956 // Let this error.
2957 SelectCode(N);
2958 return;
2959 }
2960
2961 // Chain, intrinsic ID, vsrc, offset
2962 const bool HasVSrc = N->getNumOperands() == 4;
2963 assert(HasVSrc || N->getNumOperands() == 3);
2964
2965 SDLoc SL(N);
2966 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
2967 int ImmOffset = 0;
2968 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2969 MachineMemOperand *MMO = M->getMemOperand();
2970
2971 // Don't worry if the offset ends up in a VGPR. Only one lane will have an
2972 // effect, so SIFixSGPRCopies will validly insert readfirstlane.
2973
2974 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2975 // offset field) % 64. Some versions of the programming guide omit the m0
2976 // part, or claim it's from offset 0.
2977 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2978 // If we have a constant offset, try to use the 0 in m0 as the base.
2979 // TODO: Look into changing the default m0 initialization value. If the
2980 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2981 // the immediate offset.
2982 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2983 ImmOffset = ConstOffset->getZExtValue();
2984 } else {
2985 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2986 ImmOffset = BaseOffset.getConstantOperandVal(1);
2987 BaseOffset = BaseOffset.getOperand(0);
2988 }
2989
2990 // Prefer to do the shift in an SGPR since it should be possible to use m0
2991 // as the result directly. If it's already an SGPR, it will be eliminated
2992 // later.
2993 SDNode *SGPROffset
2994 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2995 BaseOffset);
2996 // Shift to offset in m0
2997 SDNode *M0Base
2998 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2999 SDValue(SGPROffset, 0),
3000 CurDAG->getTargetConstant(16, SL, MVT::i32));
3001 glueCopyToM0(N, SDValue(M0Base, 0));
3002 }
3003
3004 SDValue Chain = N->getOperand(0);
3005 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
3006
3007 const unsigned Opc = gwsIntrinToOpcode(IntrID);
3008 SmallVector<SDValue, 5> Ops;
3009 if (HasVSrc)
3010 Ops.push_back(N->getOperand(2));
3011 Ops.push_back(OffsetField);
3012 Ops.push_back(Chain);
3013
3014 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3015 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3016}
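The S_LSHL_B32 by 16 above places the variable part of the GWS resource id into M0[21:16], matching the (<isa opaque base> + M0[21:16] + offset field) % 64 formula quoted in the comment. A sketch of the effective m0 contribution (illustration only; bits outside [21:16] are ignored by the hardware):

#include <cstdint>

static uint32_t gwsM0FromOffsetSketch(uint32_t SGPROffset) {
  return (SGPROffset & 0x3f) << 16; // lands in M0[21:16]
}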
3017
3018void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
3019 if (Subtarget->getLDSBankCount() != 16) {
3020 // This is a single instruction with a pattern.
3021 SelectCode(N);
3022 return;
3023 }
3024
3025 SDLoc DL(N);
3026
3027 // This requires 2 instructions. It is possible to write a pattern to support
3028 // this, but the generated isel emitter doesn't correctly deal with multiple
3029 // output instructions using the same physical register input. The copy to m0
3030 // is incorrectly placed before the second instruction.
3031 //
3032 // TODO: Match source modifiers.
3033 //
3034 // def : Pat <
3035 // (int_amdgcn_interp_p1_f16
3036 // (VOP3Mods f32:$src0, i32:$src0_modifiers),
3037 // (i32 timm:$attrchan), (i32 timm:$attr),
3038 // (i1 timm:$high), M0),
3039 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
3040 // timm:$attrchan, 0,
3041 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
3042 // let Predicates = [has16BankLDS];
3043 // }
3044
3045 // 16 bank LDS
3046 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
3047 N->getOperand(5), SDValue());
3048
3049 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
3050
3051 SDNode *InterpMov =
3052 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
3053 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
3054 N->getOperand(3), // Attr
3055 N->getOperand(2), // Attrchan
3056 ToM0.getValue(1) // In glue
3057 });
3058
3059 SDNode *InterpP1LV =
3060 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
3061 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
3062 N->getOperand(1), // Src0
3063 N->getOperand(3), // Attr
3064 N->getOperand(2), // Attrchan
3065 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
3066 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
3067 N->getOperand(4), // high
3068 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
3069 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
3070 SDValue(InterpMov, 1)
3071 });
3072
3073 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
3074}
3075
3076void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
3077 unsigned IntrID = N->getConstantOperandVal(1);
3078 switch (IntrID) {
3079 case Intrinsic::amdgcn_ds_append:
3080 case Intrinsic::amdgcn_ds_consume: {
3081 if (N->getValueType(0) != MVT::i32)
3082 break;
3083 SelectDSAppendConsume(N, IntrID);
3084 return;
3085 }
3086 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
3087 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
3088 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
3089 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
3090 SelectDSBvhStackIntrinsic(N, IntrID);
3091 return;
3092 case Intrinsic::amdgcn_init_whole_wave:
3093 CurDAG->getMachineFunction()
3094 .getInfo<SIMachineFunctionInfo>()
3095 ->setInitWholeWave();
3096 break;
3097 }
3098
3099 SelectCode(N);
3100}
3101
3102void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
3103 unsigned IntrID = N->getConstantOperandVal(0);
3104 unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
3105 SDNode *ConvGlueNode = N->getGluedNode();
3106 if (ConvGlueNode) {
3107 // FIXME: Possibly iterate over multiple glue nodes?
3108 assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
3109 ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
3110 ConvGlueNode =
3111 CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
3112 MVT::Glue, SDValue(ConvGlueNode, 0));
3113 } else {
3114 ConvGlueNode = nullptr;
3115 }
3116 switch (IntrID) {
3117 case Intrinsic::amdgcn_wqm:
3118 Opcode = AMDGPU::WQM;
3119 break;
3120 case Intrinsic::amdgcn_softwqm:
3121 Opcode = AMDGPU::SOFT_WQM;
3122 break;
3123 case Intrinsic::amdgcn_wwm:
3124 case Intrinsic::amdgcn_strict_wwm:
3125 Opcode = AMDGPU::STRICT_WWM;
3126 break;
3127 case Intrinsic::amdgcn_strict_wqm:
3128 Opcode = AMDGPU::STRICT_WQM;
3129 break;
3130 case Intrinsic::amdgcn_interp_p1_f16:
3131 SelectInterpP1F16(N);
3132 return;
3133 case Intrinsic::amdgcn_permlane16_swap:
3134 case Intrinsic::amdgcn_permlane32_swap: {
3135 if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
3136 !Subtarget->hasPermlane16Swap()) ||
3137 (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3138 !Subtarget->hasPermlane32Swap())) {
3139 SelectCode(N); // Hit the default error
3140 return;
3141 }
3142
3143 Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3144 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3145 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3146
3147 SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end());
3148 if (ConvGlueNode)
3149 NewOps.push_back(SDValue(ConvGlueNode, 0));
3150
3151 bool FI = N->getConstantOperandVal(3);
3152 NewOps[2] = CurDAG->getTargetConstant(
3153 FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, SDLoc(N), MVT::i32);
3154
3155 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), NewOps);
3156 return;
3157 }
3158 default:
3159 SelectCode(N);
3160 break;
3161 }
3162
3163 if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
3164 SDValue Src = N->getOperand(1);
3165 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
3166 }
3167
3168 if (ConvGlueNode) {
3169 SmallVector<SDValue, 4> NewOps(N->ops());
3170 NewOps.push_back(SDValue(ConvGlueNode, 0));
3171 CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
3172 }
3173}
3174
3175void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
3176 unsigned IntrID = N->getConstantOperandVal(1);
3177 switch (IntrID) {
3178 case Intrinsic::amdgcn_ds_gws_init:
3179 case Intrinsic::amdgcn_ds_gws_barrier:
3180 case Intrinsic::amdgcn_ds_gws_sema_v:
3181 case Intrinsic::amdgcn_ds_gws_sema_br:
3182 case Intrinsic::amdgcn_ds_gws_sema_p:
3183 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3184 SelectDS_GWS(N, IntrID);
3185 return;
3186 default:
3187 break;
3188 }
3189
3190 SelectCode(N);
3191}
3192
3193void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
3194 SDValue Log2WaveSize =
3195 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
3196 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
3197 {N->getOperand(0), Log2WaveSize});
3198}
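The S_LSHR_B32 above converts a byte stack value into the "wave address" form, and SelectSTACKRESTORE below performs the inverse S_LSHL_B32; the two views differ by a factor of the wavefront size, so the shift amount is log2 of that size (for example 6 for wave64). A sketch of the pair of conversions, illustration only:

#include <cstdint>

static uint32_t toWaveAddressSketch(uint32_t ByteVal, unsigned Log2WaveSize) {
  return ByteVal >> Log2WaveSize; // S_LSHR_B32 in SelectWAVE_ADDRESS
}
static uint32_t toByteValueSketch(uint32_t WaveVal, unsigned Log2WaveSize) {
  return WaveVal << Log2WaveSize; // S_LSHL_B32 in SelectSTACKRESTORE
}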
3199
3200void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
3201 SDValue SrcVal = N->getOperand(1);
3202 if (SrcVal.getValueType() != MVT::i32) {
3203 SelectCode(N); // Emit default error
3204 return;
3205 }
3206
3207 SDValue CopyVal;
3208 Register SP = TLI->getStackPointerRegisterToSaveRestore();
3209 SDLoc SL(N);
3210
3211 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
3212 CopyVal = SrcVal.getOperand(0);
3213 } else {
3214 SDValue Log2WaveSize = CurDAG->getTargetConstant(
3215 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
3216
3217 if (N->isDivergent()) {
3218 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
3219 MVT::i32, SrcVal),
3220 0);
3221 }
3222
3223 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3224 {SrcVal, Log2WaveSize}),
3225 0);
3226 }
3227
3228 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
3229 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
3230}
3231
3232bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
3233 unsigned &Mods,
3234 bool IsCanonicalizing,
3235 bool AllowAbs) const {
3236 Mods = SISrcMods::NONE;
3237 Src = In;
3238
3239 if (Src.getOpcode() == ISD::FNEG) {
3240 Mods |= SISrcMods::NEG;
3241 Src = Src.getOperand(0);
3242 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
3243 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3244 // denormal mode, but we're implicitly canonicalizing in a source operand.
3245 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
3246 if (LHS && LHS->isZero()) {
3247 Mods |= SISrcMods::NEG;
3248 Src = Src.getOperand(1);
3249 }
3250 }
3251
3252 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
3253 Mods |= SISrcMods::ABS;
3254 Src = Src.getOperand(0);
3255 }
3256
3257 if (Mods != SISrcMods::NONE)
3258 return true;
3259
3260 // Convert various sign-bit masks on integers to src mods. Currently disabled
3261 // for 16-bit types as the codegen replaces the operand without adding a
3262 // srcmod. This is intentionally finding the cases where we are performing
3263 // float neg and abs on int types; the goal is not to obtain two's complement
3264 // neg or abs. Limit conversion to select operands via the non-canonicalizing
3265 // pattern.
3266 // TODO: Add 16-bit support.
3267 if (IsCanonicalizing)
3268 return true;
3269
3270 unsigned Opc = Src->getOpcode();
3271 EVT VT = Src.getValueType();
3272 if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
3273 (VT != MVT::i32 && VT != MVT::i64))
3274 return true;
3275
3276 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Src->getOperand(1));
3277 if (!CRHS)
3278 return true;
3279
3280 // Recognise (xor a, 0x80000000) as NEG SrcMod.
3281 // Recognise (and a, 0x7fffffff) as ABS SrcMod.
3282 // Recognise (or a, 0x80000000) as NEG+ABS SrcModifiers.
3283 if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
3284 Mods |= SISrcMods::NEG;
3285 Src = Src.getOperand(0);
3286 } else if (Opc == ISD::AND && AllowAbs &&
3287 CRHS->getAPIntValue().isMaxSignedValue()) {
3288 Mods |= SISrcMods::ABS;
3289 Src = Src.getOperand(0);
3290 } else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) {
3291 Mods |= SISrcMods::ABS | SISrcMods::NEG;
3292 Src = Src.getOperand(0);
3293 }
3294
3295 return true;
3296}
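The integer masks recognised above are the usual IEEE-754 sign-bit manipulations, which is why they can be folded into NEG/ABS source modifiers in the 32-bit case; two compile-time checks of the constants involved:

static_assert(0x80000000u == (1u << 31), "xor flips the sign bit -> NEG");
static_assert(0x7fffffffu == ~(1u << 31), "and clears the sign bit -> ABS");
// or-ing with 0x80000000 forces the sign bit on, i.e. NEG applied to ABS.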
3297
3298bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
3299 SDValue &SrcMods) const {
3300 unsigned Mods;
3301 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
3302 /*AllowAbs=*/true)) {
3303 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3304 return true;
3305 }
3306
3307 return false;
3308}
3309
3310bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
3311 SDValue In, SDValue &Src, SDValue &SrcMods) const {
3312 unsigned Mods;
3313 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
3314 /*AllowAbs=*/true)) {
3315 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3316 return true;
3317 }
3318
3319 return false;
3320}
3321
3322bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
3323 SDValue &SrcMods) const {
3324 unsigned Mods;
3325 if (SelectVOP3ModsImpl(In, Src, Mods,
3326 /*IsCanonicalizing=*/true,
3327 /*AllowAbs=*/false)) {
3328 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3329 return true;
3330 }
3331
3332 return false;
3333}
3334
3335bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
3336 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
3337 return false;
3338
3339 Src = In;
3340 return true;
3341}
3342
3343bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
3344 SDValue &SrcMods,
3345 bool OpSel) const {
3346 unsigned Mods;
3347 if (SelectVOP3ModsImpl(In, Src, Mods,
3348 /*IsCanonicalizing=*/true,
3349 /*AllowAbs=*/false)) {
3350 if (OpSel)
3351 Mods |= SISrcMods::OP_SEL_0;
3352 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3353 return true;
3354 }
3355
3356 return false;
3357}
3358
3359bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
3360 SDValue &SrcMods) const {
3361 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
3362}
3363
3364bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
3365 SDValue &SrcMods) const {
3366 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
3367}
3368
3369bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
3370 SDValue &SrcMods, SDValue &Clamp,
3371 SDValue &Omod) const {
3372 SDLoc DL(In);
3373 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3374 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3375
3376 return SelectVOP3Mods(In, Src, SrcMods);
3377}
3378
3379bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
3380 SDValue &SrcMods, SDValue &Clamp,
3381 SDValue &Omod) const {
3382 SDLoc DL(In);
3383 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3384 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3385
3386 return SelectVOP3BMods(In, Src, SrcMods);
3387}
3388
3389bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3390 SDValue &Clamp, SDValue &Omod) const {
3391 Src = In;
3392
3393 SDLoc DL(In);
3394 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3395 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3396
3397 return true;
3398}
3399
3400bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
3401 SDValue &SrcMods, bool IsDOT) const {
3402 unsigned Mods = SISrcMods::NONE;
3403 Src = In;
3404
3405 // TODO: Handle G_FSUB 0 as fneg
3406 if (Src.getOpcode() == ISD::FNEG) {
3407 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3408 Src = Src.getOperand(0);
3409 }
3410
3411 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
3412 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
3413 unsigned VecMods = Mods;
3414
3415 SDValue Lo = stripBitcast(Src.getOperand(0));
3416 SDValue Hi = stripBitcast(Src.getOperand(1));
3417
3418 if (Lo.getOpcode() == ISD::FNEG) {
3419 Lo = stripBitcast(Lo.getOperand(0));
3420 Mods ^= SISrcMods::NEG;
3421 }
3422
3423 if (Hi.getOpcode() == ISD::FNEG) {
3424 Hi = stripBitcast(Hi.getOperand(0));
3425 Mods ^= SISrcMods::NEG_HI;
3426 }
3427
3428 if (isExtractHiElt(Lo, Lo))
3429 Mods |= SISrcMods::OP_SEL_0;
3430
3431 if (isExtractHiElt(Hi, Hi))
3432 Mods |= SISrcMods::OP_SEL_1;
3433
3434 unsigned VecSize = Src.getValueSizeInBits();
3435 Lo = stripExtractLoElt(Lo);
3436 Hi = stripExtractLoElt(Hi);
3437
3438 if (Lo.getValueSizeInBits() > VecSize) {
3439 Lo = CurDAG->getTargetExtractSubreg(
3440 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3441 MVT::getIntegerVT(VecSize), Lo);
3442 }
3443
3444 if (Hi.getValueSizeInBits() > VecSize) {
3445 Hi = CurDAG->getTargetExtractSubreg(
3446 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3447 MVT::getIntegerVT(VecSize), Hi);
3448 }
3449
3450 assert(Lo.getValueSizeInBits() <= VecSize &&
3451 Hi.getValueSizeInBits() <= VecSize);
3452
3453 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3454 // Really a scalar input. Just select from the low half of the register to
3455 // avoid packing.
3456
3457 if (VecSize == Lo.getValueSizeInBits()) {
3458 Src = Lo;
3459 } else if (VecSize == 32) {
3460 Src = createVOP3PSrc32FromLo16(Lo, Src, CurDAG, Subtarget);
3461 } else {
3462 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
3463
3464 SDLoc SL(In);
3465 SDValue Undef = SDValue(
3466 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3467 Lo.getValueType()), 0);
3468 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3469 : AMDGPU::SReg_64RegClassID;
3470 const SDValue Ops[] = {
3471 CurDAG->getTargetConstant(RC, SL, MVT::i32),
3472 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3473 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3474
3475 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3476 Src.getValueType(), Ops), 0);
3477 }
3478 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3479 return true;
3480 }
3481
3482 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3483 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3484 .bitcastToAPInt().getZExtValue();
3485 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3486 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3487 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3488 return true;
3489 }
3490 }
3491
3492 Mods = VecMods;
3493 } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
3494 Src.getNumOperands() == 2) {
3495
3496 // TODO: We should repeat the build_vector source check above for the
3497 // vector_shuffle for negates and casts of individual elements.
3498
3499 auto *SVN = cast<ShuffleVectorSDNode>(Src);
3500 ArrayRef<int> Mask = SVN->getMask();
3501
3502 if (Mask[0] < 2 && Mask[1] < 2) {
3503 // src1 should be undef.
3504 SDValue ShuffleSrc = SVN->getOperand(0);
3505
3506 if (ShuffleSrc.getOpcode() == ISD::FNEG) {
3507 ShuffleSrc = ShuffleSrc.getOperand(0);
3508 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3509 }
3510
3511 if (Mask[0] == 1)
3512 Mods |= SISrcMods::OP_SEL_0;
3513 if (Mask[1] == 1)
3514 Mods |= SISrcMods::OP_SEL_1;
3515
3516 Src = ShuffleSrc;
3517 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3518 return true;
3519 }
3520 }
3521
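// Default for packed operands: op_sel_hi (OP_SEL_1) selects the high half of
// the source for the high half of the result.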
3522 // Packed instructions do not have abs modifiers.
3523 Mods |= SISrcMods::OP_SEL_1;
3524
3525 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3526 return true;
3527}
3528
3529bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3530 SDValue &SrcMods) const {
3531 return SelectVOP3PMods(In, Src, SrcMods, true);
3532}
3533
3534bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3535 SDValue &Src) const {
3536 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3537 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3538
3539 unsigned Mods = SISrcMods::OP_SEL_1;
3540 unsigned SrcVal = C->getZExtValue();
3541 if (SrcVal == 1)
3542 Mods |= SISrcMods::OP_SEL_0;
3543
3544 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3545 return true;
3546}
3547
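// Build a REG_SEQUENCE machine node packing 2/4/8 32-bit elements into a
// 64/128/256-bit register tuple.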
3548static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3549 llvm::SelectionDAG *CurDAG,
3550 const SDLoc &DL) {
3551 unsigned DstRegClass;
3552 EVT DstTy;
3553 switch (Elts.size()) {
3554 case 8:
3555 DstRegClass = AMDGPU::VReg_256RegClassID;
3556 DstTy = MVT::v8i32;
3557 break;
3558 case 4:
3559 DstRegClass = AMDGPU::VReg_128RegClassID;
3560 DstTy = MVT::v4i32;
3561 break;
3562 case 2:
3563 DstRegClass = AMDGPU::VReg_64RegClassID;
3564 DstTy = MVT::v2i32;
3565 break;
3566 default:
3567 llvm_unreachable("unhandled Reg sequence size");
3568 }
3569
3570 SmallVector<SDValue, 8 + 1> Ops;
3571 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3572 for (unsigned i = 0; i < Elts.size(); ++i) {
3573 Ops.push_back(Elts[i]);
3574 Ops.push_back(CurDAG->getTargetConstant(
3575 SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
3576 }
3577 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3578}
3579
3580static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3581 llvm::SelectionDAG *CurDAG,
3582 const SDLoc &DL) {
3583 SmallVector<SDValue, 8> PackedElts;
3584 assert("unhandled Reg sequence size" &&
3585 (Elts.size() == 8 || Elts.size() == 16));
3586
3587 // Pack pairs of 16-bit elements into 32-bit registers. If both elements are
3588 // unpacked from the same 32-bit source, use it; otherwise pack them with v_perm.
3589 for (unsigned i = 0; i < Elts.size(); i += 2) {
3590 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3591 SDValue HiSrc;
3592 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3593 PackedElts.push_back(HiSrc);
3594 } else {
3595 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3596 MachineSDNode *Packed =
3597 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3598 {Elts[i + 1], Elts[i], PackLoLo});
3599 PackedElts.push_back(SDValue(Packed, 0));
3600 }
3601 }
3602
3603 return buildRegSequence32(PackedElts, CurDAG, DL);
3604}
3605
3606static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3607 llvm::SelectionDAG *CurDAG,
3608 const SDLoc &DL, unsigned ElementSize) {
3609 if (ElementSize == 16)
3610 return buildRegSequence16(Elts, CurDAG, DL);
3611 if (ElementSize == 32)
3612 return buildRegSequence32(Elts, CurDAG, DL);
3613 llvm_unreachable("Unhandled element size");
3614}
3615
3616static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3617 SmallVectorImpl<SDValue> &Elts, SDValue &Src,
3618 llvm::SelectionDAG *CurDAG, const SDLoc &DL,
3619 unsigned ElementSize) {
3620 if (ModOpcode == ISD::FNEG) {
3621 Mods |= SISrcMods::NEG;
3622 // Check if all elements also have abs modifier
3623 SmallVector<SDValue, 8> NegAbsElts;
3624 for (auto El : Elts) {
3625 if (El.getOpcode() != ISD::FABS)
3626 break;
3627 NegAbsElts.push_back(El->getOperand(0));
3628 }
3629 if (Elts.size() != NegAbsElts.size()) {
3630 // Neg
3631 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3632 } else {
3633 // Neg and Abs
3634 Mods |= SISrcMods::NEG_HI;
3635 Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
3636 }
3637 } else {
3638 assert(ModOpcode == ISD::FABS);
3639 // Abs
3640 Mods |= SISrcMods::NEG_HI;
3641 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3642 }
3643}
3644
3645// Check all f16 elements for modifiers while looking through b32 and v2b16
3646// build vectors; stop if an element does not satisfy ModifierCheck.
3647static void
3648checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
3649 std::function<bool(SDValue)> ModifierCheck) {
3650 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3651 if (auto *F16Pair =
3652 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3653 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3654 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3655 if (!ModifierCheck(ElF16))
3656 break;
3657 }
3658 }
3659 }
3660}
3661
3662bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3663 SDValue &SrcMods) const {
3664 Src = In;
3665 unsigned Mods = SISrcMods::OP_SEL_1;
3666
3667 // mods are on f16 elements
3668 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3669 SmallVector<SDValue, 8> EltsF16;
3670
3671 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3672 if (Element.getOpcode() != ISD::FNEG)
3673 return false;
3674 EltsF16.push_back(Element.getOperand(0));
3675 return true;
3676 });
3677
3678 // All elements have neg modifier
3679 if (BV->getNumOperands() * 2 == EltsF16.size()) {
3680 Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
3681 Mods |= SISrcMods::NEG;
3682 Mods |= SISrcMods::NEG_HI;
3683 }
3684 }
3685
3686 // mods are on v2f16 elements
3687 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3688 SmallVector<SDValue, 8> EltsV2F16;
3689 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3690 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3691 // Only a neg modifier is matched here; stop at the first element without it.
3692 if (ElV2f16.getOpcode() != ISD::FNEG)
3693 break;
3694 EltsV2F16.push_back(ElV2f16.getOperand(0));
3695 }
3696
3697 // All pairs of elements have neg modifier
3698 if (BV->getNumOperands() == EltsV2F16.size()) {
3699 Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
3700 Mods |= SISrcMods::NEG;
3701 Mods |= SISrcMods::NEG_HI;
3702 }
3703 }
3704
3705 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3706 return true;
3707}
3708
3709bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3710 SDValue &SrcMods) const {
3711 Src = In;
3712 unsigned Mods = SISrcMods::OP_SEL_1;
3713 unsigned ModOpcode;
3714
3715 // mods are on f16 elements
3716 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3717 SmallVector<SDValue, 8> EltsF16;
3718 checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3719 // Based on first element decide which mod we match, neg or abs
3720 if (EltsF16.empty())
3721 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3722 if (ElF16.getOpcode() != ModOpcode)
3723 return false;
3724 EltsF16.push_back(ElF16.getOperand(0));
3725 return true;
3726 });
3727
3728 // All elements have ModOpcode modifier
3729 if (BV->getNumOperands() * 2 == EltsF16.size())
3730 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
3731 16);
3732 }
3733
3734 // mods are on v2f16 elements
3735 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3736 SmallVector<SDValue, 8> EltsV2F16;
3737
3738 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3739 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3740 // Based on first element decide which mod we match, neg or abs
3741 if (EltsV2F16.empty())
3742 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3743 if (ElV2f16->getOpcode() != ModOpcode)
3744 break;
3745 EltsV2F16.push_back(ElV2f16->getOperand(0));
3746 }
3747
3748 // All elements have ModOpcode modifier
3749 if (BV->getNumOperands() == EltsV2F16.size())
3750 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
3751 32);
3752 }
3753
3754 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3755 return true;
3756}
3757
3758bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3759 SDValue &SrcMods) const {
3760 Src = In;
3761 unsigned Mods = SISrcMods::OP_SEL_1;
3762 SmallVector<SDValue, 8> EltsF32;
3763
3764 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3765 assert(BV->getNumOperands() > 0);
3766 // Based on first element decide which mod we match, neg or abs
3767 SDValue ElF32 = stripBitcast(BV->getOperand(0));
3768 unsigned ModOpcode =
3769 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3770 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3771 SDValue ElF32 = stripBitcast(BV->getOperand(i));
3772 if (ElF32.getOpcode() != ModOpcode)
3773 break;
3774 EltsF32.push_back(ElF32.getOperand(0));
3775 }
3776
3777 // All elements had ModOpcode modifier
3778 if (BV->getNumOperands() == EltsF32.size())
3779 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
3780 32);
3781 }
3782
3783 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3784 return true;
3785}
3786
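// Select an inline-immediate source for WMMA: either a 32-bit splat of an
// inlinable constant, or a 16-bit (f16/bf16/i16) splat whose raw bits form an
// inline constant.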
3787bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
3788 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
3789 BitVector UndefElements;
3790 if (SDValue Splat = BV->getSplatValue(&UndefElements))
3791 if (isInlineImmediate(Splat.getNode())) {
3792 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
3793 unsigned Imm = C->getAPIntValue().getSExtValue();
3794 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3795 return true;
3796 }
3797 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
3798 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
3799 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3800 return true;
3801 }
3802 llvm_unreachable("unhandled Constant node");
3803 }
3804 }
3805
3806 // 16 bit splat
3807 SDValue SplatSrc32 = stripBitcast(In);
3808 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
3809 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
3810 SDValue SplatSrc16 = stripBitcast(Splat32);
3811 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
3812 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
3813 const SIInstrInfo *TII = Subtarget->getInstrInfo();
3814 std::optional<APInt> RawValue;
3815 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
3816 RawValue = C->getValueAPF().bitcastToAPInt();
3817 else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
3818 RawValue = C->getAPIntValue();
3819
3820 if (RawValue.has_value()) {
3821 EVT VT = In.getValueType().getScalarType();
3822 if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
3823 APFloat FloatVal(VT.getSimpleVT() == MVT::f16
3824 ? APFloat::IEEEhalf()
3825 : APFloat::BFloat(),
3826 RawValue.value());
3827 if (TII->isInlineConstant(FloatVal)) {
3828 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3829 MVT::i16);
3830 return true;
3831 }
3832 } else if (VT.getSimpleVT() == MVT::i16) {
3833 if (TII->isInlineConstant(RawValue.value())) {
3834 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3835 MVT::i16);
3836 return true;
3837 }
3838 } else
3839 llvm_unreachable("unknown 16-bit type");
3840 }
3841 }
3842 }
3843
3844 return false;
3845}
3846
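// Match an index_key for 8-bit SWMMAC indices: an srl of a 32-bit value by a
// multiple of 8 selects that byte as the key and the unshifted value as Src.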
3847bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
3848 SDValue &IndexKey) const {
3849 unsigned Key = 0;
3850 Src = In;
3851
3852 if (In.getOpcode() == ISD::SRL) {
3853 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3854 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3855 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3856 ShiftAmt->getZExtValue() % 8 == 0) {
3857 Key = ShiftAmt->getZExtValue() / 8;
3858 Src = ShiftSrc;
3859 }
3860 }
3861
3862 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3863 return true;
3864}
3865
3866bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
3867 SDValue &IndexKey) const {
3868 unsigned Key = 0;
3869 Src = In;
3870
3871 if (In.getOpcode() == ISD::SRL) {
3872 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3873 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3874 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3875 ShiftAmt->getZExtValue() == 16) {
3876 Key = 1;
3877 Src = ShiftSrc;
3878 }
3879 }
3880
3881 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3882 return true;
3883}
3884
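// Match an index_key for 32-bit SWMMAC indices: look through a zero/any-extend
// or a (bitcast (build_vector x, 0)) to an i32 extracted from element 1 of a
// 64-bit vector, which selects key 1.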
3885bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src,
3886 SDValue &IndexKey) const {
3887 unsigned Key = 0;
3888 Src = In;
3889
3890 SDValue InI32;
3891
3892 if (In.getOpcode() == ISD::ANY_EXTEND || In.getOpcode() == ISD::ZERO_EXTEND) {
3893 const SDValue &ExtendSrc = In.getOperand(0);
3894 if (ExtendSrc.getValueSizeInBits() == 32)
3895 InI32 = ExtendSrc;
3896 } else if (In->getOpcode() == ISD::BITCAST) {
3897 const SDValue &CastSrc = In.getOperand(0);
3898 if (CastSrc.getOpcode() == ISD::BUILD_VECTOR &&
3899 CastSrc.getOperand(0).getValueSizeInBits() == 32) {
3900 ConstantSDNode *Zero = dyn_cast<ConstantSDNode>(CastSrc.getOperand(1));
3901 if (Zero && Zero->getZExtValue() == 0)
3902 InI32 = CastSrc.getOperand(0);
3903 }
3904 }
3905
3906 if (InI32 && InI32.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
3907 const SDValue &ExtractVecEltSrc = InI32.getOperand(0);
3908 ConstantSDNode *EltIdx = dyn_cast<ConstantSDNode>(InI32.getOperand(1));
3909 if (ExtractVecEltSrc.getValueSizeInBits() == 64 && EltIdx &&
3910 EltIdx->getZExtValue() == 1) {
3911 Key = 1;
3912 Src = ExtractVecEltSrc;
3913 }
3914 }
3915
3916 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3917 return true;
3918}
3919
3920bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
3921 SDValue &SrcMods) const {
3922 Src = In;
3923 // FIXME: Handle op_sel
3924 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
3925 return true;
3926}
3927
3928bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
3929 SDValue &SrcMods) const {
3930 // FIXME: Handle op_sel
3931 return SelectVOP3Mods(In, Src, SrcMods);
3932}
3933
3934// Match lowered fpext from bf16 to f32. This is a bit operation extending
3935// a 16-bit value with 16 bits of zeroes at the LSB:
3936//
3937// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
3938// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
3939// 3. (f32 (bitcast (shl i32:val, 16))) -> IsExtractHigh = false
3940static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
3941 if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
3942 return SDValue();
3943 Op = Op.getOperand(0);
3944
3945 IsExtractHigh = false;
3946 if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) {
3947 auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0));
3948 if (!Low16 || !Low16->isZero())
3949 return SDValue();
3950 Op = stripBitcast(Op.getOperand(1));
3951 if (Op.getValueType() != MVT::bf16)
3952 return SDValue();
3953 return Op;
3954 }
3955
3956 if (Op.getValueType() != MVT::i32)
3957 return SDValue();
3958
3959 if (Op.getOpcode() == ISD::AND) {
3960 if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3961 if (Mask->getZExtValue() == 0xffff0000) {
3962 IsExtractHigh = true;
3963 return Op.getOperand(0);
3964 }
3965 }
3966 return SDValue();
3967 }
3968
3969 if (Op.getOpcode() == ISD::SHL) {
3970 if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3971 if (Amt->getZExtValue() == 16)
3972 return Op.getOperand(0);
3973 }
3974 }
3975
3976 return SDValue();
3977}
3978
3979// The return value is not whether the match is possible (which it always is),
3980// but whether or not a conversion is really used.
3981bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
3982 unsigned &Mods,
3983 MVT VT) const {
3984 Mods = 0;
3985 SelectVOP3ModsImpl(In, Src, Mods);
3986
3987 bool IsExtractHigh = false;
3988 if (Src.getOpcode() == ISD::FP_EXTEND) {
3989 Src = Src.getOperand(0);
3990 } else if (VT == MVT::bf16) {
3991 SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh);
3992 if (!B16)
3993 return false;
3994 Src = B16;
3995 } else
3996 return false;
3997
3998 if (Src.getValueType() != VT &&
3999 (VT != MVT::bf16 || Src.getValueType() != MVT::i32))
4000 return false;
4001
4002 Src = stripBitcast(Src);
4003
4004 // Be careful about folding modifiers if we already have an abs. fneg is
4005 // applied last, so we don't want to apply an earlier fneg.
4006 if ((Mods & SISrcMods::ABS) == 0) {
4007 unsigned ModsTmp;
4008 SelectVOP3ModsImpl(Src, Src, ModsTmp);
4009
4010 if ((ModsTmp & SISrcMods::NEG) != 0)
4011 Mods ^= SISrcMods::NEG;
4012
4013 if ((ModsTmp & SISrcMods::ABS) != 0)
4014 Mods |= SISrcMods::ABS;
4015 }
4016
4017 // op_sel/op_sel_hi decide the source type and source.
4018 // If the source's op_sel_hi is set, it indicates to do a conversion from
4019 // fp16. If the source's op_sel is set, it picks the high half of the source
4020 // register.
4021
4022 Mods |= SISrcMods::OP_SEL_1;
4023 if (IsExtractHigh ||
4024 (Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) {
4025 Mods |= SISrcMods::OP_SEL_0;
4026
4027 // TODO: Should we try to look for neg/abs here?
4028 }
4029
4030 // Prevent unnecessary subreg COPY to VGPR_16
4031 if (Src.getOpcode() == ISD::TRUNCATE &&
4032 Src.getOperand(0).getValueType() == MVT::i32) {
4033 Src = Src.getOperand(0);
4034 }
4035 return true;
4036}
4037
4038bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
4039 SDValue &SrcMods) const {
4040 unsigned Mods = 0;
4041 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16))
4042 return false;
4043 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4044 return true;
4045}
4046
4047bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
4048 SDValue &SrcMods) const {
4049 unsigned Mods = 0;
4050 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16);
4051 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4052 return true;
4053}
4054
4055bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
4056 SDValue &SrcMods) const {
4057 unsigned Mods = 0;
4058 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16))
4059 return false;
4060 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4061 return true;
4062}
4063
4064bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
4065 SDValue &SrcMods) const {
4066 unsigned Mods = 0;
4067 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16);
4068 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4069 return true;
4070}
4071
4072// Match a BITOP3 operation and return the number of matched instructions plus
4073// the truth table.
4074static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
4075 SmallVectorImpl<SDValue> &Src) {
4076 unsigned NumOpcodes = 0;
4077 uint8_t LHSBits, RHSBits;
4078
4079 auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
4080 // Define truth table given Src0, Src1, Src2 bits permutations:
4081 // 0 0 0
4082 // 0 0 1
4083 // 0 1 0
4084 // 0 1 1
4085 // 1 0 0
4086 // 1 0 1
4087 // 1 1 0
4088 // 1 1 1
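// Each SrcBits entry is one operand's column of the table above read as an
// 8-bit mask over the rows: Src0 = 0xf0, Src1 = 0xcc, Src2 = 0xaa.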
4089 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
4090
4091 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
4092 if (C->isAllOnes()) {
4093 Bits = 0xff;
4094 return true;
4095 }
4096 if (C->isZero()) {
4097 Bits = 0;
4098 return true;
4099 }
4100 }
4101
4102 for (unsigned I = 0; I < Src.size(); ++I) {
4103 // Try to find existing reused operand
4104 if (Src[I] == Op) {
4105 Bits = SrcBits[I];
4106 return true;
4107 }
4108 // Try to replace parent operator
4109 if (Src[I] == In) {
4110 Bits = SrcBits[I];
4111 Src[I] = Op;
4112 return true;
4113 }
4114 }
4115
4116 if (Src.size() == 3) {
4117 // No room left for operands. Try one last time, there can be a 'not' of
4118 // one of our source operands. In this case we can compute the bits
4119 // without growing Src vector.
4120 if (Op.getOpcode() == ISD::XOR) {
4121 if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4122 if (C->isAllOnes()) {
4123 SDValue LHS = Op.getOperand(0);
4124 for (unsigned I = 0; I < Src.size(); ++I) {
4125 if (Src[I] == LHS) {
4126 Bits = ~SrcBits[I];
4127 return true;
4128 }
4129 }
4130 }
4131 }
4132 }
4133
4134 return false;
4135 }
4136
4137 Bits = SrcBits[Src.size()];
4138 Src.push_back(Op);
4139 return true;
4140 };
4141
4142 switch (In.getOpcode()) {
4143 case ISD::AND:
4144 case ISD::OR:
4145 case ISD::XOR: {
4146 SDValue LHS = In.getOperand(0);
4147 SDValue RHS = In.getOperand(1);
4148
4149 SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
4150 if (!getOperandBits(LHS, LHSBits) ||
4151 !getOperandBits(RHS, RHSBits)) {
4152 Src = Backup;
4153 return std::make_pair(0, 0);
4154 }
4155
4156 // Recursion is naturally limited by the size of the operand vector.
4157 auto Op = BitOp3_Op(LHS, Src);
4158 if (Op.first) {
4159 NumOpcodes += Op.first;
4160 LHSBits = Op.second;
4161 }
4162
4163 Op = BitOp3_Op(RHS, Src);
4164 if (Op.first) {
4165 NumOpcodes += Op.first;
4166 RHSBits = Op.second;
4167 }
4168 break;
4169 }
4170 default:
4171 return std::make_pair(0, 0);
4172 }
4173
4174 uint8_t TTbl;
4175 switch (In.getOpcode()) {
4176 case ISD::AND:
4177 TTbl = LHSBits & RHSBits;
4178 break;
4179 case ISD::OR:
4180 TTbl = LHSBits | RHSBits;
4181 break;
4182 case ISD::XOR:
4183 TTbl = LHSBits ^ RHSBits;
4184 break;
4185 default:
4186 break;
4187 }
4188
4189 return std::make_pair(NumOpcodes + 1, TTbl);
4190}
4191
4192bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
4193 SDValue &Src2, SDValue &Tbl) const {
4194 SmallVector<SDValue, 3> Src;
4195 uint8_t TTbl;
4196 unsigned NumOpcodes;
4197
4198 std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
4199
4200 // The Src.empty() case can happen if every operand is an all-zeros or
4201 // all-ones constant. Normally this is optimized out before reaching here.
4202 if (NumOpcodes < 2 || Src.empty())
4203 return false;
4204
4205 // For a uniform case the threshold should be higher to account for moves
4206 // between VGPRs and SGPRs. It needs one operand in a VGPR; the other two can
4207 // be in SGPRs, with a readfirstlane afterwards.
4208 if (NumOpcodes < 4 && !In->isDivergent())
4209 return false;
4210
4211 if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
4212 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4213 // asm more readable. This cannot be modeled with AddedComplexity because
4214 // the selector does not know how many operations we matched.
4215 if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
4216 (In.getOperand(0).getOpcode() == In.getOpcode() ||
4217 In.getOperand(1).getOpcode() == In.getOpcode()))
4218 return false;
4219
4220 if (In.getOpcode() == ISD::OR &&
4221 (In.getOperand(0).getOpcode() == ISD::AND ||
4222 In.getOperand(1).getOpcode() == ISD::AND))
4223 return false;
4224 }
4225
4226 // The last operand can be ignored, turning a ternary operation into a
4227 // binary one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can
4228 // replace 'c' with 'a' here without changing the answer. In some
4229 // pathological cases it should even be possible to get an operation with a
4230 // single operand if the optimizer did not catch it.
4231 while (Src.size() < 3)
4232 Src.push_back(Src[0]);
4233
4234 Src0 = Src[0];
4235 Src1 = Src[1];
4236 Src2 = Src[2];
4237
4238 Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
4239 return true;
4240}
4241
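// Return a value whose high 16 bits hold In: constants are shifted into the
// high half and existing high-half extracts are looked through; otherwise
// return a null SDValue.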
4242SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
4243 if (In.isUndef())
4244 return CurDAG->getUNDEF(MVT::i32);
4245
4246 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
4247 SDLoc SL(In);
4248 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
4249 }
4250
4251 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
4252 SDLoc SL(In);
4253 return CurDAG->getConstant(
4254 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
4255 }
4256
4257 SDValue Src;
4258 if (isExtractHiElt(In, Src))
4259 return Src;
4260
4261 return SDValue();
4262}
4263
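// Return true if the immediate is better materialized in a VGPR, i.e. some
// use (even after trying to commute commutable users) only accepts VGPR
// operands. The scan is capped at 10 uses to bound compile time.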
4264bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
4265 assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());
4266
4267 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
4268 const SIInstrInfo *SII = Subtarget->getInstrInfo();
4269
4270 unsigned Limit = 0;
4271 bool AllUsesAcceptSReg = true;
4272 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
4273 Limit < 10 && U != E; ++U, ++Limit) {
4274 const TargetRegisterClass *RC =
4275 getOperandRegClass(U->getUser(), U->getOperandNo());
4276
4277 // If the register class is unknown, it could be an unknown
4278 // register class that needs to be an SGPR, e.g. an inline asm
4279 // constraint.
4280 if (!RC || SIRI->isSGPRClass(RC))
4281 return false;
4282
4283 if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
4284 AllUsesAcceptSReg = false;
4285 SDNode *User = U->getUser();
4286 if (User->isMachineOpcode()) {
4287 unsigned Opc = User->getMachineOpcode();
4288 const MCInstrDesc &Desc = SII->get(Opc);
4289 if (Desc.isCommutable()) {
4290 unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
4291 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
4292 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
4293 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
4294 const TargetRegisterClass *CommutedRC =
4295 getOperandRegClass(U->getUser(), CommutedOpNo);
4296 if (CommutedRC == &AMDGPU::VS_32RegClass ||
4297 CommutedRC == &AMDGPU::VS_64RegClass)
4298 AllUsesAcceptSReg = true;
4299 }
4300 }
4301 }
4302 // If "AllUsesAcceptSReg == false" so far we haven't succeeded
4303 // commuting current user. This means have at least one use
4304 // that strictly require VGPR. Thus, we will not attempt to commute
4305 // other user instructions.
4306 if (!AllUsesAcceptSReg)
4307 break;
4308 }
4309 }
4310 return !AllUsesAcceptSReg && (Limit < 10);
4311}
4312
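// A load is uniform if its memory operand is known uniform, it is sufficiently
// aligned, and it reads constant address space or, when scalarizing globals,
// simple non-clobbered global memory.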
4313bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
4314 const auto *Ld = cast<LoadSDNode>(N);
4315
4316 const MachineMemOperand *MMO = Ld->getMemOperand();
4317 if (N->isDivergent() && !AMDGPU::isUniformMMO(MMO))
4318 return false;
4319
4320 return MMO->getSize().hasValue() &&
4321 Ld->getAlign() >=
4322 Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
4323 uint64_t(4))) &&
4324 ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4325 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
4326 (Subtarget->getScalarizeGlobalBehavior() &&
4327 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
4328 Ld->isSimple() &&
4329 static_cast<const SITargetLowering *>(getTargetLowering())
4330 ->isMemOpHasNoClobberedMemOperand(N)));
4331}
4332
4333void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
4334 const AMDGPUTargetLowering& Lowering =
4335 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
4336 bool IsModified = false;
4337 do {
4338 IsModified = false;
4339
4340 // Go over all selected nodes and try to fold them a bit more
4341 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
4342 while (Position != CurDAG->allnodes_end()) {
4343 SDNode *Node = &*Position++;
4344 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
4345 if (!MachineNode)
4346 continue;
4347
4348 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
4349 if (ResNode != Node) {
4350 if (ResNode)
4351 ReplaceUses(Node, ResNode);
4352 IsModified = true;
4353 }
4354 }
4355 CurDAG->RemoveDeadNodes();
4356 } while (IsModified);
4357}
4358
4359AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
4360 CodeGenOptLevel OptLevel)
4361 : SelectionDAGISelLegacy(
4362 ID, std::make_unique<AMDGPUDAGToDAGISel>(TM, OptLevel)) {}
4363