1//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Defines an instruction selector for the AMDGPU target.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUISelDAGToDAG.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUSubtarget.h"
18#include "AMDGPUTargetMachine.h"
21#include "R600RegisterInfo.h"
22#include "SIISelLowering.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
32
33#ifdef EXPENSIVE_CHECKS
 34#include "llvm/Analysis/LoopInfo.h"
 35#include "llvm/IR/Dominators.h"
36#endif
37
38#define DEBUG_TYPE "amdgpu-isel"
39
40using namespace llvm;
41
42//===----------------------------------------------------------------------===//
43// Instruction Selector Implementation
44//===----------------------------------------------------------------------===//
45
46namespace {
47static SDValue stripBitcast(SDValue Val) {
48 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
49}
50
51// Figure out if this is really an extract of the high 16-bits of a dword.
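// Two shapes are recognized: (extract_vector_elt v2i16:v, 1) and
// (trunc (srl x, 16)); in both cases Out is set to the dword-sized source.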
52static bool isExtractHiElt(SDValue In, SDValue &Out) {
53 In = stripBitcast(In);
54
55 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
56 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
57 if (!Idx->isOne())
58 return false;
59 Out = In.getOperand(0);
60 return true;
61 }
62 }
63
64 if (In.getOpcode() != ISD::TRUNCATE)
65 return false;
66
67 SDValue Srl = In.getOperand(0);
68 if (Srl.getOpcode() == ISD::SRL) {
69 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
70 if (ShiftAmt->getZExtValue() == 16) {
71 Out = stripBitcast(Srl.getOperand(0));
72 return true;
73 }
74 }
75 }
76
77 return false;
78}
79
80static SDValue createVOP3PSrc32FromLo16(SDValue Lo, SDValue Src,
81 llvm::SelectionDAG *CurDAG,
82 const GCNSubtarget *Subtarget) {
83 if (!Subtarget->useRealTrue16Insts()) {
84 return Lo;
85 }
86
87 SDValue NewSrc;
88 SDLoc SL(Lo);
89
90 if (Lo->isDivergent()) {
91 SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
92 SL, Lo.getValueType()),
93 0);
94 const SDValue Ops[] = {
95 CurDAG->getTargetConstant(AMDGPU::VGPR_32RegClassID, SL, MVT::i32), Lo,
96 CurDAG->getTargetConstant(AMDGPU::lo16, SL, MVT::i16), Undef,
97 CurDAG->getTargetConstant(AMDGPU::hi16, SL, MVT::i16)};
98
99 NewSrc = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
100 Src.getValueType(), Ops),
101 0);
102 } else {
 103 // The S_MOV is needed since Lo could still be a VGPR16.
 104 // With S_MOV, isel inserts a "sgpr32 = copy vgpr16" and we rely on
 105 // the fixvgpr2sgprcopy pass to legalize it.
106 NewSrc = SDValue(
107 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, Src.getValueType(), Lo),
108 0);
109 }
110
111 return NewSrc;
112}
113
114// Look through operations that obscure just looking at the low 16-bits of the
115// same register.
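// For example, (extract_vector_elt v2i16:v, 0) is rewritten to v, and
// (trunc (bitcast i32:x)) is rewritten to x; callers only read the low 16 bits.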
116static SDValue stripExtractLoElt(SDValue In) {
117 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
118 SDValue Idx = In.getOperand(1);
119 if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
120 return In.getOperand(0);
121 }
122
123 if (In.getOpcode() == ISD::TRUNCATE) {
124 SDValue Src = In.getOperand(0);
125 if (Src.getValueType().getSizeInBits() == 32)
126 return stripBitcast(Src);
127 }
128
129 return In;
130}
131
132} // end anonymous namespace
133
135 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
136 false)
138INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
140#ifdef EXPENSIVE_CHECKS
143#endif
145 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
146 false)
147
 148/// This pass converts a legalized DAG into an AMDGPU-specific
149// DAG, ready for instruction scheduling.
 150FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
 151 CodeGenOptLevel OptLevel) {
152 return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
153}
154
158
 159bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
 160 Subtarget = &MF.getSubtarget<GCNSubtarget>();
 161 Subtarget->checkSubtargetFeatures(MF.getFunction());
 162 Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
 163 return SelectionDAGISel::runOnMachineFunction(MF);
 164}
165
166bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
167 // XXX - only need to list legal operations.
168 switch (Opc) {
169 case ISD::FADD:
170 case ISD::FSUB:
171 case ISD::FMUL:
172 case ISD::FDIV:
173 case ISD::FREM:
 174 case ISD::FCANONICALIZE:
 175 case ISD::UINT_TO_FP:
176 case ISD::SINT_TO_FP:
177 case ISD::FABS:
178 // Fabs is lowered to a bit operation, but it's an and which will clear the
179 // high bits anyway.
180 case ISD::FSQRT:
181 case ISD::FSIN:
182 case ISD::FCOS:
183 case ISD::FPOWI:
184 case ISD::FPOW:
185 case ISD::FLOG:
186 case ISD::FLOG2:
187 case ISD::FLOG10:
188 case ISD::FEXP:
189 case ISD::FEXP2:
190 case ISD::FCEIL:
191 case ISD::FTRUNC:
192 case ISD::FRINT:
193 case ISD::FNEARBYINT:
194 case ISD::FROUNDEVEN:
195 case ISD::FROUND:
196 case ISD::FFLOOR:
197 case ISD::FMINNUM:
198 case ISD::FMAXNUM:
199 case ISD::FLDEXP:
200 case AMDGPUISD::FRACT:
201 case AMDGPUISD::CLAMP:
204 case AMDGPUISD::FMIN3:
205 case AMDGPUISD::FMAX3:
206 case AMDGPUISD::FMED3:
208 case AMDGPUISD::RCP:
209 case AMDGPUISD::RSQ:
211 // On gfx10, all 16-bit instructions preserve the high bits.
212 return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
213 case ISD::FP_ROUND:
214 // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
215 // high bits on gfx9.
216 // TODO: If we had the source node we could see if the source was fma/mad
 217 return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
 218 case ISD::FMA:
 219 case ISD::FMAD:
 220 case AMDGPUISD::DIV_FIXUP:
 221 return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
 222 default:
223 // fcopysign, select and others may be lowered to 32-bit bit operations
224 // which don't zero the high bits.
225 return false;
226 }
227}
228
 229bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
 230#ifdef EXPENSIVE_CHECKS
 231 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
 232 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
 233 for (auto &L : LI->getLoopsInPreorder()) {
 234 assert(L->isLCSSAForm(DT));
 235 }
 236#endif
 237 return SelectionDAGISel::runOnMachineFunction(MF);
 238}
239
249
 250bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
 251 assert(Subtarget->d16PreservesUnusedBits());
252 MVT VT = N->getValueType(0).getSimpleVT();
253 if (VT != MVT::v2i16 && VT != MVT::v2f16)
254 return false;
255
256 SDValue Lo = N->getOperand(0);
257 SDValue Hi = N->getOperand(1);
258
259 LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
260
261 // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
262 // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
263 // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
264
265 // Need to check for possible indirect dependencies on the other half of the
266 // vector to avoid introducing a cycle.
267 if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
268 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
269
270 SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
271 SDValue Ops[] = {
272 LdHi->getChain(), LdHi->getBasePtr(), TiedIn
273 };
274
275 unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
276 if (LdHi->getMemoryVT() == MVT::i8) {
277 LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
 278 AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
 279 } else {
280 assert(LdHi->getMemoryVT() == MVT::i16);
281 }
282
283 SDValue NewLoadHi =
284 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
285 Ops, LdHi->getMemoryVT(),
286 LdHi->getMemOperand());
287
288 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
289 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
290 return true;
291 }
292
293 // build_vector (load ptr), hi -> load_d16_lo ptr, hi
294 // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
295 // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
296 LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
297 if (LdLo && Lo.hasOneUse()) {
298 SDValue TiedIn = getHi16Elt(Hi);
299 if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
300 return false;
301
302 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
303 unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
304 if (LdLo->getMemoryVT() == MVT::i8) {
305 LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
 306 AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
 307 } else {
308 assert(LdLo->getMemoryVT() == MVT::i16);
309 }
310
311 TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
312
313 SDValue Ops[] = {
314 LdLo->getChain(), LdLo->getBasePtr(), TiedIn
315 };
316
317 SDValue NewLoadLo =
318 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
319 Ops, LdLo->getMemoryVT(),
320 LdLo->getMemOperand());
321
322 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
323 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
324 return true;
325 }
326
327 return false;
328}
329
 330void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
 331 if (!Subtarget->d16PreservesUnusedBits())
332 return;
333
334 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
335
336 bool MadeChange = false;
337 while (Position != CurDAG->allnodes_begin()) {
338 SDNode *N = &*--Position;
339 if (N->use_empty())
340 continue;
341
342 switch (N->getOpcode()) {
 343 case ISD::BUILD_VECTOR:
 344 // TODO: Match load d16 from shl (extload:i16), 16
345 MadeChange |= matchLoadD16FromBuildVector(N);
346 break;
347 default:
348 break;
349 }
350 }
351
352 if (MadeChange) {
353 CurDAG->RemoveDeadNodes();
354 LLVM_DEBUG(dbgs() << "After PreProcess:\n";
355 CurDAG->dump(););
356 }
357}
358
359bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
360 if (N->isUndef())
361 return true;
362
363 const SIInstrInfo *TII = Subtarget->getInstrInfo();
 364 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
 365 return TII->isInlineConstant(C->getAPIntValue());
366
 367 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
 368 return TII->isInlineConstant(C->getValueAPF());
369
370 return false;
371}
372
373/// Determine the register class for \p OpNo
374/// \returns The register class of the virtual register that will be used for
 375/// the given operand number \p OpNo or NULL if the register class cannot be
376/// determined.
377const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
378 unsigned OpNo) const {
379 if (!N->isMachineOpcode()) {
380 if (N->getOpcode() == ISD::CopyToReg) {
381 Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
382 if (Reg.isVirtual()) {
 383 MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
 384 return MRI.getRegClass(Reg);
385 }
386
387 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
388 return TRI->getPhysRegBaseClass(Reg);
389 }
390
391 return nullptr;
392 }
393
394 switch (N->getMachineOpcode()) {
395 default: {
396 const MCInstrDesc &Desc =
397 Subtarget->getInstrInfo()->get(N->getMachineOpcode());
398 unsigned OpIdx = Desc.getNumDefs() + OpNo;
399 if (OpIdx >= Desc.getNumOperands())
400 return nullptr;
401 int RegClass = Desc.operands()[OpIdx].RegClass;
402 if (RegClass == -1)
403 return nullptr;
404
405 return Subtarget->getRegisterInfo()->getRegClass(RegClass);
406 }
407 case AMDGPU::REG_SEQUENCE: {
408 unsigned RCID = N->getConstantOperandVal(0);
409 const TargetRegisterClass *SuperRC =
410 Subtarget->getRegisterInfo()->getRegClass(RCID);
411
412 SDValue SubRegOp = N->getOperand(OpNo + 1);
413 unsigned SubRegIdx = SubRegOp->getAsZExtVal();
414 return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
415 SubRegIdx);
416 }
417 }
418}
419
420SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
421 SDValue Glue) const {
 422 SmallVector<SDValue, 8> Ops;
 423 Ops.push_back(NewChain); // Replace the chain.
424 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
425 Ops.push_back(N->getOperand(i));
426
427 Ops.push_back(Glue);
428 return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
429}
430
431SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
432 const SITargetLowering& Lowering =
433 *static_cast<const SITargetLowering*>(getTargetLowering());
434
435 assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
436
437 SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
438 return glueCopyToOp(N, M0, M0.getValue(1));
439}
440
441SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
442 unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
443 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
444 if (Subtarget->ldsRequiresM0Init())
445 return glueCopyToM0(
446 N, CurDAG->getSignedTargetConstant(-1, SDLoc(N), MVT::i32));
447 } else if (AS == AMDGPUAS::REGION_ADDRESS) {
448 MachineFunction &MF = CurDAG->getMachineFunction();
449 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
450 return
451 glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
452 }
453 return N;
454}
455
456MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
457 EVT VT) const {
458 SDNode *Lo = CurDAG->getMachineNode(
459 AMDGPU::S_MOV_B32, DL, MVT::i32,
460 CurDAG->getTargetConstant(Lo_32(Imm), DL, MVT::i32));
461 SDNode *Hi = CurDAG->getMachineNode(
462 AMDGPU::S_MOV_B32, DL, MVT::i32,
463 CurDAG->getTargetConstant(Hi_32(Imm), DL, MVT::i32));
464 const SDValue Ops[] = {
465 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
466 SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
467 SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
468
469 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
470}
471
472void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
473 EVT VT = N->getValueType(0);
474 unsigned NumVectorElts = VT.getVectorNumElements();
475 EVT EltVT = VT.getVectorElementType();
476 SDLoc DL(N);
477 SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
478
479 if (NumVectorElts == 1) {
480 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
481 RegClass);
482 return;
483 }
484
485 bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
486 if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 &&
487 CurDAG->isConstantValueOfAnyType(SDValue(N, 0))) {
488 uint64_t C = 0;
489 bool AllConst = true;
490 unsigned EltSize = EltVT.getSizeInBits();
491 for (unsigned I = 0; I < NumVectorElts; ++I) {
492 SDValue Op = N->getOperand(I);
493 if (Op.isUndef()) {
494 AllConst = false;
495 break;
496 }
497 uint64_t Val;
 498 if (const ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Op)) {
 499 Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
500 } else
501 Val = cast<ConstantSDNode>(Op)->getZExtValue();
502 C |= Val << (EltSize * I);
503 }
504 if (AllConst) {
505 SDValue CV = CurDAG->getTargetConstant(C, DL, MVT::i64);
506 MachineSDNode *Copy =
507 CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, VT, CV);
508 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, VT, SDValue(Copy, 0),
509 RegClass);
510 return;
511 }
512 }
513
514 assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
515 "supported yet");
516 // 32 = Max Num Vector Elements
517 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
518 // 1 = Vector Register Class
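 // For example, a 4-element build_vector becomes
 //   REG_SEQUENCE RC, op0, sub0, op1, sub1, op2, sub2, op3, sub3 (9 operands).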
519 SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
520
521 RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
522 bool IsRegSeq = true;
523 unsigned NOps = N->getNumOperands();
524 for (unsigned i = 0; i < NOps; i++) {
525 // XXX: Why is this here?
526 if (isa<RegisterSDNode>(N->getOperand(i))) {
527 IsRegSeq = false;
528 break;
529 }
530 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
 531 : R600RegisterInfo::getSubRegFromChannel(i);
 532 RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
533 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
534 }
535 if (NOps != NumVectorElts) {
536 // Fill in the missing undef elements if this was a scalar_to_vector.
537 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
538 MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
539 DL, EltVT);
540 for (unsigned i = NOps; i < NumVectorElts; ++i) {
541 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
 542 : R600RegisterInfo::getSubRegFromChannel(i);
 543 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
544 RegSeqArgs[1 + (2 * i) + 1] =
545 CurDAG->getTargetConstant(Sub, DL, MVT::i32);
546 }
547 }
548
549 if (!IsRegSeq)
550 SelectCode(N);
551 CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
552}
553
 554void AMDGPUDAGToDAGISel::SelectVectorShuffle(SDNode *N) {
 555 EVT VT = N->getValueType(0);
556 EVT EltVT = VT.getVectorElementType();
557
558 // TODO: Handle 16-bit element vectors with even aligned masks.
559 if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
560 VT.getVectorNumElements() != 2) {
561 SelectCode(N);
562 return;
563 }
564
565 auto *SVN = cast<ShuffleVectorSDNode>(N);
566
567 SDValue Src0 = SVN->getOperand(0);
568 SDValue Src1 = SVN->getOperand(1);
569 ArrayRef<int> Mask = SVN->getMask();
570 SDLoc DL(N);
571
572 assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
573 Mask[0] < 4 && Mask[1] < 4);
574
575 SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
576 SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
577 unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
578 unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
579
580 if (Mask[0] < 0) {
581 Src0SubReg = Src1SubReg;
582 MachineSDNode *ImpDef =
583 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
584 VSrc0 = SDValue(ImpDef, 0);
585 }
586
587 if (Mask[1] < 0) {
588 Src1SubReg = Src0SubReg;
589 MachineSDNode *ImpDef =
590 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
591 VSrc1 = SDValue(ImpDef, 0);
592 }
593
594 // SGPR case needs to lower to copies.
595 //
596 // Also use subregister extract when we can directly blend the registers with
597 // a simple subregister copy.
598 //
599 // TODO: Maybe we should fold this out earlier
600 if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
601 Src1SubReg == AMDGPU::sub0) {
602 // The low element of the result always comes from src0.
603 // The high element of the result always comes from src1.
604 // op_sel selects the high half of src0.
605 // op_sel_hi selects the high half of src1.
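 // For example, shuffle mask <1, 2> takes the high half of the first source
 // and the low half of the second, so op_sel is set for src0 and left clear
 // for src1.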
606
607 unsigned Src0OpSel =
608 Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
609 unsigned Src1OpSel =
610 Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
611
612 // Enable op_sel_hi to avoid printing it. This should have no effect on the
613 // result.
614 Src0OpSel |= SISrcMods::OP_SEL_1;
615 Src1OpSel |= SISrcMods::OP_SEL_1;
616
617 SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32);
618 SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32);
619 SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32);
620
621 CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(),
622 {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
623 ZeroMods, // clamp
624 ZeroMods, // op_sel
625 ZeroMods, // op_sel_hi
626 ZeroMods, // neg_lo
627 ZeroMods}); // neg_hi
628 return;
629 }
630
631 SDValue ResultElt0 =
632 CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0);
633 SDValue ResultElt1 =
634 CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1);
635
636 const SDValue Ops[] = {
637 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
638 ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
639 ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
640 CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops);
641}
642
 643void AMDGPUDAGToDAGISel::Select(SDNode *N) {
 644 unsigned int Opc = N->getOpcode();
645 if (N->isMachineOpcode()) {
646 N->setNodeId(-1);
647 return; // Already selected.
648 }
649
650 // isa<MemSDNode> almost works but is slightly too permissive for some DS
651 // intrinsics.
652 if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
653 N = glueCopyToM0LDSInit(N);
654 SelectCode(N);
655 return;
656 }
657
658 switch (Opc) {
659 default:
660 break;
661 // We are selecting i64 ADD here instead of custom lower it during
662 // DAG legalization, so we can fold some i64 ADDs used for address
663 // calculation into the LOAD and STORE instructions.
664 case ISD::ADDC:
665 case ISD::ADDE:
666 case ISD::SUBC:
667 case ISD::SUBE: {
668 if (N->getValueType(0) != MVT::i64)
669 break;
670
671 SelectADD_SUB_I64(N);
672 return;
673 }
674 case ISD::UADDO_CARRY:
675 case ISD::USUBO_CARRY:
676 if (N->getValueType(0) != MVT::i32)
677 break;
678
679 SelectAddcSubb(N);
680 return;
681 case ISD::UADDO:
682 case ISD::USUBO: {
683 SelectUADDO_USUBO(N);
684 return;
685 }
 686 case AMDGPUISD::FMUL_W_CHAIN: {
 687 SelectFMUL_W_CHAIN(N);
688 return;
689 }
 690 case AMDGPUISD::FMA_W_CHAIN: {
 691 SelectFMA_W_CHAIN(N);
692 return;
693 }
694
 695 case ISD::SCALAR_TO_VECTOR:
 696 case ISD::BUILD_VECTOR: {
697 EVT VT = N->getValueType(0);
698 unsigned NumVectorElts = VT.getVectorNumElements();
699 if (VT.getScalarSizeInBits() == 16) {
700 if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
701 if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
702 ReplaceNode(N, Packed);
703 return;
704 }
705 }
706
707 break;
708 }
709
710 assert(VT.getVectorElementType().bitsEq(MVT::i32));
711 unsigned RegClassID =
712 SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
713 SelectBuildVector(N, RegClassID);
714 return;
715 }
 716 case ISD::VECTOR_SHUFFLE:
 717 SelectVectorShuffle(N);
 718 return;
719 case ISD::BUILD_PAIR: {
720 SDValue RC, SubReg0, SubReg1;
721 SDLoc DL(N);
722 if (N->getValueType(0) == MVT::i128) {
723 RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
724 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
725 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
726 } else if (N->getValueType(0) == MVT::i64) {
727 RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
728 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
729 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
730 } else {
731 llvm_unreachable("Unhandled value type for BUILD_PAIR");
732 }
733 const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
734 N->getOperand(1), SubReg1 };
735 ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
736 N->getValueType(0), Ops));
737 return;
738 }
739
740 case ISD::Constant:
741 case ISD::ConstantFP: {
742 if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N) ||
743 Subtarget->has64BitLiterals())
744 break;
745
746 uint64_t Imm;
 747 if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
 748 Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
749 if (AMDGPU::isValid32BitLiteral(Imm, true))
750 break;
751 } else {
 752 ConstantSDNode *C = cast<ConstantSDNode>(N);
 753 Imm = C->getZExtValue();
754 if (AMDGPU::isValid32BitLiteral(Imm, false))
755 break;
756 }
757
758 SDLoc DL(N);
759 ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
760 return;
761 }
 762 case AMDGPUISD::BFE_I32:
 763 case AMDGPUISD::BFE_U32: {
764 // There is a scalar version available, but unlike the vector version which
 765 // has separate operands for the offset and width, the scalar version packs
766 // the width and offset into a single operand. Try to move to the scalar
767 // version if the offsets are constant, so that we can try to keep extended
768 // loads of kernel arguments in SGPRs.
769
770 // TODO: Technically we could try to pattern match scalar bitshifts of
771 // dynamic values, but it's probably not useful.
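 // Note that the scalar form packs both fields into one immediate: getBFE32
 // (below) builds (width << 16) | offset, e.g. offset 8 and width 5 -> 0x50008.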
 772 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
 773 if (!Offset)
774 break;
775
776 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
777 if (!Width)
778 break;
779
780 bool Signed = Opc == AMDGPUISD::BFE_I32;
781
782 uint32_t OffsetVal = Offset->getZExtValue();
783 uint32_t WidthVal = Width->getZExtValue();
784
785 ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
786 WidthVal));
787 return;
788 }
 789 case AMDGPUISD::DIV_SCALE: {
 790 SelectDIV_SCALE(N);
791 return;
792 }
 793 case AMDGPUISD::MAD_I64_I32:
 794 case AMDGPUISD::MAD_U64_U32: {
 795 SelectMAD_64_32(N);
796 return;
797 }
798 case ISD::SMUL_LOHI:
799 case ISD::UMUL_LOHI:
800 return SelectMUL_LOHI(N);
801 case ISD::CopyToReg: {
 802 const SITargetLowering &Lowering =
 803 *static_cast<const SITargetLowering*>(getTargetLowering());
804 N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
805 break;
806 }
807 case ISD::AND:
808 case ISD::SRL:
809 case ISD::SRA:
 810 case ISD::SIGN_EXTEND_INREG:
 811 if (N->getValueType(0) != MVT::i32)
812 break;
813
814 SelectS_BFE(N);
815 return;
816 case ISD::BRCOND:
817 SelectBRCOND(N);
818 return;
819 case ISD::FP_EXTEND:
820 SelectFP_EXTEND(N);
821 return;
 822 case AMDGPUISD::CVT_PKRTZ_F16_F32:
 823 case AMDGPUISD::CVT_PKNORM_I16_F32:
 824 case AMDGPUISD::CVT_PKNORM_U16_F32:
 825 case AMDGPUISD::CVT_PK_U16_U32:
 826 case AMDGPUISD::CVT_PK_I16_I32: {
 827 // Hack around using a legal type if f16 is illegal.
828 if (N->getValueType(0) == MVT::i32) {
829 MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
830 N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
831 { N->getOperand(0), N->getOperand(1) });
832 SelectCode(N);
833 return;
834 }
835
836 break;
837 }
 838 case ISD::INTRINSIC_W_CHAIN: {
 839 SelectINTRINSIC_W_CHAIN(N);
840 return;
841 }
 842 case ISD::INTRINSIC_WO_CHAIN: {
 843 SelectINTRINSIC_WO_CHAIN(N);
844 return;
845 }
846 case ISD::INTRINSIC_VOID: {
847 SelectINTRINSIC_VOID(N);
848 return;
849 }
 850 case AMDGPUISD::WAVE_ADDRESS: {
 851 SelectWAVE_ADDRESS(N);
852 return;
853 }
854 case ISD::STACKRESTORE: {
855 SelectSTACKRESTORE(N);
856 return;
857 }
858 }
859
860 SelectCode(N);
861}
862
863bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
864 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
865 const Instruction *Term = BB->getTerminator();
866 return Term->getMetadata("amdgpu.uniform") ||
867 Term->getMetadata("structurizecfg.uniform");
868}
869
870bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
871 unsigned ShAmtBits) const {
872 assert(N->getOpcode() == ISD::AND);
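 // The mask is unneeded when it cannot change the low ShAmtBits bits the
 // shift actually consumes; e.g. for a 32-bit shift (ShAmtBits == 5),
 // (and x, 31) or any mask with at least five trailing ones can be dropped.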
873
874 const APInt &RHS = N->getConstantOperandAPInt(1);
875 if (RHS.countr_one() >= ShAmtBits)
876 return true;
877
878 const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
879 return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
880}
881
 882static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
 883 SDValue &N0, SDValue &N1) {
 884 if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
 885 Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
 886 // As we split 64-bit `or` earlier, it's a complicated pattern to match, i.e.
887 // (i64 (bitcast (v2i32 (build_vector
888 // (or (extract_vector_elt V, 0), OFFSET),
889 // (extract_vector_elt V, 1)))))
890 SDValue Lo = Addr.getOperand(0).getOperand(0);
891 if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
892 SDValue BaseLo = Lo.getOperand(0);
893 SDValue BaseHi = Addr.getOperand(0).getOperand(1);
894 // Check that split base (Lo and Hi) are extracted from the same one.
895 if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
 896 BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
 897 BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
898 // Lo is statically extracted from index 0.
899 isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
900 BaseLo.getConstantOperandVal(1) == 0 &&
 901 // Hi is statically extracted from index 1.
902 isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
903 BaseHi.getConstantOperandVal(1) == 1) {
904 N0 = BaseLo.getOperand(0).getOperand(0);
905 N1 = Lo.getOperand(1);
906 return true;
907 }
908 }
909 }
910 return false;
911}
912
913bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
914 SDValue &RHS) const {
915 if (CurDAG->isBaseWithConstantOffset(Addr)) {
916 LHS = Addr.getOperand(0);
917 RHS = Addr.getOperand(1);
918 return true;
919 }
920
 922 if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
 923 return true;
924 }
925
926 return false;
927}
928
930 return "AMDGPU DAG->DAG Pattern Instruction Selection";
931}
932
936
940#ifdef EXPENSIVE_CHECKS
 941 auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
 942 .getManager();
943 auto &F = MF.getFunction();
944 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
945 LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
946 for (auto &L : LI.getLoopsInPreorder())
947 assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
948#endif
949 return SelectionDAGISelPass::run(MF, MFAM);
950}
951
952//===----------------------------------------------------------------------===//
953// Complex Patterns
954//===----------------------------------------------------------------------===//
955
956bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
957 SDValue &Offset) {
958 return false;
959}
960
961bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
962 SDValue &Offset) {
 963 ConstantSDNode *C;
 964 SDLoc DL(Addr);
965
966 if ((C = dyn_cast<ConstantSDNode>(Addr))) {
967 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
968 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
969 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
 970 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
 971 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
972 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
973 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
 974 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
 975 Base = Addr.getOperand(0);
976 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
977 } else {
978 Base = Addr;
979 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
980 }
981
982 return true;
983}
984
985SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
986 const SDLoc &DL) const {
987 SDNode *Mov = CurDAG->getMachineNode(
988 AMDGPU::S_MOV_B32, DL, MVT::i32,
989 CurDAG->getTargetConstant(Val, DL, MVT::i32));
990 return SDValue(Mov, 0);
991}
992
993// FIXME: Should only handle uaddo_carry/usubo_carry
994void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
995 SDLoc DL(N);
996 SDValue LHS = N->getOperand(0);
997 SDValue RHS = N->getOperand(1);
998
999 unsigned Opcode = N->getOpcode();
1000 bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
1001 bool ProduceCarry =
1002 ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
1003 bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;
1004
1005 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1006 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1007
1008 SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1009 DL, MVT::i32, LHS, Sub0);
1010 SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1011 DL, MVT::i32, LHS, Sub1);
1012
1013 SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1014 DL, MVT::i32, RHS, Sub0);
1015 SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1016 DL, MVT::i32, RHS, Sub1);
1017
1018 SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
1019
1020 static const unsigned OpcMap[2][2][2] = {
1021 {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
1022 {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
1023 {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
1024 {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
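 // OpcMap is indexed as [consumes carry-in][is divergent][is add]; e.g. a
 // uniform i64 add selects S_ADD_U32 for the low half and S_ADDC_U32 for the
 // high half.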
1025
1026 unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
1027 unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
1028
1029 SDNode *AddLo;
1030 if (!ConsumeCarry) {
1031 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
1032 AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
1033 } else {
1034 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
1035 AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
1036 }
1037 SDValue AddHiArgs[] = {
1038 SDValue(Hi0, 0),
1039 SDValue(Hi1, 0),
1040 SDValue(AddLo, 1)
1041 };
1042 SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);
1043
1044 SDValue RegSequenceArgs[] = {
1045 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
1046 SDValue(AddLo,0),
1047 Sub0,
1048 SDValue(AddHi,0),
1049 Sub1,
1050 };
1051 SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1052 MVT::i64, RegSequenceArgs);
1053
1054 if (ProduceCarry) {
1055 // Replace the carry-use
1056 ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
1057 }
1058
1059 // Replace the remaining uses.
1060 ReplaceNode(N, RegSequence);
1061}
1062
1063void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
1064 SDValue LHS = N->getOperand(0);
1065 SDValue RHS = N->getOperand(1);
1066 SDValue CI = N->getOperand(2);
1067
1068 if (N->isDivergent()) {
1069 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
1070 : AMDGPU::V_SUBB_U32_e64;
1071 CurDAG->SelectNodeTo(
1072 N, Opc, N->getVTList(),
1073 {LHS, RHS, CI,
1074 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1075 } else {
1076 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
1077 : AMDGPU::S_SUB_CO_PSEUDO;
1078 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
1079 }
1080}
1081
1082void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
 1083 // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
1084 // carry out despite the _i32 name. These were renamed in VI to _U32.
1085 // FIXME: We should probably rename the opcodes here.
1086 bool IsAdd = N->getOpcode() == ISD::UADDO;
1087 bool IsVALU = N->isDivergent();
1088
1089 for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
1090 ++UI)
1091 if (UI.getUse().getResNo() == 1) {
1092 if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
1093 (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
1094 IsVALU = true;
1095 break;
1096 }
1097 }
1098
1099 if (IsVALU) {
1100 unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
1101
1102 CurDAG->SelectNodeTo(
1103 N, Opc, N->getVTList(),
1104 {N->getOperand(0), N->getOperand(1),
1105 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1106 } else {
1107 unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
1108 : AMDGPU::S_USUBO_PSEUDO;
1109
1110 CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
1111 {N->getOperand(0), N->getOperand(1)});
1112 }
1113}
1114
1115void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
1116 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
1117 SDValue Ops[10];
1118
1119 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
1120 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1121 SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
1122 Ops[8] = N->getOperand(0);
1123 Ops[9] = N->getOperand(4);
1124
1125 // If there are no source modifiers, prefer fmac over fma because it can use
1126 // the smaller VOP2 encoding.
1127 bool UseFMAC = Subtarget->hasDLInsts() &&
1128 cast<ConstantSDNode>(Ops[0])->isZero() &&
1129 cast<ConstantSDNode>(Ops[2])->isZero() &&
1130 cast<ConstantSDNode>(Ops[4])->isZero();
1131 unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
1132 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
1133}
1134
1135void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
1136 // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
1137 SDValue Ops[8];
1138
1139 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
1140 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1141 Ops[6] = N->getOperand(0);
1142 Ops[7] = N->getOperand(3);
1143
1144 CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
1145}
1146
1147// We need to handle this here because tablegen doesn't support matching
1148// instructions with multiple outputs.
1149void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
1150 EVT VT = N->getValueType(0);
1151
1152 assert(VT == MVT::f32 || VT == MVT::f64);
1153
1154 unsigned Opc
1155 = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
1156
1157 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
1158 // omod
1159 SDValue Ops[8];
1160 SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
1161 SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
1162 SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
1163 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1164}
1165
1166// We need to handle this here because tablegen doesn't support matching
1167// instructions with multiple outputs.
1168void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
1169 SDLoc SL(N);
1170 bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1171 unsigned Opc;
1172 bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(1);
1173 if (Subtarget->hasMADIntraFwdBug())
1174 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1175 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1176 else if (UseNoCarry)
1177 Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
1178 else
1179 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1180
1181 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1182 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1183 Clamp };
1184
1185 if (UseNoCarry) {
1186 MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops);
1187 ReplaceUses(SDValue(N, 0), SDValue(Mad, 0));
1188 CurDAG->RemoveDeadNode(N);
1189 return;
1190 }
1191
1192 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1193}
1194
1195// We need to handle this here because tablegen doesn't support matching
1196// instructions with multiple outputs.
1197void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1198 SDLoc SL(N);
1199 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1200 SDVTList VTList;
1201 unsigned Opc;
1202 if (Subtarget->hasMadU64U32NoCarry()) {
1203 VTList = CurDAG->getVTList(MVT::i64);
1204 Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
1205 } else {
1206 VTList = CurDAG->getVTList(MVT::i64, MVT::i1);
1207 if (Subtarget->hasMADIntraFwdBug()) {
1208 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1209 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1210 } else {
1211 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1212 }
1213 }
1214
1215 SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
1216 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1217 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
1218 SDNode *Mad = CurDAG->getMachineNode(Opc, SL, VTList, Ops);
1219 if (!SDValue(N, 0).use_empty()) {
1220 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
1221 SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1222 MVT::i32, SDValue(Mad, 0), Sub0);
1223 ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
1224 }
1225 if (!SDValue(N, 1).use_empty()) {
1226 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
1227 SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1228 MVT::i32, SDValue(Mad, 0), Sub1);
1229 ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
1230 }
1231 CurDAG->RemoveDeadNode(N);
1232}
1233
1234bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1235 if (!isUInt<16>(Offset))
1236 return false;
1237
1238 if (!Base || Subtarget->hasUsableDSOffset() ||
1239 Subtarget->unsafeDSOffsetFoldingEnabled())
1240 return true;
1241
 1242 // On Southern Islands, instructions with a negative base value and an offset
1243 // don't seem to work.
1244 return CurDAG->SignBitIsZero(Base);
1245}
1246
1247bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1248 SDValue &Offset) const {
1249 SDLoc DL(Addr);
1250 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1251 SDValue N0 = Addr.getOperand(0);
1252 SDValue N1 = Addr.getOperand(1);
1253 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1254 if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1255 // (add n0, c0)
1256 Base = N0;
1257 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1258 return true;
1259 }
1260 } else if (Addr.getOpcode() == ISD::SUB) {
1261 // sub C, x -> add (sub 0, x), C
1262 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1263 int64_t ByteOffset = C->getSExtValue();
1264 if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1265 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1266
1267 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1268 // the known bits in isDSOffsetLegal. We need to emit the selected node
1269 // here, so this is thrown away.
1270 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1271 Zero, Addr.getOperand(1));
1272
1273 if (isDSOffsetLegal(Sub, ByteOffset)) {
 1274 SmallVector<SDValue, 3> Opnds;
 1275 Opnds.push_back(Zero);
1276 Opnds.push_back(Addr.getOperand(1));
1277
1278 // FIXME: Select to VOP3 version for with-carry.
1279 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1280 if (Subtarget->hasAddNoCarry()) {
1281 SubOp = AMDGPU::V_SUB_U32_e64;
1282 Opnds.push_back(
1283 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1284 }
1285
1286 MachineSDNode *MachineSub =
1287 CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1288
1289 Base = SDValue(MachineSub, 0);
1290 Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1291 return true;
1292 }
1293 }
1294 }
1295 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1296 // If we have a constant address, prefer to put the constant into the
1297 // offset. This can save moves to load the constant address since multiple
1298 // operations can share the zero base address register, and enables merging
1299 // into read2 / write2 instructions.
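 // For example, two 4-byte accesses at constant addresses 400 and 404 can
 // share one v_mov_b32 of 0 as the base and later merge into a ds_read2_b32
 // with offsets 100 and 101.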
1300
1301 SDLoc DL(Addr);
1302
1303 if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1304 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1305 MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1306 DL, MVT::i32, Zero);
1307 Base = SDValue(MovZero, 0);
1308 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1309 return true;
1310 }
1311 }
1312
1313 // default case
1314 Base = Addr;
1315 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1316 return true;
1317}
1318
1319bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1320 unsigned Offset1,
1321 unsigned Size) const {
1322 if (Offset0 % Size != 0 || Offset1 % Size != 0)
1323 return false;
1324 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1325 return false;
1326
1327 if (!Base || Subtarget->hasUsableDSOffset() ||
1328 Subtarget->unsafeDSOffsetFoldingEnabled())
1329 return true;
1330
 1331 // On Southern Islands, instructions with a negative base value and an offset
1332 // don't seem to work.
1333 return CurDAG->SignBitIsZero(Base);
1334}
1335
 1336// Return whether the operation has the NoUnsignedWrap property.
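// An ISD::OR used here for address arithmetic is assumed to have disjoint
// operands, so it behaves like an add that cannot wrap unsigned.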
1337static bool isNoUnsignedWrap(SDValue Addr) {
1338 return (Addr.getOpcode() == ISD::ADD &&
1339 Addr->getFlags().hasNoUnsignedWrap()) ||
1340 Addr->getOpcode() == ISD::OR;
1341}
1342
1343// Check that the base address of flat scratch load/store in the form of `base +
1344// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
1345// requirement). We always treat the first operand as the base address here.
1346bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
1347 if (isNoUnsignedWrap(Addr))
1348 return true;
1349
1350 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1351 // values.
1352 if (Subtarget->hasSignedScratchOffsets())
1353 return true;
1354
1355 auto LHS = Addr.getOperand(0);
1356 auto RHS = Addr.getOperand(1);
1357
1358 // If the immediate offset is negative and within certain range, the base
1359 // address cannot also be negative. If the base is also negative, the sum
1360 // would be either negative or much larger than the valid range of scratch
1361 // memory a thread can access.
1362 ConstantSDNode *ImmOp = nullptr;
1363 if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
1364 if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
1365 return true;
1366 }
1367
1368 return CurDAG->SignBitIsZero(LHS);
1369}
1370
 1371// Check that the address values in SGPR/VGPR are legal for flat scratch in
 1372// the form: SGPR + VGPR.
1373bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1374 if (isNoUnsignedWrap(Addr))
1375 return true;
1376
1377 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1378 // values.
1379 if (Subtarget->hasSignedScratchOffsets())
1380 return true;
1381
1382 auto LHS = Addr.getOperand(0);
1383 auto RHS = Addr.getOperand(1);
1384 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1385}
1386
 1387// Check that the address values in SGPR/VGPR are legal for flat scratch in
 1388// the form: SGPR + VGPR + Imm.
1389bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
1390 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1391 // values.
1392 if (AMDGPU::isGFX12Plus(*Subtarget))
1393 return true;
1394
1395 auto Base = Addr.getOperand(0);
1396 auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
1397 // If the immediate offset is negative and within certain range, the base
1398 // address cannot also be negative. If the base is also negative, the sum
1399 // would be either negative or much larger than the valid range of scratch
1400 // memory a thread can access.
1401 if (isNoUnsignedWrap(Base) &&
1402 (isNoUnsignedWrap(Addr) ||
1403 (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
1404 return true;
1405
1406 auto LHS = Base.getOperand(0);
1407 auto RHS = Base.getOperand(1);
1408 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1409}
1410
 1411// TODO: If the offset is too big, put the low 16 bits into the offset.
1412bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1413 SDValue &Offset0,
1414 SDValue &Offset1) const {
1415 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
1416}
1417
1418bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1419 SDValue &Offset0,
1420 SDValue &Offset1) const {
1421 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
1422}
1423
1424bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1425 SDValue &Offset0, SDValue &Offset1,
1426 unsigned Size) const {
1427 SDLoc DL(Addr);
1428
1429 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1430 SDValue N0 = Addr.getOperand(0);
1431 SDValue N1 = Addr.getOperand(1);
1432 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1433 unsigned OffsetValue0 = C1->getZExtValue();
1434 unsigned OffsetValue1 = OffsetValue0 + Size;
1435
1436 // (add n0, c0)
1437 if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1438 Base = N0;
1439 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1440 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1441 return true;
1442 }
1443 } else if (Addr.getOpcode() == ISD::SUB) {
1444 // sub C, x -> add (sub 0, x), C
1445 if (const ConstantSDNode *C =
 1446 dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
 1447 unsigned OffsetValue0 = C->getZExtValue();
1448 unsigned OffsetValue1 = OffsetValue0 + Size;
1449
1450 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1451 SDLoc DL(Addr);
1452 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1453
1454 // XXX - This is kind of hacky. Create a dummy sub node so we can check
 1455 // the known bits in isDSOffset2Legal. We need to emit the selected node
1456 // here, so this is thrown away.
1457 SDValue Sub =
1458 CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1459
1460 if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
 1461 SmallVector<SDValue, 3> Opnds;
 1462 Opnds.push_back(Zero);
1463 Opnds.push_back(Addr.getOperand(1));
1464 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1465 if (Subtarget->hasAddNoCarry()) {
1466 SubOp = AMDGPU::V_SUB_U32_e64;
1467 Opnds.push_back(
1468 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1469 }
1470
1471 MachineSDNode *MachineSub = CurDAG->getMachineNode(
1472 SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1473
1474 Base = SDValue(MachineSub, 0);
1475 Offset0 =
1476 CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1477 Offset1 =
1478 CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1479 return true;
1480 }
1481 }
1482 }
1483 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1484 unsigned OffsetValue0 = CAddr->getZExtValue();
1485 unsigned OffsetValue1 = OffsetValue0 + Size;
1486
1487 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1488 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1489 MachineSDNode *MovZero =
1490 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1491 Base = SDValue(MovZero, 0);
1492 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1493 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1494 return true;
1495 }
1496 }
1497
1498 // default case
1499
1500 Base = Addr;
1501 Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
1502 Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
1503 return true;
1504}
1505
1506bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
1507 SDValue &SOffset, SDValue &Offset,
1508 SDValue &Offen, SDValue &Idxen,
1509 SDValue &Addr64) const {
 1510 // Subtarget prefers to use flat instructions.
1511 // FIXME: This should be a pattern predicate and not reach here
1512 if (Subtarget->useFlatForGlobal())
1513 return false;
1514
1515 SDLoc DL(Addr);
1516
1517 Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1518 Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1519 Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
1520 SOffset = Subtarget->hasRestrictedSOffset()
1521 ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
1522 : CurDAG->getTargetConstant(0, DL, MVT::i32);
1523
1524 ConstantSDNode *C1 = nullptr;
1525 SDValue N0 = Addr;
1526 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1527 C1 = cast<ConstantSDNode>(Addr.getOperand(1));
1528 if (isUInt<32>(C1->getZExtValue()))
1529 N0 = Addr.getOperand(0);
1530 else
1531 C1 = nullptr;
1532 }
1533
1534 if (N0.getOpcode() == ISD::ADD) {
1535 // (add N2, N3) -> addr64, or
1536 // (add (add N2, N3), C1) -> addr64
1537 SDValue N2 = N0.getOperand(0);
1538 SDValue N3 = N0.getOperand(1);
1539 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1540
1541 if (N2->isDivergent()) {
1542 if (N3->isDivergent()) {
1543 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1544 // addr64, and construct the resource from a 0 address.
1545 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1546 VAddr = N0;
1547 } else {
1548 // N2 is divergent, N3 is not.
1549 Ptr = N3;
1550 VAddr = N2;
1551 }
1552 } else {
1553 // N2 is not divergent.
1554 Ptr = N2;
1555 VAddr = N3;
1556 }
1557 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1558 } else if (N0->isDivergent()) {
1559 // N0 is divergent. Use it as the addr64, and construct the resource from a
1560 // 0 address.
1561 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1562 VAddr = N0;
1563 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1564 } else {
1565 // N0 -> offset, or
1566 // (N0 + C1) -> offset
1567 VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1568 Ptr = N0;
1569 }
1570
1571 if (!C1) {
1572 // No offset.
1573 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1574 return true;
1575 }
1576
1577 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1578 if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
1579 // Legal offset for instruction.
1580 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
1581 return true;
1582 }
1583
1584 // Illegal offset, store it in soffset.
1585 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1586 SOffset =
1587 SDValue(CurDAG->getMachineNode(
1588 AMDGPU::S_MOV_B32, DL, MVT::i32,
1589 CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
1590 0);
1591 return true;
1592}
1593
1594bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1595 SDValue &VAddr, SDValue &SOffset,
1596 SDValue &Offset) const {
1597 SDValue Ptr, Offen, Idxen, Addr64;
1598
 1599 // The addr64 bit was removed for Volcanic Islands.
1600 // FIXME: This should be a pattern predicate and not reach here
1601 if (!Subtarget->hasAddr64())
1602 return false;
1603
1604 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1605 return false;
1606
1607 ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1608 if (C->getSExtValue()) {
1609 SDLoc DL(Addr);
1610
1611 const SITargetLowering& Lowering =
1612 *static_cast<const SITargetLowering*>(getTargetLowering());
1613
1614 SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1615 return true;
1616 }
1617
1618 return false;
1619}
1620
1621std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1622 SDLoc DL(N);
1623
1624 auto *FI = dyn_cast<FrameIndexSDNode>(N);
1625 SDValue TFI =
1626 FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1627
1628 // We rebase the base address into an absolute stack address and hence
1629 // use constant 0 for soffset. This value must be retained until
 1630 // frame elimination; eliminateFrameIndex will choose the appropriate
1631 // frame register if need be.
1632 return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1633}
1634
1635bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1636 SDValue Addr, SDValue &Rsrc,
1637 SDValue &VAddr, SDValue &SOffset,
1638 SDValue &ImmOffset) const {
1639
1640 SDLoc DL(Addr);
1641 MachineFunction &MF = CurDAG->getMachineFunction();
1642 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1643
1644 Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1645
1646 if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1647 int64_t Imm = CAddr->getSExtValue();
 1648 const int64_t NullPtr =
 1649 AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
1650 // Don't fold null pointer.
1651 if (Imm != NullPtr) {
1652 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
1653 SDValue HighBits =
1654 CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
1655 MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1656 AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1657 VAddr = SDValue(MovHighBits, 0);
1658
1659 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1660 ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
1661 return true;
1662 }
1663 }
1664
1665 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1666 // (add n0, c1)
1667
1668 SDValue N0 = Addr.getOperand(0);
1669 uint64_t C1 = Addr.getConstantOperandVal(1);
1670
1671 // Offsets in vaddr must be positive if range checking is enabled.
1672 //
1673 // The total computation of vaddr + soffset + offset must not overflow. If
1674 // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1675 // overflowing.
1676 //
1677 // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1678 // always perform a range check. If a negative vaddr base index was used,
1679 // this would fail the range check. The overall address computation would
1680 // compute a valid address, but this doesn't happen due to the range
1681 // check. For out-of-bounds MUBUF loads, a 0 is returned.
1682 //
1683 // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1684 // MUBUF vaddr, but not on older subtargets which can only do this if the
1685 // sign bit is known 0.
1686 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1687 if (TII->isLegalMUBUFImmOffset(C1) &&
1688 (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1689 CurDAG->SignBitIsZero(N0))) {
1690 std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1691 ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
1692 return true;
1693 }
1694 }
1695
1696 // (node)
1697 std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1698 ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1699 return true;
1700}
1701
1702static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1703 if (Val.getOpcode() != ISD::CopyFromReg)
1704 return false;
1705 auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
1706 if (!Reg.isPhysical())
1707 return false;
1708 const auto *RC = TRI.getPhysRegBaseClass(Reg);
1709 return RC && TRI.isSGPRClass(RC);
1710}
1711
1712bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1713 SDValue Addr,
1714 SDValue &SRsrc,
1715 SDValue &SOffset,
1716 SDValue &Offset) const {
1717 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
1718 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1719 MachineFunction &MF = CurDAG->getMachineFunction();
1720 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1721 SDLoc DL(Addr);
1722
1723 // CopyFromReg <sgpr>
1724 if (IsCopyFromSGPR(*TRI, Addr)) {
1725 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1726 SOffset = Addr;
1727 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1728 return true;
1729 }
1730
1731 ConstantSDNode *CAddr;
1732 if (Addr.getOpcode() == ISD::ADD) {
1733 // Add (CopyFromReg <sgpr>) <constant>
1734 CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1735 if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
1736 return false;
1737 if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
1738 return false;
1739
1740 SOffset = Addr.getOperand(0);
1741 } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
1742 TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
1743 // <constant>
1744 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1745 } else {
1746 return false;
1747 }
1748
1749 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1750
1751 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
1752 return true;
1753}
1754
1755bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1756 SDValue &SOffset, SDValue &Offset
1757 ) const {
1758 SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1759 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1760
1761 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1762 return false;
1763
1764 if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1765 !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1766 !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1767 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1768 maskTrailingOnes<uint64_t>(32); // Size
1769 SDLoc DL(Addr);
1770
1771 const SITargetLowering& Lowering =
1772 *static_cast<const SITargetLowering*>(getTargetLowering());
1773
1774 SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1775 return true;
1776 }
1777 return false;
1778}
1779
1780bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1781 SDValue &SOffset) const {
1782 if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
1783 SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
1784 return true;
1785 }
1786
1787 SOffset = ByteOffsetNode;
1788 return true;
1789}
1790
1791// Find a load or store from the corresponding pattern root.
1792// Roots may be build_vector, bitconvert or their combinations.
1793static MemSDNode *findMemSDNode(SDNode *N) {
1794 N = AMDGPUTargetLowering::stripBitcast(SDValue(N, 0)).getNode();
1795 if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1796 return MN;
1797 assert(isa<BuildVectorSDNode>(N));
1798 for (SDValue V : N->op_values())
1799 if (MemSDNode *MN =
1800 dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
1801 return MN;
1802 llvm_unreachable("cannot find MemSDNode in the pattern!");
1803}
1804
1805bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1806 SDValue &VAddr, SDValue &Offset,
1807 uint64_t FlatVariant) const {
1808 int64_t OffsetVal = 0;
1809
1810 unsigned AS = findMemSDNode(N)->getAddressSpace();
1811
1812 bool CanHaveFlatSegmentOffsetBug =
1813 Subtarget->hasFlatSegmentOffsetBug() &&
1814 FlatVariant == SIInstrFlags::FLAT &&
1815 (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);
1816
1817 if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1818 SDValue N0, N1;
1819 if (isBaseWithConstantOffset64(Addr, N0, N1) &&
1820 (FlatVariant != SIInstrFlags::FlatScratch ||
1821 isFlatScratchBaseLegal(Addr))) {
1822 int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1823
1824 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1825 if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1826 Addr = N0;
1827 OffsetVal = COffsetVal;
1828 } else {
1829 // If the offset doesn't fit, put the low bits into the offset field and
1830 // add the rest.
1831 //
1832 // For a FLAT instruction the hardware decides whether to access
1833 // global/scratch/shared memory based on the high bits of vaddr,
1834 // ignoring the offset field, so we have to ensure that when we add
1835 // remainder to vaddr it still points into the same underlying object.
1836 // The easiest way to do that is to make sure that we split the offset
1837 // into two pieces that are both >= 0 or both <= 0.
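 // For example, a combined offset of -0x1010 could be split into an
 // immediate piece of -0x10 and a remainder of -0x1000 added to vaddr;
 // with both pieces non-positive, the intermediate sum cannot cross into
 // a different underlying object.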
1838
1839 SDLoc DL(N);
1840 uint64_t RemainderOffset;
1841
1842 std::tie(OffsetVal, RemainderOffset) =
1843 TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1844
1845 SDValue AddOffsetLo =
1846 getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1847 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1848
1849 if (Addr.getValueType().getSizeInBits() == 32) {
1850 SmallVector<SDValue, 3> Opnds;
1851 Opnds.push_back(N0);
1852 Opnds.push_back(AddOffsetLo);
1853 unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1854 if (Subtarget->hasAddNoCarry()) {
1855 AddOp = AMDGPU::V_ADD_U32_e64;
1856 Opnds.push_back(Clamp);
1857 }
1858 Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1859 } else {
1860 // TODO: Should this try to use a scalar add pseudo if the base address
1861 // is uniform and saddr is usable?
1862 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1863 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1864
1865 SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1866 DL, MVT::i32, N0, Sub0);
1867 SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1868 DL, MVT::i32, N0, Sub1);
1869
1870 SDValue AddOffsetHi =
1871 getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1872
1873 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1874
1875 SDNode *Add =
1876 CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1877 {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1878
1879 SDNode *Addc = CurDAG->getMachineNode(
1880 AMDGPU::V_ADDC_U32_e64, DL, VTs,
1881 {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1882
1883 SDValue RegSequenceArgs[] = {
1884 CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
1885 SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1886
1887 Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1888 MVT::i64, RegSequenceArgs),
1889 0);
1890 }
1891 }
1892 }
1893 }
1894
1895 VAddr = Addr;
1896 Offset = CurDAG->getSignedTargetConstant(OffsetVal, SDLoc(), MVT::i32);
1897 return true;
1898}
1899
1900bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1901 SDValue &VAddr,
1902 SDValue &Offset) const {
1903 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
1904}
1905
1906bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1907 SDValue &VAddr,
1908 SDValue &Offset) const {
1909 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
1910}
1911
1912bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1913 SDValue &VAddr,
1914 SDValue &Offset) const {
1915 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1916 SIInstrFlags::FlatScratch);
1917}
1918
1919// If this matches *_extend i32:x, return x.
1920// Otherwise, if the value is already i32, return it.
1921static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned,
1922 const SelectionDAG *DAG) {
1923 if (Op.getValueType() == MVT::i32)
1924 return Op;
1925
1926 if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) &&
1927 Op.getOpcode() != ISD::ANY_EXTEND &&
1928 !(DAG->SignBitIsZero(Op) &&
1929 Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND)))
1930 return SDValue();
1931
1932 SDValue ExtSrc = Op.getOperand(0);
1933 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1934}
1935
1936// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1937// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset)
1938bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
1939 SDValue &SAddr, SDValue &VOffset,
1940 SDValue &Offset, bool &ScaleOffset,
1941 bool NeedIOffset) const {
1942 int64_t ImmOffset = 0;
1943 ScaleOffset = false;
1944
1945 // Match the immediate offset first, which canonically is moved as low as
1946 // possible.
1947
1948 SDValue LHS, RHS;
1949 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1950 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1951 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1952
1953 if (NeedIOffset &&
1954 TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1955 SIInstrFlags::FlatGlobal)) {
1956 Addr = LHS;
1957 ImmOffset = COffsetVal;
1958 } else if (!LHS->isDivergent()) {
1959 if (COffsetVal > 0) {
1960 SDLoc SL(N);
1961 // saddr + large_offset -> saddr +
1962 // (voffset = large_offset & ~MaxOffset) +
1963 // (large_offset & MaxOffset);
1964 int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal;
1965 if (NeedIOffset) {
1966 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1967 COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
1968 }
1969
1970 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
1971 : isUInt<32>(RemainderOffset)) {
1972 SDNode *VMov = CurDAG->getMachineNode(
1973 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1974 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1975 VOffset = SDValue(VMov, 0);
1976 SAddr = LHS;
1977 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
1978 return true;
1979 }
1980 }
1981
1982 // We are adding a 64 bit SGPR and a constant. If constant bus limit
1983 // is 1 we would need to perform 1 or 2 extra moves for each half of
1984 // the constant and it is better to do a scalar add and then issue a
1985 // single VALU instruction to materialize zero. Otherwise it takes fewer
1986 // instructions to perform VALU adds with immediates or inline literals.
1987 unsigned NumLiterals =
1988 !TII->isInlineConstant(APInt(32, Lo_32(COffsetVal))) +
1989 !TII->isInlineConstant(APInt(32, Hi_32(COffsetVal)));
1990 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
1991 return false;
1992 }
1993 }
1994
1995 // Match the variable offset.
1996 if (Addr.getOpcode() == ISD::ADD) {
1997 LHS = Addr.getOperand(0);
1998
1999 if (!LHS->isDivergent()) {
2000 // add (i64 sgpr), (*_extend (i32 vgpr))
2001 RHS = Addr.getOperand(1);
2002 ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset());
2003 if (SDValue ExtRHS = matchExtFromI32orI32(
2004 RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
2005 SAddr = LHS;
2006 VOffset = ExtRHS;
2007 }
2008 }
2009
2010 RHS = Addr.getOperand(1);
2011 if (!SAddr && !RHS->isDivergent()) {
2012 // add (*_extend (i32 vgpr)), (i64 sgpr)
2013 ScaleOffset = SelectScaleOffset(N, LHS, Subtarget->hasSignedGVSOffset());
2014 if (SDValue ExtLHS = matchExtFromI32orI32(
2015 LHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
2016 SAddr = RHS;
2017 VOffset = ExtLHS;
2018 }
2019 }
2020
2021 if (SAddr) {
2022 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2023 return true;
2024 }
2025 }
2026
2027 if (Subtarget->hasScaleOffset() &&
2028 (Addr.getOpcode() == (Subtarget->hasSignedGVSOffset()
2029 ? AMDGPUISD::MAD_I64_I32
2030 : AMDGPUISD::MAD_U64_U32) ||
2031 (Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 &&
2032 CurDAG->SignBitIsZero(Addr.getOperand(0)))) &&
2033 Addr.getOperand(0)->isDivergent() &&
2034 isa<ConstantSDNode>(Addr.getOperand(1)) &&
2035 !Addr.getOperand(2)->isDivergent()) {
2036 // mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr)
2037 unsigned Size =
2038 (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
2039 ScaleOffset = Addr.getConstantOperandVal(1) == Size;
2040 if (ScaleOffset) {
2041 SAddr = Addr.getOperand(2);
2042 VOffset = Addr.getOperand(0);
2043 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2044 return true;
2045 }
2046 }
2047
2048 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
2049 isa<ConstantSDNode>(Addr))
2050 return false;
2051
2052 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
2053 // moves required to copy a 64-bit SGPR to VGPR.
2054 SAddr = Addr;
2055 SDNode *VMov =
2056 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
2057 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
2058 VOffset = SDValue(VMov, 0);
2059 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2060 return true;
2061}
2062
2063bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
2064 SDValue &SAddr, SDValue &VOffset,
2065 SDValue &Offset,
2066 SDValue &CPol) const {
2067 bool ScaleOffset;
2068 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2069 return false;
2070
2071 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2072 SDLoc(), MVT::i32);
2073 return true;
2074}
2075
2076bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
2077 SDValue &SAddr, SDValue &VOffset,
2078 SDValue &Offset,
2079 SDValue &CPol) const {
2080 bool ScaleOffset;
2081 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2082 return false;
2083
2084 // We are assuming CPol is always the last operand of the intrinsic.
2085 auto PassedCPol =
2086 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2087 CPol = CurDAG->getTargetConstant(
2088 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2089 return true;
2090}
2091
2092bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr,
2093 SDValue &SAddr,
2094 SDValue &VOffset,
2095 SDValue &Offset,
2096 SDValue &CPol) const {
2097 bool ScaleOffset;
2098 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2099 return false;
2100
2101 // We are assuming CPol is second from last operand of the intrinsic.
2102 auto PassedCPol =
2103 N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2104 CPol = CurDAG->getTargetConstant(
2105 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2106 return true;
2107}
2108
2109bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
2110 SDValue &SAddr, SDValue &VOffset,
2111 SDValue &Offset,
2112 SDValue &CPol) const {
2113 bool ScaleOffset;
2114 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2115 return false;
2116
2117 unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC;
2118 CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32);
2119 return true;
2120}
2121
2122bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
2123 SDValue &SAddr,
2124 SDValue &VOffset,
2125 SDValue &CPol) const {
2126 bool ScaleOffset;
2127 SDValue DummyOffset;
2128 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2129 false))
2130 return false;
2131
2132 // We are assuming CPol is always the last operand of the intrinsic.
2133 auto PassedCPol =
2134 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2135 CPol = CurDAG->getTargetConstant(
2136 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2137 return true;
2138}
2139
2140bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr,
2141 SDValue &SAddr,
2142 SDValue &VOffset,
2143 SDValue &CPol) const {
2144 bool ScaleOffset;
2145 SDValue DummyOffset;
2146 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2147 false))
2148 return false;
2149
2150 // We are assuming CPol is second from last operand of the intrinsic.
2151 auto PassedCPol =
2152 N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2153 CPol = CurDAG->getTargetConstant(
2154 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2155 return true;
2156}
2157
2158static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
2159 if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
2160 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
2161 } else if (SAddr.getOpcode() == ISD::ADD &&
2162 isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
2163 // Materialize this into a scalar move for scalar address to avoid
2164 // readfirstlane.
2165 auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
2166 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
2167 FI->getValueType(0));
2168 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
2169 MVT::i32, TFI, SAddr.getOperand(1)),
2170 0);
2171 }
2172
2173 return SAddr;
2174}
2175
2176// Match (32-bit SGPR base) + sext(imm offset)
2177bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
2178 SDValue &SAddr,
2179 SDValue &Offset) const {
2180 if (Addr->isDivergent())
2181 return false;
2182
2183 SDLoc DL(Addr);
2184
2185 int64_t COffsetVal = 0;
2186
2187 if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
2188 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
2189 SAddr = Addr.getOperand(0);
2190 } else {
2191 SAddr = Addr;
2192 }
2193
2194 SAddr = SelectSAddrFI(CurDAG, SAddr);
2195
2196 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2197
2198 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2199 SIInstrFlags::FlatScratch)) {
2200 int64_t SplitImmOffset, RemainderOffset;
2201 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2202 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
2203
2204 COffsetVal = SplitImmOffset;
2205
2206 SDValue AddOffset =
2207 SAddr.getOpcode() == ISD::TargetFrameIndex
2208 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
2209 : CurDAG->getSignedTargetConstant(RemainderOffset, DL, MVT::i32);
2210 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
2211 SAddr, AddOffset),
2212 0);
2213 }
2214
2215 Offset = CurDAG->getSignedTargetConstant(COffsetVal, DL, MVT::i32);
2216
2217 return true;
2218}
2219
2220// Check whether the flat scratch SVS swizzle bug affects this access.
2221bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
2222 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
2223 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
2224 return false;
2225
2226 // The bug affects the swizzling of SVS accesses if there is any carry out
2227 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
2228 // voffset to (soffset + inst_offset).
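 // For example, if the known maximum values give (VMax & 3) == 3 and
 // (SMax & 3) == 2, then 3 + 2 >= 4, so a carry out of bit 1 cannot be
 // ruled out and the access is conservatively treated as affected.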
2229 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
2230 KnownBits SKnown =
2231 KnownBits::add(CurDAG->computeKnownBits(SAddr),
2232 KnownBits::makeConstant(APInt(32, ImmOffset,
2233 /*isSigned=*/true)));
2234 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
2235 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
2236 return (VMax & 3) + (SMax & 3) >= 4;
2237}
2238
2239bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
2240 SDValue &VAddr, SDValue &SAddr,
2241 SDValue &Offset,
2242 SDValue &CPol) const {
2243 int64_t ImmOffset = 0;
2244
2245 SDValue LHS, RHS;
2246 SDValue OrigAddr = Addr;
2247 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
2248 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
2249 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2250
2251 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2252 SIInstrFlags::FlatScratch)) {
2253 Addr = LHS;
2254 ImmOffset = COffsetVal;
2255 } else if (!LHS->isDivergent() && COffsetVal > 0) {
2256 SDLoc SL(N);
2257 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
2258 // (large_offset & MaxOffset);
2259 int64_t SplitImmOffset, RemainderOffset;
2260 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2261 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
2262
2263 if (isUInt<32>(RemainderOffset)) {
2264 SDNode *VMov = CurDAG->getMachineNode(
2265 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
2266 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
2267 VAddr = SDValue(VMov, 0);
2268 SAddr = LHS;
2269 if (!isFlatScratchBaseLegal(Addr))
2270 return false;
2271 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
2272 return false;
2273 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
2274 CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2275 return true;
2276 }
2277 }
2278 }
2279
2280 if (Addr.getOpcode() != ISD::ADD)
2281 return false;
2282
2283 LHS = Addr.getOperand(0);
2284 RHS = Addr.getOperand(1);
2285
2286 if (!LHS->isDivergent() && RHS->isDivergent()) {
2287 SAddr = LHS;
2288 VAddr = RHS;
2289 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
2290 SAddr = RHS;
2291 VAddr = LHS;
2292 } else {
2293 return false;
2294 }
2295
2296 if (OrigAddr != Addr) {
2297 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
2298 return false;
2299 } else {
2300 if (!isFlatScratchBaseLegalSV(OrigAddr))
2301 return false;
2302 }
2303
2304 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
2305 return false;
2306 SAddr = SelectSAddrFI(CurDAG, SAddr);
2307 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2308
2309 bool ScaleOffset = SelectScaleOffset(N, VAddr, true /* IsSigned */);
2310 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2311 SDLoc(), MVT::i32);
2312 return true;
2313}
2314
2315// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2316// negative if the resulting (Offset + (M0 or SOffset or zero)) is negative.
2317// Handle the case where the Immediate Offset + SOffset is negative.
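// For example, ImmOffset = -8 with an SOffset whose known minimum value is 4
// gives -8 + 4 = -4 < 0, so the combination is rejected.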
2318bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2319 bool Imm32Only,
2320 bool IsBuffer,
2321 int64_t ImmOffset) const {
2322 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2323 AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2324 KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2325 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2326 return false;
2327 }
2328
2329 return true;
2330}
2331
2332// Given \p Offset and load node \p N, check if \p Offset is a multiple of
2333// the load byte size. If it is, update \p Offset to a pre-scaled value and
2334// return true.
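// For example, for a 4-byte load an offset of the form (shl idx, 2) or
// (mul idx, 4) is replaced by idx, and the callers then request hardware
// scaling (e.g. via the SCAL cache-policy bit).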
2335bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset,
2336 bool IsSigned) const {
2337 bool ScaleOffset = false;
2338 if (!Subtarget->hasScaleOffset() || !Offset)
2339 return false;
2340
2341 unsigned Size =
2342 (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
2343
2344 SDValue Off = Offset;
2345 if (SDValue Ext = matchExtFromI32orI32(Offset, IsSigned, CurDAG))
2346 Off = Ext;
2347
2348 if (isPowerOf2_32(Size) && Off.getOpcode() == ISD::SHL) {
2349 if (auto *C = dyn_cast<ConstantSDNode>(Off.getOperand(1)))
2350 ScaleOffset = C->getZExtValue() == Log2_32(Size);
2351 } else if (Offset.getOpcode() == ISD::MUL ||
2352 (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) ||
2353 Offset.getOpcode() == AMDGPUISD::MUL_U24 ||
2354 (Offset.isMachineOpcode() &&
2355 Offset.getMachineOpcode() ==
2356 (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO
2357 : AMDGPU::S_MUL_U64_U32_PSEUDO))) {
2358 if (auto *C = dyn_cast<ConstantSDNode>(Offset.getOperand(1)))
2359 ScaleOffset = C->getZExtValue() == Size;
2360 }
2361
2362 if (ScaleOffset)
2363 Offset = Off.getOperand(0);
2364
2365 return ScaleOffset;
2366}
2367
2368// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
2369// not null) offset. If Imm32Only is true, match only 32-bit immediate
2370// offsets available on CI.
2371bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode,
2372 SDValue *SOffset, SDValue *Offset,
2373 bool Imm32Only, bool IsBuffer,
2374 bool HasSOffset, int64_t ImmOffset,
2375 bool *ScaleOffset) const {
2376 assert((!SOffset || !Offset) &&
2377 "Cannot match both soffset and offset at the same time!");
2378
2379 if (ScaleOffset) {
2380 assert(N && SOffset);
2381
2382 *ScaleOffset = SelectScaleOffset(N, ByteOffsetNode, false /* IsSigned */);
2383 }
2384
2385 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
2386 if (!C) {
2387 if (!SOffset)
2388 return false;
2389
2390 if (ByteOffsetNode.getValueType().isScalarInteger() &&
2391 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
2392 *SOffset = ByteOffsetNode;
2393 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2394 ImmOffset);
2395 }
2396 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2397 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
2398 *SOffset = ByteOffsetNode.getOperand(0);
2399 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2400 ImmOffset);
2401 }
2402 }
2403 return false;
2404 }
2405
2406 SDLoc SL(ByteOffsetNode);
2407
2408 // GFX9 and GFX10 have signed byte immediate offsets. The immediate
2409 // offset for S_BUFFER instructions is unsigned.
2410 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2411 std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
2412 *Subtarget, ByteOffset, IsBuffer, HasSOffset);
2413 if (EncodedOffset && Offset && !Imm32Only) {
2414 *Offset = CurDAG->getSignedTargetConstant(*EncodedOffset, SL, MVT::i32);
2415 return true;
2416 }
2417
2418 // SGPR and literal offsets are unsigned.
2419 if (ByteOffset < 0)
2420 return false;
2421
2422 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2423 if (EncodedOffset && Offset && Imm32Only) {
2424 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2425 return true;
2426 }
2427
2428 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
2429 return false;
2430
2431 if (SOffset) {
2432 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2433 *SOffset = SDValue(
2434 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2435 return true;
2436 }
2437
2438 return false;
2439}
2440
2441SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2442 if (Addr.getValueType() != MVT::i32)
2443 return Addr;
2444
2445 // Zero-extend a 32-bit address.
2446 SDLoc SL(Addr);
2447
2448 const MachineFunction &MF = CurDAG->getMachineFunction();
2449 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2450 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2451 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2452
2453 const SDValue Ops[] = {
2454 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2455 Addr,
2456 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2457 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2458 0),
2459 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2460 };
2461
2462 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2463 Ops), 0);
2464}
2465
2466// Match a base and an immediate (if Offset is not null) or an SGPR (if
2467// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2468// true, match only 32-bit immediate offsets available on CI.
2469bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr,
2470 SDValue &SBase, SDValue *SOffset,
2471 SDValue *Offset, bool Imm32Only,
2472 bool IsBuffer, bool HasSOffset,
2473 int64_t ImmOffset,
2474 bool *ScaleOffset) const {
2475 if (SOffset && Offset) {
2476 assert(!Imm32Only && !IsBuffer);
2477 SDValue B;
2478
2479 if (!SelectSMRDBaseOffset(N, Addr, B, nullptr, Offset, false, false, true))
2480 return false;
2481
2482 int64_t ImmOff = 0;
2483 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
2484 ImmOff = C->getSExtValue();
2485
2486 return SelectSMRDBaseOffset(N, B, SBase, SOffset, nullptr, false, false,
2487 true, ImmOff, ScaleOffset);
2488 }
2489
2490 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2491 // wraparound, because s_load instructions perform the addition in 64 bits.
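// For example, a 32-bit base of 0xFFFFFFF0 with an offset of 0x20 wraps to
// 0x10 in 32 bits, but the 64-bit hardware add yields 0x100000010, so the
// split is only done when the add is known not to wrap.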
2492 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2493 !Addr->getFlags().hasNoUnsignedWrap())
2494 return false;
2495
2496 SDValue N0, N1;
2497 // Extract the base and offset if possible.
2498 if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
2499 N0 = Addr.getOperand(0);
2500 N1 = Addr.getOperand(1);
2501 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2502 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2503 }
2504 if (!N0 || !N1)
2505 return false;
2506
2507 if (SelectSMRDOffset(N, N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2508 ImmOffset, ScaleOffset)) {
2509 SBase = N0;
2510 return true;
2511 }
2512 if (SelectSMRDOffset(N, N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2513 ImmOffset, ScaleOffset)) {
2514 SBase = N1;
2515 return true;
2516 }
2517 return false;
2518}
2519
2520bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase,
2521 SDValue *SOffset, SDValue *Offset,
2522 bool Imm32Only, bool *ScaleOffset) const {
2523 if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only,
2524 /* IsBuffer */ false, /* HasSOffset */ false,
2525 /* ImmOffset */ 0, ScaleOffset)) {
2526 SBase = Expand32BitAddress(SBase);
2527 return true;
2528 }
2529
2530 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2531 SBase = Expand32BitAddress(Addr);
2532 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2533 return true;
2534 }
2535
2536 return false;
2537}
2538
2539bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2540 SDValue &Offset) const {
2541 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2542 &Offset);
2543}
2544
2545bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2546 SDValue &Offset) const {
2547 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2548 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2549 &Offset, /* Imm32Only */ true);
2550}
2551
2552bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase,
2553 SDValue &SOffset, SDValue &CPol) const {
2554 bool ScaleOffset;
2555 if (!SelectSMRD(N, Addr, SBase, &SOffset, /* Offset */ nullptr,
2556 /* Imm32Only */ false, &ScaleOffset))
2557 return false;
2558
2559 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2560 SDLoc(N), MVT::i32);
2561 return true;
2562}
2563
2564bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr,
2565 SDValue &SBase, SDValue &SOffset,
2566 SDValue &Offset,
2567 SDValue &CPol) const {
2568 bool ScaleOffset;
2569 if (!SelectSMRD(N, Addr, SBase, &SOffset, &Offset, false, &ScaleOffset))
2570 return false;
2571
2572 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2573 SDLoc(N), MVT::i32);
2574 return true;
2575}
2576
2577bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2578 return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
2579 /* Imm32Only */ false, /* IsBuffer */ true);
2580}
2581
2582bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2583 SDValue &Offset) const {
2584 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2585 return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
2586 /* Imm32Only */ true, /* IsBuffer */ true);
2587}
2588
2589bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2590 SDValue &Offset) const {
2591 // Match the (soffset + offset) pair as a 32-bit register base and
2592 // an immediate offset.
2593 return N.getValueType() == MVT::i32 &&
2594 SelectSMRDBaseOffset(/* N */ nullptr, N, /* SBase */ SOffset,
2595 /* SOffset*/ nullptr, &Offset,
2596 /* Imm32Only */ false, /* IsBuffer */ true);
2597}
2598
2599bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2600 SDValue &Base,
2601 SDValue &Offset) const {
2602 SDLoc DL(Index);
2603
2604 if (CurDAG->isBaseWithConstantOffset(Index)) {
2605 SDValue N0 = Index.getOperand(0);
2606 SDValue N1 = Index.getOperand(1);
2607 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2608
2609 // (add n0, c0)
2610 // Don't peel off the offset (c0) if doing so could possibly lead
2611 // the base (n0) to be negative.
2612 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
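 // For example, for (add n0, 8) where n0 could be -4, the original index 4
 // is valid, but peeling off the constant would leave a negative base of -4.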
2613 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2614 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2615 Base = N0;
2616 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2617 return true;
2618 }
2619 }
2620
2621 if (isa<ConstantSDNode>(Index))
2622 return false;
2623
2624 Base = Index;
2625 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2626 return true;
2627}
2628
2629SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2630 SDValue Val, uint32_t Offset,
2631 uint32_t Width) {
2632 if (Val->isDivergent()) {
2633 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2634 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2635 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2636
2637 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2638 }
2639 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2640 // Transformation function, pack the offset and width of a BFE into
2641 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2642 // source, bits [5:0] contain the offset and bits [22:16] the width.
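 // For example, Offset = 8 and Width = 16 pack to 8 | (16 << 16) == 0x00100008.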
2643 uint32_t PackedVal = Offset | (Width << 16);
2644 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2645
2646 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2647}
2648
2649void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2650 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2651 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2652 // Predicate: 0 < b <= c < 32
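 // For example, ((a << 8) srl 24) becomes BFE_U32 a, 16, 8, i.e. an unsigned
 // extract of bits [23:16] of a.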
2653
2654 const SDValue &Shl = N->getOperand(0);
2655 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2656 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2657
2658 if (B && C) {
2659 uint32_t BVal = B->getZExtValue();
2660 uint32_t CVal = C->getZExtValue();
2661
2662 if (0 < BVal && BVal <= CVal && CVal < 32) {
2663 bool Signed = N->getOpcode() == ISD::SRA;
2664 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2665 32 - CVal));
2666 return;
2667 }
2668 }
2669 SelectCode(N);
2670}
2671
2672void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2673 switch (N->getOpcode()) {
2674 case ISD::AND:
2675 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2676 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2677 // Predicate: isMask(mask)
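 // For example, ((a srl 4) & 0xff) becomes BFE_U32 a, 4, 8,
 // since popcount(0xff) == 8.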
2678 const SDValue &Srl = N->getOperand(0);
2679 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2680 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2681
2682 if (Shift && Mask) {
2683 uint32_t ShiftVal = Shift->getZExtValue();
2684 uint32_t MaskVal = Mask->getZExtValue();
2685
2686 if (isMask_32(MaskVal)) {
2687 uint32_t WidthVal = llvm::popcount(MaskVal);
2688 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2689 WidthVal));
2690 return;
2691 }
2692 }
2693 }
2694 break;
2695 case ISD::SRL:
2696 if (N->getOperand(0).getOpcode() == ISD::AND) {
2697 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2698 // Predicate: isMask(mask >> b)
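 // For example, ((a & 0xff0) srl 4) becomes BFE_U32 a, 4, 8,
 // since (0xff0 >> 4) == 0xff is a mask of width 8.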
2699 const SDValue &And = N->getOperand(0);
2700 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2701 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2702
2703 if (Shift && Mask) {
2704 uint32_t ShiftVal = Shift->getZExtValue();
2705 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2706
2707 if (isMask_32(MaskVal)) {
2708 uint32_t WidthVal = llvm::popcount(MaskVal);
2709 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2710 WidthVal));
2711 return;
2712 }
2713 }
2714 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2715 SelectS_BFEFromShifts(N);
2716 return;
2717 }
2718 break;
2719 case ISD::SRA:
2720 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2721 SelectS_BFEFromShifts(N);
2722 return;
2723 }
2724 break;
2725
2726 case ISD::SIGN_EXTEND_INREG: {
2727 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2728 SDValue Src = N->getOperand(0);
2729 if (Src.getOpcode() != ISD::SRL)
2730 break;
2731
2732 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2733 if (!Amt)
2734 break;
2735
2736 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2737 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2738 Amt->getZExtValue(), Width));
2739 return;
2740 }
2741 }
2742
2743 SelectCode(N);
2744}
2745
2746bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2747 assert(N->getOpcode() == ISD::BRCOND);
2748 if (!N->hasOneUse())
2749 return false;
2750
2751 SDValue Cond = N->getOperand(1);
2752 if (Cond.getOpcode() == ISD::CopyToReg)
2753 Cond = Cond.getOperand(2);
2754
2755 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2756 return false;
2757
2758 MVT VT = Cond.getOperand(0).getSimpleValueType();
2759 if (VT == MVT::i32)
2760 return true;
2761
2762 if (VT == MVT::i64) {
2763 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2764 return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
2765 Subtarget->hasScalarCompareEq64();
2766 }
2767
2768 if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2769 return true;
2770
2771 return false;
2772}
2773
2774static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2775 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2776 // Special case for amdgcn.ballot:
2777 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2778 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2779 // =>
2780 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2781 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2782 // Cond becomes a i(WaveSize) full mask value.
2783 // Note that ballot doesn't use the SETEQ condition, but it's easy to support
2784 // it here for completeness, so in this case Negate is set to true on return.
2785 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2786 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2787 isNullConstant(VCMP.getOperand(1))) {
2788
2789 auto Cond = VCMP.getOperand(0);
2790 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2791 Cond = Cond.getOperand(0);
2792
2793 if (isBoolSGPR(Cond)) {
2794 Negate = VCMP_CC == ISD::SETEQ;
2795 return Cond;
2796 }
2797 }
2798 return SDValue();
2799}
2800
2801void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2802 SDValue Cond = N->getOperand(1);
2803
2804 if (Cond.isUndef()) {
2805 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2806 N->getOperand(2), N->getOperand(0));
2807 return;
2808 }
2809
2810 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2811
2812 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2813 bool AndExec = !UseSCCBr;
2814 bool Negate = false;
2815
2816 if (Cond.getOpcode() == ISD::SETCC &&
2817 Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2818 SDValue VCMP = Cond->getOperand(0);
2819 auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
2820 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2821 isNullConstant(Cond->getOperand(1)) &&
2822 // We may encounter ballot.i64 in wave32 mode on -O0.
2823 VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
2824 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2825 // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2826 // BRCOND i1 %C, %BB
2827 // =>
2828 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2829 // VCC = COPY i(WaveSize) %VCMP
2830 // S_CBRANCH_VCCNZ/VCCZ %BB
2831 Negate = CC == ISD::SETEQ;
2832 bool NegatedBallot = false;
2833 if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
2834 Cond = BallotCond;
2835 UseSCCBr = !BallotCond->isDivergent();
2836 Negate = Negate ^ NegatedBallot;
2837 } else {
2838 // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2839 // selected as V_CMP, but this may change for uniform condition.
2840 Cond = VCMP;
2841 UseSCCBr = false;
2842 }
2843 }
2844 // Cond is either a V_CMP resulting from AMDGPUISD::SETCC, a combination of
2845 // V_CMPs resulting from a ballot, or a ballot with a uniform condition in
2846 // which case SCC is used.
2847 AndExec = false;
2848 }
2849
2850 unsigned BrOp =
2851 UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2852 : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2853 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2854 SDLoc SL(N);
2855
2856 if (AndExec) {
2857 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2858 // analyzed what generates the vcc value, so we do not know whether vcc
2859 // bits for disabled lanes are 0. Thus we need to mask out bits for
2860 // disabled lanes.
2861 //
2862 // For the case that we select S_CBRANCH_SCC1 and it gets
2863 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2864 // SIInstrInfo::moveToVALU, which inserts the S_AND.
2865 //
2866 // We could add an analysis of what generates the vcc value here and omit
2867 // the S_AND when it is unnecessary. But it would be better to add a separate
2868 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2869 // catches both cases.
2870 Cond = SDValue(
2871 CurDAG->getMachineNode(
2872 Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
2873 MVT::i1,
2874 CurDAG->getRegister(Subtarget->isWave32() ? AMDGPU::EXEC_LO
2875 : AMDGPU::EXEC,
2876 MVT::i1),
2877 Cond),
2878 0);
2879 }
2880
2881 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2882 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2883 N->getOperand(2), // Basic Block
2884 VCC.getValue(0));
2885}
2886
2887void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2888 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2889 !N->isDivergent()) {
2890 SDValue Src = N->getOperand(0);
2891 if (Src.getValueType() == MVT::f16) {
2892 if (isExtractHiElt(Src, Src)) {
2893 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2894 {Src});
2895 return;
2896 }
2897 }
2898 }
2899
2900 SelectCode(N);
2901}
2902
2903void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2904 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2905 // be copied to an SGPR with readfirstlane.
2906 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2907 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2908
2909 SDValue Chain = N->getOperand(0);
2910 SDValue Ptr = N->getOperand(2);
2911 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2912 MachineMemOperand *MMO = M->getMemOperand();
2913 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2914
2916 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2917 SDValue PtrBase = Ptr.getOperand(0);
2918 SDValue PtrOffset = Ptr.getOperand(1);
2919
2920 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
2921 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2922 N = glueCopyToM0(N, PtrBase);
2923 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2924 }
2925 }
2926
2927 if (!Offset) {
2928 N = glueCopyToM0(N, Ptr);
2929 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2930 }
2931
2932 SDValue Ops[] = {
2933 Offset,
2934 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2935 Chain,
2936 N->getOperand(N->getNumOperands() - 1) // New glue
2937 };
2938
2939 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2940 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2941}
2942
2943// We need to handle this here because tablegen doesn't support matching
2944// instructions with multiple outputs.
2945void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
2946 unsigned Opc;
2947 switch (IntrID) {
2948 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2949 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2950 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2951 break;
2952 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2953 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2954 break;
2955 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2956 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2957 break;
2958 }
2959 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
2960 N->getOperand(5), N->getOperand(0)};
2961
2962 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2963 MachineMemOperand *MMO = M->getMemOperand();
2964 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2965 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2966}
2967
2968static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2969 switch (IntrID) {
2970 case Intrinsic::amdgcn_ds_gws_init:
2971 return AMDGPU::DS_GWS_INIT;
2972 case Intrinsic::amdgcn_ds_gws_barrier:
2973 return AMDGPU::DS_GWS_BARRIER;
2974 case Intrinsic::amdgcn_ds_gws_sema_v:
2975 return AMDGPU::DS_GWS_SEMA_V;
2976 case Intrinsic::amdgcn_ds_gws_sema_br:
2977 return AMDGPU::DS_GWS_SEMA_BR;
2978 case Intrinsic::amdgcn_ds_gws_sema_p:
2979 return AMDGPU::DS_GWS_SEMA_P;
2980 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2981 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2982 default:
2983 llvm_unreachable("not a gws intrinsic");
2984 }
2985}
2986
2987void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2988 if (!Subtarget->hasGWS() ||
2989 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2990 !Subtarget->hasGWSSemaReleaseAll())) {
2991 // Let this error.
2992 SelectCode(N);
2993 return;
2994 }
2995
2996 // Chain, intrinsic ID, vsrc, offset
2997 const bool HasVSrc = N->getNumOperands() == 4;
2998 assert(HasVSrc || N->getNumOperands() == 3);
2999
3000 SDLoc SL(N);
3001 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
3002 int ImmOffset = 0;
3003 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
3004 MachineMemOperand *MMO = M->getMemOperand();
3005
3006 // Don't worry if the offset ends up in a VGPR. Only one lane will have an
3007 // effect, so SIFixSGPRCopies will validly insert readfirstlane.
3008
3009 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
3010 // offset field) % 64. Some versions of the programming guide omit the m0
3011 // part, or claim it's from offset 0.
3012 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
3013 // If we have a constant offset, try to use the 0 in m0 as the base.
3014 // TODO: Look into changing the default m0 initialization value. If the
3015 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
3016 // the immediate offset.
3017 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
3018 ImmOffset = ConstOffset->getZExtValue();
3019 } else {
3020 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
3021 ImmOffset = BaseOffset.getConstantOperandVal(1);
3022 BaseOffset = BaseOffset.getOperand(0);
3023 }
3024
3025 // Prefer to do the shift in an SGPR since it should be possible to use m0
3026 // as the result directly. If it's already an SGPR, it will be eliminated
3027 // later.
3028 SDNode *SGPROffset
3029 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
3030 BaseOffset);
3031 // Shift to offset in m0
3032 SDNode *M0Base
3033 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3034 SDValue(SGPROffset, 0),
3035 CurDAG->getTargetConstant(16, SL, MVT::i32));
3036 glueCopyToM0(N, SDValue(M0Base, 0));
3037 }
3038
3039 SDValue Chain = N->getOperand(0);
3040 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
3041
3042 const unsigned Opc = gwsIntrinToOpcode(IntrID);
3043 SmallVector<SDValue, 5> Ops;
3044 if (HasVSrc)
3045 Ops.push_back(N->getOperand(2));
3046 Ops.push_back(OffsetField);
3047 Ops.push_back(Chain);
3048
3049 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3050 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3051}
3052
3053void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
3054 if (Subtarget->getLDSBankCount() != 16) {
3055 // This is a single instruction with a pattern.
3056 SelectCode(N);
3057 return;
3058 }
3059
3060 SDLoc DL(N);
3061
3062 // This requires 2 instructions. It is possible to write a pattern to support
3063 // this, but the generated isel emitter doesn't correctly deal with multiple
3064 // output instructions using the same physical register input. The copy to m0
3065 // is incorrectly placed before the second instruction.
3066 //
3067 // TODO: Match source modifiers.
3068 //
3069 // def : Pat <
3070 // (int_amdgcn_interp_p1_f16
3071 // (VOP3Mods f32:$src0, i32:$src0_modifiers),
3072 // (i32 timm:$attrchan), (i32 timm:$attr),
3073 // (i1 timm:$high), M0),
3074 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
3075 // timm:$attrchan, 0,
3076 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
3077 // let Predicates = [has16BankLDS];
3078 // }
3079
3080 // 16 bank LDS
3081 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
3082 N->getOperand(5), SDValue());
3083
3084 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
3085
3086 SDNode *InterpMov =
3087 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
3088 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
3089 N->getOperand(3), // Attr
3090 N->getOperand(2), // Attrchan
3091 ToM0.getValue(1) // In glue
3092 });
3093
3094 SDNode *InterpP1LV =
3095 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
3096 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
3097 N->getOperand(1), // Src0
3098 N->getOperand(3), // Attr
3099 N->getOperand(2), // Attrchan
3100 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
3101 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
3102 N->getOperand(4), // high
3103 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
3104 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
3105 SDValue(InterpMov, 1)
3106 });
3107
3108 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
3109}
3110
3111void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
3112 unsigned IntrID = N->getConstantOperandVal(1);
3113 switch (IntrID) {
3114 case Intrinsic::amdgcn_ds_append:
3115 case Intrinsic::amdgcn_ds_consume: {
3116 if (N->getValueType(0) != MVT::i32)
3117 break;
3118 SelectDSAppendConsume(N, IntrID);
3119 return;
3120 }
3121 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
3122 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
3123 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
3124 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
3125 SelectDSBvhStackIntrinsic(N, IntrID);
3126 return;
3127 case Intrinsic::amdgcn_init_whole_wave:
3128 CurDAG->getMachineFunction()
3129 .getInfo<SIMachineFunctionInfo>()
3130 ->setInitWholeWave();
3131 break;
3132 }
3133
3134 SelectCode(N);
3135}
3136
3137void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
3138 unsigned IntrID = N->getConstantOperandVal(0);
3139 unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
3140 SDNode *ConvGlueNode = N->getGluedNode();
3141 if (ConvGlueNode) {
3142 // FIXME: Possibly iterate over multiple glue nodes?
3143 assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
3144 ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
3145 ConvGlueNode =
3146 CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
3147 MVT::Glue, SDValue(ConvGlueNode, 0));
3148 } else {
3149 ConvGlueNode = nullptr;
3150 }
3151 switch (IntrID) {
3152 case Intrinsic::amdgcn_wqm:
3153 Opcode = AMDGPU::WQM;
3154 break;
3155 case Intrinsic::amdgcn_softwqm:
3156 Opcode = AMDGPU::SOFT_WQM;
3157 break;
3158 case Intrinsic::amdgcn_wwm:
3159 case Intrinsic::amdgcn_strict_wwm:
3160 Opcode = AMDGPU::STRICT_WWM;
3161 break;
3162 case Intrinsic::amdgcn_strict_wqm:
3163 Opcode = AMDGPU::STRICT_WQM;
3164 break;
3165 case Intrinsic::amdgcn_interp_p1_f16:
3166 SelectInterpP1F16(N);
3167 return;
3168 case Intrinsic::amdgcn_permlane16_swap:
3169 case Intrinsic::amdgcn_permlane32_swap: {
3170 if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
3171 !Subtarget->hasPermlane16Swap()) ||
3172 (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3173 !Subtarget->hasPermlane32Swap())) {
3174 SelectCode(N); // Hit the default error
3175 return;
3176 }
3177
3178 Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3179 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3180 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3181
3182 SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end());
3183 if (ConvGlueNode)
3184 NewOps.push_back(SDValue(ConvGlueNode, 0));
3185
3186 bool FI = N->getConstantOperandVal(3);
3187 NewOps[2] = CurDAG->getTargetConstant(
3188 FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, SDLoc(), MVT::i32);
3189
3190 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), NewOps);
3191 return;
3192 }
3193 default:
3194 SelectCode(N);
3195 break;
3196 }
3197
3198 if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
3199 SDValue Src = N->getOperand(1);
3200 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
3201 }
3202
3203 if (ConvGlueNode) {
3204 SmallVector<SDValue, 4> NewOps(N->ops());
3205 NewOps.push_back(SDValue(ConvGlueNode, 0));
3206 CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
3207 }
3208}
3209
3210void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
3211 unsigned IntrID = N->getConstantOperandVal(1);
3212 switch (IntrID) {
3213 case Intrinsic::amdgcn_ds_gws_init:
3214 case Intrinsic::amdgcn_ds_gws_barrier:
3215 case Intrinsic::amdgcn_ds_gws_sema_v:
3216 case Intrinsic::amdgcn_ds_gws_sema_br:
3217 case Intrinsic::amdgcn_ds_gws_sema_p:
3218 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3219 SelectDS_GWS(N, IntrID);
3220 return;
3221 default:
3222 break;
3223 }
3224
3225 SelectCode(N);
3226}
3227
3228void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
3229 SDValue Log2WaveSize =
3230 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
3231 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
3232 {N->getOperand(0), Log2WaveSize});
3233}
3234
3235void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
3236 SDValue SrcVal = N->getOperand(1);
3237 if (SrcVal.getValueType() != MVT::i32) {
3238 SelectCode(N); // Emit default error
3239 return;
3240 }
3241
3242 SDValue CopyVal;
3243 Register SP = TLI->getStackPointerRegisterToSaveRestore();
3244 SDLoc SL(N);
3245
3246 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
3247 CopyVal = SrcVal.getOperand(0);
3248 } else {
3249 SDValue Log2WaveSize = CurDAG->getTargetConstant(
3250 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
3251
3252 if (N->isDivergent()) {
3253 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
3254 MVT::i32, SrcVal),
3255 0);
3256 }
3257
3258 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3259 {SrcVal, Log2WaveSize}),
3260 0);
3261 }
3262
3263 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
3264 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
3265}
3266
3267bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
3268 unsigned &Mods,
3269 bool IsCanonicalizing,
3270 bool AllowAbs) const {
3271 Mods = SISrcMods::NONE;
3272 Src = In;
3273
3274 if (Src.getOpcode() == ISD::FNEG) {
3275 Mods |= SISrcMods::NEG;
3276 Src = Src.getOperand(0);
3277 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
3278 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3279 // denormal mode, but we're implicitly canonicalizing in a source operand.
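 // For example, (fsub -0.0, x) selects x directly with the NEG source
 // modifier set, rather than emitting a separate subtract.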
3280 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
3281 if (LHS && LHS->isZero()) {
3282 Mods |= SISrcMods::NEG;
3283 Src = Src.getOperand(1);
3284 }
3285 }
3286
3287 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
3288 Mods |= SISrcMods::ABS;
3289 Src = Src.getOperand(0);
3290 }
3291
3292 if (Mods != SISrcMods::NONE)
3293 return true;
3294
3295 // Convert various sign-bit masks on integers to src mods. Currently disabled
3296 // for 16-bit types as the codegen replaces the operand without adding a
3297 // srcmod. This is intentionally finding the cases where we are performing
3298// float neg and abs on int types; the goal is not to obtain two's complement
3299// neg or abs. Limit conversion to select operands via the non-canonicalizing
3300// pattern.
3301 // TODO: Add 16-bit support.
3302 if (IsCanonicalizing)
3303 return true;
3304
3305 unsigned Opc = Src->getOpcode();
3306 EVT VT = Src.getValueType();
3307 if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
3308 (VT != MVT::i32 && VT != MVT::i64))
3309 return true;
3310
3311 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Src->getOperand(1));
3312 if (!CRHS)
3313 return true;
3314
3315 // Recognise (xor a, 0x80000000) as NEG SrcMod.
3316 // Recognise (and a, 0x7fffffff) as ABS SrcMod.
3317 // Recognise (or a, 0x80000000) as NEG+ABS SrcModifiers.
3318 if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
3319 Mods |= SISrcMods::NEG;
3320 Src = Src.getOperand(0);
3321 } else if (Opc == ISD::AND && AllowAbs &&
3322 CRHS->getAPIntValue().isMaxSignedValue()) {
3323 Mods |= SISrcMods::ABS;
3324 Src = Src.getOperand(0);
3325 } else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) {
3326 Mods |= SISrcMods::ABS | SISrcMods::NEG;
3327 Src = Src.getOperand(0);
3328 }
3329
3330 return true;
3331}
3332
3333bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
3334 SDValue &SrcMods) const {
3335 unsigned Mods;
3336 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
3337 /*AllowAbs=*/true)) {
3338 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3339 return true;
3340 }
3341
3342 return false;
3343}
3344
3345bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
3346 SDValue In, SDValue &Src, SDValue &SrcMods) const {
3347 unsigned Mods;
3348 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
3349 /*AllowAbs=*/true)) {
3350 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3351 return true;
3352 }
3353
3354 return false;
3355}
3356
3357bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
3358 SDValue &SrcMods) const {
3359 unsigned Mods;
3360 if (SelectVOP3ModsImpl(In, Src, Mods,
3361 /*IsCanonicalizing=*/true,
3362 /*AllowAbs=*/false)) {
3363 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3364 return true;
3365 }
3366
3367 return false;
3368}
3369
3370bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
3371 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
3372 return false;
3373
3374 Src = In;
3375 return true;
3376}
3377
3378bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
3379 SDValue &SrcMods,
3380 bool OpSel) const {
3381 unsigned Mods;
3382 if (SelectVOP3ModsImpl(In, Src, Mods,
3383 /*IsCanonicalizing=*/true,
3384 /*AllowAbs=*/false)) {
3385 if (OpSel)
3386 Mods |= SISrcMods::OP_SEL_0;
3387 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3388 return true;
3389 }
3390
3391 return false;
3392}
3393
3394bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
3395 SDValue &SrcMods) const {
3396 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
3397}
3398
3399bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
3400 SDValue &SrcMods) const {
3401 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
3402}
3403
3404bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
3405 SDValue &SrcMods, SDValue &Clamp,
3406 SDValue &Omod) const {
3407 SDLoc DL(In);
3408 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3409 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3410
3411 return SelectVOP3Mods(In, Src, SrcMods);
3412}
3413
3414bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
3415 SDValue &SrcMods, SDValue &Clamp,
3416 SDValue &Omod) const {
3417 SDLoc DL(In);
3418 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3419 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3420
3421 return SelectVOP3BMods(In, Src, SrcMods);
3422}
3423
3424bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3425 SDValue &Clamp, SDValue &Omod) const {
3426 Src = In;
3427
3428 SDLoc DL(In);
3429 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3430 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3431
3432 return true;
3433}
3434
3435bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
3436 SDValue &SrcMods, bool IsDOT) const {
3437 unsigned Mods = SISrcMods::NONE;
3438 Src = In;
3439
3440 // TODO: Handle G_FSUB 0 as fneg
3441 if (Src.getOpcode() == ISD::FNEG) {
 3442     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
 3443     Src = Src.getOperand(0);
3444 }
3445
3446 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
3447 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
3448 unsigned VecMods = Mods;
3449
3450 SDValue Lo = stripBitcast(Src.getOperand(0));
3451 SDValue Hi = stripBitcast(Src.getOperand(1));
3452
3453 if (Lo.getOpcode() == ISD::FNEG) {
3454 Lo = stripBitcast(Lo.getOperand(0));
3455 Mods ^= SISrcMods::NEG;
3456 }
3457
3458 if (Hi.getOpcode() == ISD::FNEG) {
3459 Hi = stripBitcast(Hi.getOperand(0));
3460 Mods ^= SISrcMods::NEG_HI;
3461 }
3462
3463 if (isExtractHiElt(Lo, Lo))
3464 Mods |= SISrcMods::OP_SEL_0;
3465
3466 if (isExtractHiElt(Hi, Hi))
3467 Mods |= SISrcMods::OP_SEL_1;
3468
3469 unsigned VecSize = Src.getValueSizeInBits();
3470 Lo = stripExtractLoElt(Lo);
3471 Hi = stripExtractLoElt(Hi);
3472
3473 if (Lo.getValueSizeInBits() > VecSize) {
3474 Lo = CurDAG->getTargetExtractSubreg(
3475 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3476 MVT::getIntegerVT(VecSize), Lo);
3477 }
3478
3479 if (Hi.getValueSizeInBits() > VecSize) {
3480 Hi = CurDAG->getTargetExtractSubreg(
3481 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3482 MVT::getIntegerVT(VecSize), Hi);
3483 }
3484
3485 assert(Lo.getValueSizeInBits() <= VecSize &&
3486 Hi.getValueSizeInBits() <= VecSize);
3487
3488 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3489 // Really a scalar input. Just select from the low half of the register to
3490 // avoid packing.
3491
3492 if (VecSize == Lo.getValueSizeInBits()) {
3493 Src = Lo;
3494 } else if (VecSize == 32) {
3495 Src = createVOP3PSrc32FromLo16(Lo, Src, CurDAG, Subtarget);
3496 } else {
3497 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
3498
3499 SDLoc SL(In);
 3500         SDValue Undef = SDValue(
 3501             CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3502 Lo.getValueType()), 0);
3503 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3504 : AMDGPU::SReg_64RegClassID;
3505 const SDValue Ops[] = {
3506 CurDAG->getTargetConstant(RC, SL, MVT::i32),
3507 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3508 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3509
3510 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3511 Src.getValueType(), Ops), 0);
3512 }
3513 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3514 return true;
3515 }
3516
3517 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3518 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3519 .bitcastToAPInt().getZExtValue();
3520 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3521 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3522 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3523 return true;
3524 }
3525 }
3526
3527 Mods = VecMods;
3528 } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
3529 Src.getNumOperands() == 2) {
3530
3531 // TODO: We should repeat the build_vector source check above for the
3532 // vector_shuffle for negates and casts of individual elements.
3533
3534 auto *SVN = cast<ShuffleVectorSDNode>(Src);
3535 ArrayRef<int> Mask = SVN->getMask();
3536
3537 if (Mask[0] < 2 && Mask[1] < 2) {
3538 // src1 should be undef.
3539 SDValue ShuffleSrc = SVN->getOperand(0);
3540
3541 if (ShuffleSrc.getOpcode() == ISD::FNEG) {
3542 ShuffleSrc = ShuffleSrc.getOperand(0);
 3543         Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
 3544       }
3545
3546 if (Mask[0] == 1)
3547 Mods |= SISrcMods::OP_SEL_0;
3548 if (Mask[1] == 1)
3549 Mods |= SISrcMods::OP_SEL_1;
3550
3551 Src = ShuffleSrc;
3552 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3553 return true;
3554 }
3555 }
3556
3557 // Packed instructions do not have abs modifiers.
3558 Mods |= SISrcMods::OP_SEL_1;
3559
3560 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3561 return true;
3562}
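
// A rough, standalone model (not from this file) of how the OP_SEL_0 and
// OP_SEL_1 bits set above are consumed: for a packed (e.g. v2f16) source,
// op_sel picks which 16-bit half feeds lane 0 and op_sel_hi which half feeds
// lane 1. The enum values below are invented for the sketch; the real
// encodings and exact semantics live in SIDefines.h and the ISA documentation.

#include <cassert>
#include <cstdint>

enum : unsigned { SKETCH_OP_SEL_0 = 1u << 2, SKETCH_OP_SEL_1 = 1u << 3 };

static uint32_t readPackedSrc(uint32_t Src, unsigned Mods) {
  uint16_t Lane0 = (Mods & SKETCH_OP_SEL_0) ? uint16_t(Src >> 16) : uint16_t(Src);
  uint16_t Lane1 = (Mods & SKETCH_OP_SEL_1) ? uint16_t(Src >> 16) : uint16_t(Src);
  return uint32_t(Lane1) << 16 | Lane0;
}

int main() {
  uint32_t Src = 0xBBBBAAAAu; // high half 0xBBBB, low half 0xAAAA
  // The default chosen at the end of SelectVOP3PMods (OP_SEL_1 only): lane 0
  // reads the low half, lane 1 the high half, i.e. the source is unchanged.
  assert(readPackedSrc(Src, SKETCH_OP_SEL_1) == 0xBBBBAAAAu);
  // Neither bit set: both lanes read the low half (scalar broadcast case).
  assert(readPackedSrc(Src, 0) == 0xAAAAAAAAu);
}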
3563
3564bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3565 SDValue &SrcMods) const {
3566 return SelectVOP3PMods(In, Src, SrcMods, true);
3567}
3568
3569bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3570 SDValue &Src) const {
3571 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3572 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3573
3574 unsigned Mods = SISrcMods::OP_SEL_1;
3575 unsigned SrcVal = C->getZExtValue();
3576 if (SrcVal == 1)
3577 Mods |= SISrcMods::OP_SEL_0;
3578
3579 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3580 return true;
3581}
3582
 3583 static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
 3584                                          llvm::SelectionDAG *CurDAG,
3585 const SDLoc &DL) {
3586 unsigned DstRegClass;
3587 EVT DstTy;
3588 switch (Elts.size()) {
3589 case 8:
3590 DstRegClass = AMDGPU::VReg_256RegClassID;
3591 DstTy = MVT::v8i32;
3592 break;
3593 case 4:
3594 DstRegClass = AMDGPU::VReg_128RegClassID;
3595 DstTy = MVT::v4i32;
3596 break;
3597 case 2:
3598 DstRegClass = AMDGPU::VReg_64RegClassID;
3599 DstTy = MVT::v2i32;
3600 break;
3601 default:
3602 llvm_unreachable("unhandled Reg sequence size");
3603 }
3604
 3605   SmallVector<SDValue, 8 + 1> Ops;
 3606   Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3607 for (unsigned i = 0; i < Elts.size(); ++i) {
3608 Ops.push_back(Elts[i]);
3609 Ops.push_back(CurDAG->getTargetConstant(
 3610         SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
 3611   }
3612 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3613}
3614
 3615 static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
 3616                                          llvm::SelectionDAG *CurDAG,
3617 const SDLoc &DL) {
3618 SmallVector<SDValue, 8> PackedElts;
3619 assert("unhandled Reg sequence size" &&
3620 (Elts.size() == 8 || Elts.size() == 16));
3621
 3622   // Pack 16-bit elements in pairs into a 32-bit register. If both elements were
 3623   // unpacked from the same 32-bit source, use it; otherwise pack them using v_perm.
3624 for (unsigned i = 0; i < Elts.size(); i += 2) {
3625 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3626 SDValue HiSrc;
3627 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3628 PackedElts.push_back(HiSrc);
3629 } else {
3630 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3631 MachineSDNode *Packed =
3632 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3633 {Elts[i + 1], Elts[i], PackLoLo});
3634 PackedElts.push_back(SDValue(Packed, 0));
3635 }
3636 }
3637
3638 return buildRegSequence32(PackedElts, CurDAG, DL);
3639}
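
// A simplified, standalone model (not from this file) of the V_PERM_B32 byte
// select used above, assuming the usual convention that selector byte values
// 0-3 pick bytes from src1 and 4-7 from src0 (the special selector values
// >= 8 are ignored here). With selector 0x05040100 and operands
// (src0 = Elts[i+1], src1 = Elts[i]) the result packs the two low halves.

#include <cassert>
#include <cstdint>

static uint32_t permB32(uint32_t Src0, uint32_t Src1, uint32_t Sel) {
  uint64_t Combined = (uint64_t(Src0) << 32) | Src1;
  uint32_t Result = 0;
  for (unsigned Byte = 0; Byte < 4; ++Byte) {
    unsigned Idx = (Sel >> (8 * Byte)) & 0xff;
    Idx &= 7; // real hardware treats 8..15 specially; clamp for this toy model
    Result |= uint32_t(uint8_t(Combined >> (8 * Idx))) << (8 * Byte);
  }
  return Result;
}

int main() {
  uint32_t EltI = 0x1111AAAAu, EltIPlus1 = 0x2222BBBBu;
  // lo16(Elts[i]) ends up in the low half, lo16(Elts[i+1]) in the high half.
  assert(permB32(/*Src0=*/EltIPlus1, /*Src1=*/EltI, 0x05040100u) == 0xBBBBAAAAu);
}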
3640
 3641 static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
 3642                                        llvm::SelectionDAG *CurDAG,
3643 const SDLoc &DL, unsigned ElementSize) {
3644 if (ElementSize == 16)
3645 return buildRegSequence16(Elts, CurDAG, DL);
3646 if (ElementSize == 32)
3647 return buildRegSequence32(Elts, CurDAG, DL);
3648 llvm_unreachable("Unhandled element size");
3649}
3650
3651static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
 3652                                  SmallVectorImpl<SDValue> &Elts, SDValue &Src,
 3653                                  llvm::SelectionDAG *CurDAG, const SDLoc &DL,
3654 unsigned ElementSize) {
3655 if (ModOpcode == ISD::FNEG) {
3656 Mods |= SISrcMods::NEG;
3657 // Check if all elements also have abs modifier
3658 SmallVector<SDValue, 8> NegAbsElts;
3659 for (auto El : Elts) {
3660 if (El.getOpcode() != ISD::FABS)
3661 break;
3662 NegAbsElts.push_back(El->getOperand(0));
3663 }
3664 if (Elts.size() != NegAbsElts.size()) {
3665 // Neg
3666 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3667 } else {
3668 // Neg and Abs
3669 Mods |= SISrcMods::NEG_HI;
3670 Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
3671 }
3672 } else {
3673 assert(ModOpcode == ISD::FABS);
3674 // Abs
3675 Mods |= SISrcMods::NEG_HI;
3676 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3677 }
3678}
3679
3680// Check all f16 elements for modifiers while looking through b32 and v2b16
3681// build vector, stop if element does not satisfy ModifierCheck.
3682static void
 3683 checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
 3684                               std::function<bool(SDValue)> ModifierCheck) {
3685 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3686 if (auto *F16Pair =
3687 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3688 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3689 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3690 if (!ModifierCheck(ElF16))
3691 break;
3692 }
3693 }
3694 }
3695}
3696
3697bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3698 SDValue &SrcMods) const {
3699 Src = In;
3700 unsigned Mods = SISrcMods::OP_SEL_1;
3701
3702 // mods are on f16 elements
3703 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
 3704     SmallVector<SDValue, 8> EltsF16;
 3705
3706 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3707 if (Element.getOpcode() != ISD::FNEG)
3708 return false;
3709 EltsF16.push_back(Element.getOperand(0));
3710 return true;
3711 });
3712
3713 // All elements have neg modifier
3714 if (BV->getNumOperands() * 2 == EltsF16.size()) {
3715 Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
3716 Mods |= SISrcMods::NEG;
3717 Mods |= SISrcMods::NEG_HI;
3718 }
3719 }
3720
3721 // mods are on v2f16 elements
3722 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3723 SmallVector<SDValue, 8> EltsV2F16;
3724 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3725 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3726 // Based on first element decide which mod we match, neg or abs
3727 if (ElV2f16.getOpcode() != ISD::FNEG)
3728 break;
3729 EltsV2F16.push_back(ElV2f16.getOperand(0));
3730 }
3731
3732 // All pairs of elements have neg modifier
3733 if (BV->getNumOperands() == EltsV2F16.size()) {
3734 Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
3735 Mods |= SISrcMods::NEG;
3736 Mods |= SISrcMods::NEG_HI;
3737 }
3738 }
3739
3740 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3741 return true;
3742}
3743
3744bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3745 SDValue &SrcMods) const {
3746 Src = In;
3747 unsigned Mods = SISrcMods::OP_SEL_1;
3748 unsigned ModOpcode;
3749
3750 // mods are on f16 elements
3751 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
 3752     SmallVector<SDValue, 8> EltsF16;
 3753     checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3754 // Based on first element decide which mod we match, neg or abs
3755 if (EltsF16.empty())
3756 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3757 if (ElF16.getOpcode() != ModOpcode)
3758 return false;
3759 EltsF16.push_back(ElF16.getOperand(0));
3760 return true;
3761 });
3762
3763 // All elements have ModOpcode modifier
3764 if (BV->getNumOperands() * 2 == EltsF16.size())
3765 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
3766 16);
3767 }
3768
3769 // mods are on v2f16 elements
3770 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3771 SmallVector<SDValue, 8> EltsV2F16;
3772
3773 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3774 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3775 // Based on first element decide which mod we match, neg or abs
3776 if (EltsV2F16.empty())
3777 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3778 if (ElV2f16->getOpcode() != ModOpcode)
3779 break;
3780 EltsV2F16.push_back(ElV2f16->getOperand(0));
3781 }
3782
3783 // All elements have ModOpcode modifier
3784 if (BV->getNumOperands() == EltsV2F16.size())
3785 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
3786 32);
3787 }
3788
3789 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3790 return true;
3791}
3792
3793bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3794 SDValue &SrcMods) const {
3795 Src = In;
3796 unsigned Mods = SISrcMods::OP_SEL_1;
 3797   SmallVector<SDValue, 8> EltsF32;
 3798
3799 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3800 assert(BV->getNumOperands() > 0);
3801 // Based on first element decide which mod we match, neg or abs
3802 SDValue ElF32 = stripBitcast(BV->getOperand(0));
3803 unsigned ModOpcode =
3804 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3805 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3806 SDValue ElF32 = stripBitcast(BV->getOperand(i));
3807 if (ElF32.getOpcode() != ModOpcode)
3808 break;
3809 EltsF32.push_back(ElF32.getOperand(0));
3810 }
3811
3812 // All elements had ModOpcode modifier
3813 if (BV->getNumOperands() == EltsF32.size())
3814 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
3815 32);
3816 }
3817
3818 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3819 return true;
3820}
3821
3822bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
3823 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
3824 BitVector UndefElements;
3825 if (SDValue Splat = BV->getSplatValue(&UndefElements))
3826 if (isInlineImmediate(Splat.getNode())) {
3827 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
3828 unsigned Imm = C->getAPIntValue().getSExtValue();
3829 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3830 return true;
3831 }
3832 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
3833 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
3834 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3835 return true;
3836 }
3837 llvm_unreachable("unhandled Constant node");
3838 }
3839 }
3840
3841 // 16 bit splat
3842 SDValue SplatSrc32 = stripBitcast(In);
3843 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
3844 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
3845 SDValue SplatSrc16 = stripBitcast(Splat32);
3846 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
3847 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
3848 const SIInstrInfo *TII = Subtarget->getInstrInfo();
3849 std::optional<APInt> RawValue;
3850 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
3851 RawValue = C->getValueAPF().bitcastToAPInt();
3852 else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
3853 RawValue = C->getAPIntValue();
3854
3855 if (RawValue.has_value()) {
3856 EVT VT = In.getValueType().getScalarType();
3857 if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
3858 APFloat FloatVal(VT.getSimpleVT() == MVT::f16
 3859                                  ? APFloat::IEEEhalf()
 3860                                  : APFloat::BFloat(),
 3861                              RawValue.value());
3862 if (TII->isInlineConstant(FloatVal)) {
3863 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3864 MVT::i16);
3865 return true;
3866 }
3867 } else if (VT.getSimpleVT() == MVT::i16) {
3868 if (TII->isInlineConstant(RawValue.value())) {
3869 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3870 MVT::i16);
3871 return true;
3872 }
3873 } else
3874 llvm_unreachable("unknown 16-bit type");
3875 }
3876 }
3877 }
3878
3879 return false;
3880}
3881
3882bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
3883 SDValue &IndexKey) const {
3884 unsigned Key = 0;
3885 Src = In;
3886
3887 if (In.getOpcode() == ISD::SRL) {
3888 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3889 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3890 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3891 ShiftAmt->getZExtValue() % 8 == 0) {
3892 Key = ShiftAmt->getZExtValue() / 8;
3893 Src = ShiftSrc;
3894 }
3895 }
3896
3897 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3898 return true;
3899}
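
// A tiny standalone illustration (not from this file) of the index_key
// convention matched above: (srl x, 8*k) selects byte k of a 32-bit index
// register, so the shift source becomes Src and k becomes the key.

#include <cassert>
#include <cstdint>

static uint8_t indexByte(uint32_t X, unsigned Key) {
  assert(Key < 4 && "index_key selects one of four bytes");
  return uint8_t(X >> (8 * Key));
}

int main() {
  uint32_t X = 0x44332211u;
  assert(indexByte(X, 0) == 0x11); // no shift       -> Key = 0
  assert(indexByte(X, 2) == 0x33); // (srl x, 16)    -> Key = 2
}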
3900
3901bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
3902 SDValue &IndexKey) const {
3903 unsigned Key = 0;
3904 Src = In;
3905
3906 if (In.getOpcode() == ISD::SRL) {
3907 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3908 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3909 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3910 ShiftAmt->getZExtValue() == 16) {
3911 Key = 1;
3912 Src = ShiftSrc;
3913 }
3914 }
3915
3916 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3917 return true;
3918}
3919
3920bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src,
3921 SDValue &IndexKey) const {
3922 unsigned Key = 0;
3923 Src = In;
3924
3925 SDValue InI32;
3926
3927 if (In.getOpcode() == ISD::ANY_EXTEND || In.getOpcode() == ISD::ZERO_EXTEND) {
3928 const SDValue &ExtendSrc = In.getOperand(0);
3929 if (ExtendSrc.getValueSizeInBits() == 32)
3930 InI32 = ExtendSrc;
3931 } else if (In->getOpcode() == ISD::BITCAST) {
3932 const SDValue &CastSrc = In.getOperand(0);
3933 if (CastSrc.getOpcode() == ISD::BUILD_VECTOR &&
3934 CastSrc.getOperand(0).getValueSizeInBits() == 32) {
3935 ConstantSDNode *Zero = dyn_cast<ConstantSDNode>(CastSrc.getOperand(1));
3936 if (Zero && Zero->getZExtValue() == 0)
3937 InI32 = CastSrc.getOperand(0);
3938 }
3939 }
3940
3941 if (InI32 && InI32.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
3942 const SDValue &ExtractVecEltSrc = InI32.getOperand(0);
3943 ConstantSDNode *EltIdx = dyn_cast<ConstantSDNode>(InI32.getOperand(1));
3944 if (ExtractVecEltSrc.getValueSizeInBits() == 64 && EltIdx &&
3945 EltIdx->getZExtValue() == 1) {
3946 Key = 1;
3947 Src = ExtractVecEltSrc;
3948 }
3949 }
3950
3951 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3952 return true;
3953}
3954
3955bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
3956 SDValue &SrcMods) const {
3957 Src = In;
3958 // FIXME: Handle op_sel
3959 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
3960 return true;
3961}
3962
3963bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
3964 SDValue &SrcMods) const {
3965 // FIXME: Handle op_sel
3966 return SelectVOP3Mods(In, Src, SrcMods);
3967}
3968
3969// Match lowered fpext from bf16 to f32. This is a bit operation extending
 3970 // a 16-bit value with 16 bits of zeroes at the LSB:
3971//
3972// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
3973// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
3974// 3. (f32 (bitcast (shl i32:va, 16) -> IsExtractHigh = false
3975static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
3976 if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
3977 return SDValue();
3978 Op = Op.getOperand(0);
3979
3980 IsExtractHigh = false;
3981 if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) {
3982 auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0));
3983 if (!Low16 || !Low16->isZero())
3984 return SDValue();
3985 Op = stripBitcast(Op.getOperand(1));
3986 if (Op.getValueType() != MVT::bf16)
3987 return SDValue();
3988 return Op;
3989 }
3990
3991 if (Op.getValueType() != MVT::i32)
3992 return SDValue();
3993
3994 if (Op.getOpcode() == ISD::AND) {
3995 if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3996 if (Mask->getZExtValue() == 0xffff0000) {
3997 IsExtractHigh = true;
3998 return Op.getOperand(0);
3999 }
4000 }
4001 return SDValue();
4002 }
4003
4004 if (Op.getOpcode() == ISD::SHL) {
4005 if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4006 if (Amt->getZExtValue() == 16)
4007 return Op.getOperand(0);
4008 }
4009 }
4010
4011 return SDValue();
4012}
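
// A standalone sketch (not from this file) of the bit-level fact the matcher
// above relies on: extending bf16 to f32 just places the 16 bf16 bits in the
// high half of the f32 pattern with zeros below, which is why the three
// bitcast forms in the comment are equivalent to an fpext.

#include <cassert>
#include <cstdint>

static uint32_t bf16ToF32Bits(uint16_t Bf16Bits) {
  return uint32_t(Bf16Bits) << 16;
}

int main() {
  // 1.0 as bf16 is 0x3f80; 1.0f is 0x3f800000.
  assert(bf16ToF32Bits(0x3f80) == 0x3f800000u);
  // Pattern 2: masking an i32 with 0xffff0000 keeps only the bf16-shaped part
  // (the high half), matching the IsExtractHigh = true case.
  assert((0x3f80abcdu & 0xffff0000u) == bf16ToF32Bits(0x3f80));
}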
4013
4014// The return value is not whether the match is possible (which it always is),
 4015 // but whether or not a conversion is really used.
4016bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
4017 unsigned &Mods,
4018 MVT VT) const {
4019 Mods = 0;
4020 SelectVOP3ModsImpl(In, Src, Mods);
4021
4022 bool IsExtractHigh = false;
4023 if (Src.getOpcode() == ISD::FP_EXTEND) {
4024 Src = Src.getOperand(0);
4025 } else if (VT == MVT::bf16) {
4026 SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh);
4027 if (!B16)
4028 return false;
4029 Src = B16;
4030 } else
4031 return false;
4032
4033 if (Src.getValueType() != VT &&
4034 (VT != MVT::bf16 || Src.getValueType() != MVT::i32))
4035 return false;
4036
4037 Src = stripBitcast(Src);
4038
4039 // Be careful about folding modifiers if we already have an abs. fneg is
4040 // applied last, so we don't want to apply an earlier fneg.
4041 if ((Mods & SISrcMods::ABS) == 0) {
4042 unsigned ModsTmp;
4043 SelectVOP3ModsImpl(Src, Src, ModsTmp);
4044
4045 if ((ModsTmp & SISrcMods::NEG) != 0)
4046 Mods ^= SISrcMods::NEG;
4047
4048 if ((ModsTmp & SISrcMods::ABS) != 0)
4049 Mods |= SISrcMods::ABS;
4050 }
4051
4052 // op_sel/op_sel_hi decide the source type and source.
4053 // If the source's op_sel_hi is set, it indicates to do a conversion from
 4054   // fp16. If the source's op_sel is set, it picks the high half of the source
4055 // register.
4056
4057 Mods |= SISrcMods::OP_SEL_1;
4058 if (IsExtractHigh ||
4059 (Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) {
4060 Mods |= SISrcMods::OP_SEL_0;
4061
4062 // TODO: Should we try to look for neg/abs here?
4063 }
4064
4065 // Prevent unnecessary subreg COPY to VGPR_16
4066 if (Src.getOpcode() == ISD::TRUNCATE &&
4067 Src.getOperand(0).getValueType() == MVT::i32) {
4068 Src = Src.getOperand(0);
4069 }
4070 return true;
4071}
4072
4073bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
4074 SDValue &SrcMods) const {
4075 unsigned Mods = 0;
4076 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16))
4077 return false;
4078 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4079 return true;
4080}
4081
4082bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
4083 SDValue &SrcMods) const {
4084 unsigned Mods = 0;
4085 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16);
4086 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4087 return true;
4088}
4089
4090bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
4091 SDValue &SrcMods) const {
4092 unsigned Mods = 0;
4093 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16))
4094 return false;
4095 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4096 return true;
4097}
4098
4099bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
4100 SDValue &SrcMods) const {
4101 unsigned Mods = 0;
4102 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16);
4103 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4104 return true;
4105}
4106
4107// Match BITOP3 operation and return a number of matched instructions plus
4108// truth table.
4109static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
 4110                                               SmallVectorImpl<SDValue> &Src) {
 4111   unsigned NumOpcodes = 0;
4112 uint8_t LHSBits, RHSBits;
4113
4114 auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
4115 // Define truth table given Src0, Src1, Src2 bits permutations:
4116 // 0 0 0
4117 // 0 0 1
4118 // 0 1 0
4119 // 0 1 1
4120 // 1 0 0
4121 // 1 0 1
4122 // 1 1 0
4123 // 1 1 1
4124 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
4125
4126 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
4127 if (C->isAllOnes()) {
4128 Bits = 0xff;
4129 return true;
4130 }
4131 if (C->isZero()) {
4132 Bits = 0;
4133 return true;
4134 }
4135 }
4136
4137 for (unsigned I = 0; I < Src.size(); ++I) {
4138 // Try to find existing reused operand
4139 if (Src[I] == Op) {
4140 Bits = SrcBits[I];
4141 return true;
4142 }
4143 // Try to replace parent operator
4144 if (Src[I] == In) {
4145 Bits = SrcBits[I];
4146 Src[I] = Op;
4147 return true;
4148 }
4149 }
4150
4151 if (Src.size() == 3) {
 4152       // No room left for operands. Try one last time: there can be a 'not' of
 4153       // one of our source operands. In this case we can compute the bits
 4154       // without growing the Src vector.
4155 if (Op.getOpcode() == ISD::XOR) {
4156 if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4157 if (C->isAllOnes()) {
4158 SDValue LHS = Op.getOperand(0);
4159 for (unsigned I = 0; I < Src.size(); ++I) {
4160 if (Src[I] == LHS) {
4161 Bits = ~SrcBits[I];
4162 return true;
4163 }
4164 }
4165 }
4166 }
4167 }
4168
4169 return false;
4170 }
4171
4172 Bits = SrcBits[Src.size()];
4173 Src.push_back(Op);
4174 return true;
4175 };
4176
4177 switch (In.getOpcode()) {
4178 case ISD::AND:
4179 case ISD::OR:
4180 case ISD::XOR: {
4181 SDValue LHS = In.getOperand(0);
4182 SDValue RHS = In.getOperand(1);
4183
4184 SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
4185 if (!getOperandBits(LHS, LHSBits) ||
4186 !getOperandBits(RHS, RHSBits)) {
4187 Src = Backup;
4188 return std::make_pair(0, 0);
4189 }
4190
4191 // Recursion is naturally limited by the size of the operand vector.
4192 auto Op = BitOp3_Op(LHS, Src);
4193 if (Op.first) {
4194 NumOpcodes += Op.first;
4195 LHSBits = Op.second;
4196 }
4197
4198 Op = BitOp3_Op(RHS, Src);
4199 if (Op.first) {
4200 NumOpcodes += Op.first;
4201 RHSBits = Op.second;
4202 }
4203 break;
4204 }
4205 default:
4206 return std::make_pair(0, 0);
4207 }
4208
4209 uint8_t TTbl;
4210 switch (In.getOpcode()) {
4211 case ISD::AND:
4212 TTbl = LHSBits & RHSBits;
4213 break;
4214 case ISD::OR:
4215 TTbl = LHSBits | RHSBits;
4216 break;
4217 case ISD::XOR:
4218 TTbl = LHSBits ^ RHSBits;
4219 break;
4220 default:
4221 break;
4222 }
4223
4224 return std::make_pair(NumOpcodes + 1, TTbl);
4225}
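
// A worked, standalone example (not from this file) of the truth-table
// encoding computed by BitOp3_Op, using the convention from the comment
// above: src0 -> 0xf0, src1 -> 0xcc, src2 -> 0xaa. Each of the 8 bit
// positions of the table corresponds to one (src0, src1, src2) combination,
// so applying the expression to these masks yields the table directly.

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t Src0 = 0xf0, Src1 = 0xcc, Src2 = 0xaa;
  // (src0 & src1) | ~src2: three operands, two logic ops -> one BITOP3.
  assert(uint8_t((Src0 & Src1) | uint8_t(~Src2)) == 0xd5);
  // (src0 ^ src1) & src2
  assert(uint8_t((Src0 ^ Src1) & Src2) == 0x28);
}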
4226
4227bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
4228 SDValue &Src2, SDValue &Tbl) const {
 4229   SmallVector<SDValue, 3> Src;
 4230   uint8_t TTbl;
4231 unsigned NumOpcodes;
4232
4233 std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
4234
 4235   // The Src.empty() case can happen if all operands are constants that are all
 4236   // zeros or all ones. Normally this is optimized out before reaching this point.
4237 if (NumOpcodes < 2 || Src.empty())
4238 return false;
4239
 4240   // For a uniform case the threshold should be higher to account for moves between
 4241   // VGPRs and SGPRs. It needs one operand in a VGPR; the other two can be in SGPRs,
 4242   // with a readfirstlane after.
4243 if (NumOpcodes < 4 && !In->isDivergent())
4244 return false;
4245
4246 if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
 4247     // Avoid using BITOP3 for OR3, XOR3, AND_OR. It is not any faster, and the
 4248     // dedicated opcodes make the asm more readable. This cannot be modeled with
 4249     // AddedComplexity because the selector does not know how many operations we matched.
4250 if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
4251 (In.getOperand(0).getOpcode() == In.getOpcode() ||
4252 In.getOperand(1).getOpcode() == In.getOpcode()))
4253 return false;
4254
4255 if (In.getOpcode() == ISD::OR &&
4256 (In.getOperand(0).getOpcode() == ISD::AND ||
4257 In.getOperand(1).getOpcode() == ISD::AND))
4258 return false;
4259 }
4260
4261 // Last operand can be ignored, turning a ternary operation into a binary.
4262 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4263 // 'c' with 'a' here without changing the answer. In some pathological
4264 // cases it should be possible to get an operation with a single operand
 4265   // too if the optimizer does not catch it.
4266 while (Src.size() < 3)
4267 Src.push_back(Src[0]);
4268
4269 Src0 = Src[0];
4270 Src1 = Src[1];
4271 Src2 = Src[2];
4272
4273 Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
4274 return true;
4275}
4276
4277SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
4278 if (In.isUndef())
4279 return CurDAG->getUNDEF(MVT::i32);
4280
4281 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
4282 SDLoc SL(In);
4283 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
4284 }
4285
4286 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
4287 SDLoc SL(In);
4288 return CurDAG->getConstant(
4289 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
4290 }
4291
4292 SDValue Src;
4293 if (isExtractHiElt(In, Src))
4294 return Src;
4295
4296 return SDValue();
4297}
4298
4299bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
4300 assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());
4301
4302 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
4303 const SIInstrInfo *SII = Subtarget->getInstrInfo();
4304
4305 unsigned Limit = 0;
4306 bool AllUsesAcceptSReg = true;
4307 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
4308 Limit < 10 && U != E; ++U, ++Limit) {
4309 const TargetRegisterClass *RC =
4310 getOperandRegClass(U->getUser(), U->getOperandNo());
4311
 4312     // If the register class is unknown, it could be a register class
 4313     // that needs to be an SGPR, e.g. an inline asm
 4314     // constraint.
4315 if (!RC || SIRI->isSGPRClass(RC))
4316 return false;
4317
4318 if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
4319 AllUsesAcceptSReg = false;
4320 SDNode *User = U->getUser();
4321 if (User->isMachineOpcode()) {
4322 unsigned Opc = User->getMachineOpcode();
4323 const MCInstrDesc &Desc = SII->get(Opc);
4324 if (Desc.isCommutable()) {
4325 unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
4326 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
4327 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
4328 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
4329 const TargetRegisterClass *CommutedRC =
4330 getOperandRegClass(U->getUser(), CommutedOpNo);
4331 if (CommutedRC == &AMDGPU::VS_32RegClass ||
4332 CommutedRC == &AMDGPU::VS_64RegClass)
4333 AllUsesAcceptSReg = true;
4334 }
4335 }
4336 }
 4337       // If AllUsesAcceptSReg is still false, we have not succeeded in
 4338       // commuting the current user. This means we have at least one use
 4339       // that strictly requires a VGPR. Thus, we will not attempt to commute
 4340       // other user instructions.
4341 if (!AllUsesAcceptSReg)
4342 break;
4343 }
4344 }
4345 return !AllUsesAcceptSReg && (Limit < 10);
4346}
4347
4348bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
4349 const auto *Ld = cast<LoadSDNode>(N);
4350
4351 const MachineMemOperand *MMO = Ld->getMemOperand();
4352 if (N->isDivergent() && !AMDGPU::isUniformMMO(MMO))
4353 return false;
4354
4355 return MMO->getSize().hasValue() &&
4356 Ld->getAlign() >=
4357 Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
4358 uint64_t(4))) &&
4359 ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4360 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
4361 (Subtarget->getScalarizeGlobalBehavior() &&
4362 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
4363 Ld->isSimple() &&
4364 static_cast<const SITargetLowering *>(getTargetLowering())
4365 ->isMemOpHasNoClobberedMemOperand(N)));
4366}
4367
 4368 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
 4369   const AMDGPUTargetLowering &Lowering =
 4370       *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
4371 bool IsModified = false;
4372 do {
4373 IsModified = false;
4374
4375 // Go over all selected nodes and try to fold them a bit more
4376 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
4377 while (Position != CurDAG->allnodes_end()) {
4378 SDNode *Node = &*Position++;
 4379       MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
 4380       if (!MachineNode)
4381 continue;
4382
4383 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
4384 if (ResNode != Node) {
4385 if (ResNode)
4386 ReplaceUses(Node, ResNode);
4387 IsModified = true;
4388 }
4389 }
4390 CurDAG->RemoveDeadNodes();
4391 } while (IsModified);
4392}
4393
4398