39#include "llvm/IR/IntrinsicsAMDGPU.h"
40#include "llvm/IR/IntrinsicsR600.h"
50#define DEBUG_TYPE "si-lower"
56 cl::desc(
"Do not align and prefetch loops"),
60 "amdgpu-use-divergent-register-indexing",
cl::Hidden,
61 cl::desc(
"Use indirect register addressing for divergent indexes"),
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (
unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
78 return AMDGPU::SGPR0 + Reg;
193 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
194 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
195 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
196 MVT::i1, MVT::v32i32},
200 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
201 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
202 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
203 MVT::i1, MVT::v32i32},
272 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1},
Expand);
279 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
280 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
281 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
284 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
285 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
286 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
290 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
291 MVT::v3i16, MVT::v4i16, MVT::Other},
296 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64},
Expand);
312 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
313 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
314 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
315 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
316 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
317 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
318 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
319 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
351 for (
MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
365 for (
MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
379 for (
MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
393 for (
MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
407 for (
MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
422 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
423 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
439 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
440 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
445 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32},
Custom);
449 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
450 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
451 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
452 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
533 {MVT::f32, MVT::f64},
Legal);
626 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
627 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
628 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
764 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
768 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
772 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
773 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
792 {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
Custom);
795 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
796 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
797 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
800 for (
MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
808 for (
MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
824 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
844 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
845 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
846 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
847 MVT::v32f16, MVT::v32bf16},
863 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16},
Legal);
865 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
877 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
878 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
883 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
884 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
885 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
886 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
890 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
891 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
892 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
893 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
1000 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1013 EVT DestVT,
EVT SrcVT)
const {
1023 LLT DestTy,
LLT SrcTy)
const {
1024 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->
hasMadMixInsts()) ||
1025 (Opcode == TargetOpcode::G_FMA && Subtarget->
hasFmaMixInsts())) &&
1051 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1053 return VT.
isInteger() ? MVT::i32 : MVT::f32;
1080 return (NumElts + 1) / 2;
1086 return NumElts * ((
Size + 31) / 32);
1095 unsigned &NumIntermediates,
MVT &RegisterVT)
const {
1104 if (ScalarVT == MVT::bf16) {
1105 RegisterVT = MVT::i32;
1106 IntermediateVT = MVT::v2bf16;
1108 RegisterVT = VT.
isInteger() ? MVT::v2i16 : MVT::v2f16;
1109 IntermediateVT = RegisterVT;
1111 NumIntermediates = (NumElts + 1) / 2;
1112 return NumIntermediates;
1117 IntermediateVT = RegisterVT;
1118 NumIntermediates = NumElts;
1119 return NumIntermediates;
1122 if (Size < 16 && Subtarget->has16BitInsts()) {
1124 RegisterVT = MVT::i16;
1125 IntermediateVT = ScalarVT;
1126 NumIntermediates = NumElts;
1127 return NumIntermediates;
1131 RegisterVT = MVT::i32;
1132 IntermediateVT = ScalarVT;
1133 NumIntermediates = NumElts;
1134 return NumIntermediates;
1138 RegisterVT = MVT::i32;
1139 IntermediateVT = RegisterVT;
1140 NumIntermediates = NumElts * ((
Size + 31) / 32);
1141 return NumIntermediates;
1146 Context,
CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1151 unsigned MaxNumLanes) {
1152 assert(MaxNumLanes != 0);
1155 if (
auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1156 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1167 unsigned MaxNumLanes) {
1168 auto *ST = dyn_cast<StructType>(Ty);
1173 assert(ST->getNumContainedTypes() == 2 &&
1174 ST->getContainedType(1)->isIntegerTy(32));
1189 DL.getPointerSizeInBits(AS) == 192)
1199 DL.getPointerSizeInBits(AS) == 160) ||
1201 DL.getPointerSizeInBits(AS) == 192))
1209 unsigned IntrID)
const {
1211 if (CI.
hasMetadata(LLVMContext::MD_invariant_load))
1229 if (RsrcIntr->IsImage) {
1237 if (
auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->
getType())) {
1244 Info.ptrVal = RsrcArg;
1247 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1256 if (RsrcIntr->IsImage) {
1257 unsigned MaxNumLanes = 4;
1272 std::numeric_limits<unsigned>::max());
1282 if (RsrcIntr->IsImage) {
1283 unsigned DMask = cast<ConstantInt>(CI.
getArgOperand(1))->getZExtValue();
1303 if ((RsrcIntr->IsImage && BaseOpcode->
NoReturn) || IsSPrefetch) {
1305 Info.memVT = MVT::i32;
1312 case Intrinsic::amdgcn_raw_buffer_load_lds:
1313 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1314 case Intrinsic::amdgcn_struct_buffer_load_lds:
1315 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1316 unsigned Width = cast<ConstantInt>(CI.
getArgOperand(2))->getZExtValue();
1321 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1322 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1323 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1324 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1327 std::numeric_limits<unsigned>::max());
1337 case Intrinsic::amdgcn_ds_ordered_add:
1338 case Intrinsic::amdgcn_ds_ordered_swap: {
1351 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1352 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1355 Info.ptrVal =
nullptr;
1360 case Intrinsic::amdgcn_ds_append:
1361 case Intrinsic::amdgcn_ds_consume: {
1374 case Intrinsic::amdgcn_global_atomic_csub: {
1383 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1393 case Intrinsic::amdgcn_global_atomic_fmin_num:
1394 case Intrinsic::amdgcn_global_atomic_fmax_num:
1395 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1396 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1397 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1398 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1408 case Intrinsic::amdgcn_global_load_tr_b64:
1409 case Intrinsic::amdgcn_global_load_tr_b128:
1410 case Intrinsic::amdgcn_ds_read_tr4_b64:
1411 case Intrinsic::amdgcn_ds_read_tr6_b96:
1412 case Intrinsic::amdgcn_ds_read_tr8_b64:
1413 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1421 case Intrinsic::amdgcn_ds_gws_init:
1422 case Intrinsic::amdgcn_ds_gws_barrier:
1423 case Intrinsic::amdgcn_ds_gws_sema_v:
1424 case Intrinsic::amdgcn_ds_gws_sema_br:
1425 case Intrinsic::amdgcn_ds_gws_sema_p:
1426 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1436 Info.memVT = MVT::i32;
1440 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1446 case Intrinsic::amdgcn_global_load_lds: {
1448 unsigned Width = cast<ConstantInt>(CI.
getArgOperand(2))->getZExtValue();
1454 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1464 Info.memVT = MVT::i32;
1471 case Intrinsic::amdgcn_s_prefetch_data: {
1486 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1489 unsigned SrcAS =
I.getOperand(0)->getType()->getPointerAddressSpace();
1490 unsigned DstAS =
I.getType()->getPointerAddressSpace();
1502 Type *&AccessTy)
const {
1504 switch (
II->getIntrinsicID()) {
1505 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1506 case Intrinsic::amdgcn_ds_append:
1507 case Intrinsic::amdgcn_ds_consume:
1508 case Intrinsic::amdgcn_ds_read_tr4_b64:
1509 case Intrinsic::amdgcn_ds_read_tr6_b96:
1510 case Intrinsic::amdgcn_ds_read_tr8_b64:
1511 case Intrinsic::amdgcn_ds_read_tr16_b64:
1512 case Intrinsic::amdgcn_ds_ordered_add:
1513 case Intrinsic::amdgcn_ds_ordered_swap:
1514 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1515 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1516 case Intrinsic::amdgcn_global_atomic_csub:
1517 case Intrinsic::amdgcn_global_atomic_fmax_num:
1518 case Intrinsic::amdgcn_global_atomic_fmin_num:
1519 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1520 case Intrinsic::amdgcn_global_load_tr_b64:
1521 case Intrinsic::amdgcn_global_load_tr_b128:
1522 Ptr =
II->getArgOperand(0);
1524 case Intrinsic::amdgcn_global_load_lds:
1525 Ptr =
II->getArgOperand(1);
1530 AccessTy =
II->getType();
1536 unsigned AddrSpace)
const {
1548 return AM.
Scale == 0 &&
1550 AM.
BaseOffs, AddrSpace, FlatVariant));
1570 return isLegalMUBUFAddressingMode(AM);
1573bool SITargetLowering::isLegalMUBUFAddressingMode(
const AddrMode &AM)
const {
1584 if (!
TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1596 if (AM.HasBaseReg) {
1628 return isLegalMUBUFAddressingMode(AM);
1635 if (Ty->
isSized() &&
DL.getTypeStoreSize(Ty) < 4)
1685 : isLegalMUBUFAddressingMode(AM);
1732 unsigned Size,
unsigned AddrSpace,
Align Alignment,
1744 Align RequiredAlignment(
1747 Alignment < RequiredAlignment)
1768 RequiredAlignment =
Align(4);
1786 *IsFast = (Alignment >= RequiredAlignment) ? 64
1787 : (Alignment <
Align(4)) ? 32
1809 *IsFast = (Alignment >= RequiredAlignment) ? 96
1810 : (Alignment <
Align(4)) ? 32
1823 RequiredAlignment =
Align(8);
1834 *IsFast = (Alignment >= RequiredAlignment) ? 128
1835 : (Alignment <
Align(4)) ? 32
1852 *IsFast = (Alignment >= RequiredAlignment) ?
Size : 0;
1854 return Alignment >= RequiredAlignment ||
1863 bool AlignedBy4 = Alignment >=
Align(4);
1865 *IsFast = AlignedBy4;
1876 return Alignment >=
Align(4) ||
1890 return Size >= 32 && Alignment >=
Align(4);
1895 unsigned *IsFast)
const {
1897 Alignment, Flags, IsFast);
1907 if (
Op.size() >= 16 &&
1911 if (
Op.size() >= 8 &&
Op.isDstAligned(
Align(4)))
1919 const MemSDNode *MemNode = cast<MemSDNode>(
N);
1929 unsigned DestAS)
const {
1937 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1955 unsigned Index)
const {
1998 auto [InputPtrReg, RC, ArgTy] =
2008 Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2014 const SDLoc &SL)
const {
2021 const SDLoc &SL)
const {
2024 std::optional<uint32_t> KnownSize =
2026 if (KnownSize.has_value())
2052 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2061SDValue SITargetLowering::lowerKernargMemParameter(
2073 int64_t OffsetDiff =
Offset - AlignDownOffset;
2079 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2089 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal,
Signed, Arg);
2099 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load,
Signed, Arg);
2147 ExtType, SL, VA.
getLocVT(), Chain, FIN,
2152SDValue SITargetLowering::getPreloadedValue(
2174 Reg = &WorkGroupIDX;
2175 RC = &AMDGPU::SReg_32RegClass;
2179 Reg = &WorkGroupIDY;
2180 RC = &AMDGPU::SReg_32RegClass;
2184 Reg = &WorkGroupIDZ;
2185 RC = &AMDGPU::SReg_32RegClass;
2216 for (
unsigned I = 0, E = Ins.size(), PSInputNum = 0;
I != E; ++
I) {
2220 "vector type argument should have been split");
2225 bool SkipArg = !Arg->
Used && !
Info->isPSInputAllocated(PSInputNum);
2233 "unexpected vector split in ps argument type");
2247 Info->markPSInputAllocated(PSInputNum);
2249 Info->markPSInputEnabled(PSInputNum);
2265 if (
Info.hasWorkItemIDX()) {
2275 if (
Info.hasWorkItemIDY()) {
2278 Info.setWorkItemIDY(
2281 unsigned Reg = AMDGPU::VGPR1;
2289 if (
Info.hasWorkItemIDZ()) {
2292 Info.setWorkItemIDZ(
2295 unsigned Reg = AMDGPU::VGPR2;
2315 if (RegIdx == ArgVGPRs.
size()) {
2322 unsigned Reg = ArgVGPRs[RegIdx];
2324 assert(Reg != AMDGPU::NoRegister);
2334 unsigned NumArgRegs) {
2337 if (RegIdx == ArgSGPRs.
size())
2340 unsigned Reg = ArgSGPRs[RegIdx];
2342 assert(Reg != AMDGPU::NoRegister);
2356 assert(Reg != AMDGPU::NoRegister);
2382 const unsigned Mask = 0x3ff;
2385 if (
Info.hasWorkItemIDX()) {
2387 Info.setWorkItemIDX(Arg);
2390 if (
Info.hasWorkItemIDY()) {
2392 Info.setWorkItemIDY(Arg);
2395 if (
Info.hasWorkItemIDZ())
2407 const unsigned Mask = 0x3ff;
2428 if (
Info.hasImplicitArgPtr())
2436 if (
Info.hasWorkGroupIDX())
2439 if (
Info.hasWorkGroupIDY())
2442 if (
Info.hasWorkGroupIDZ())
2445 if (
Info.hasLDSKernelId())
2457 MF.
addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2464 MF.
addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2470 MF.
addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2476 MF.
addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2491 MF.
addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2497 MF.
addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2503 MF.
addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2520 bool InPreloadSequence =
true;
2522 bool AlignedForImplictArgs =
false;
2523 unsigned ImplicitArgOffset = 0;
2524 for (
auto &Arg :
F.args()) {
2525 if (!InPreloadSequence || !Arg.hasInRegAttr())
2528 unsigned ArgIdx = Arg.getArgNo();
2531 if (InIdx < Ins.size() &&
2532 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2535 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2536 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2538 assert(ArgLocs[ArgIdx].isMemLoc());
2539 auto &ArgLoc = ArgLocs[InIdx];
2541 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2543 unsigned NumAllocSGPRs =
2544 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2547 if (Arg.hasAttribute(
"amdgpu-hidden-argument")) {
2548 if (!AlignedForImplictArgs) {
2550 alignTo(LastExplicitArgOffset,
2552 LastExplicitArgOffset;
2553 AlignedForImplictArgs =
true;
2555 ArgOffset += ImplicitArgOffset;
2559 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2560 assert(InIdx >= 1 &&
"No previous SGPR");
2561 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2562 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2566 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2567 unsigned PaddingSGPRs =
alignTo(Padding, 4) / 4;
2570 InPreloadSequence =
false;
2576 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2578 Info.addPreloadedKernArg(
TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2580 if (PreloadRegs->
size() > 1)
2581 RC = &AMDGPU::SGPR_32RegClass;
2582 for (
auto &Reg : *PreloadRegs) {
2588 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2597 if (
Info.hasLDSKernelId()) {
2599 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2608 bool IsShader)
const {
2616 assert(!HasArchitectedSGPRs &&
"Unhandled feature for the subtarget");
2618 unsigned CurrentUserSGPRs =
Info.getNumUserSGPRs();
2622 unsigned NumRequiredSystemSGPRs =
2623 Info.hasWorkGroupIDX() +
Info.hasWorkGroupIDY() +
2624 Info.hasWorkGroupIDZ() +
Info.hasWorkGroupInfo();
2625 for (
unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2627 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2632 if (!HasArchitectedSGPRs) {
2633 if (
Info.hasWorkGroupIDX()) {
2635 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2639 if (
Info.hasWorkGroupIDY()) {
2641 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2645 if (
Info.hasWorkGroupIDZ()) {
2647 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2652 if (
Info.hasWorkGroupInfo()) {
2654 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2658 if (
Info.hasPrivateSegmentWaveByteOffset()) {
2660 unsigned PrivateSegmentWaveByteOffsetReg;
2663 PrivateSegmentWaveByteOffsetReg =
2664 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2668 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2670 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2673 PrivateSegmentWaveByteOffsetReg =
Info.addPrivateSegmentWaveByteOffset();
2675 MF.
addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2676 CCInfo.
AllocateReg(PrivateSegmentWaveByteOffsetReg);
2680 Info.getNumPreloadedSGPRs() >= 16);
2695 if (HasStackObjects)
2696 Info.setHasNonSpillStackObjects(
true);
2701 HasStackObjects =
true;
2705 bool RequiresStackAccess = HasStackObjects || MFI.
hasCalls();
2707 if (!ST.enableFlatScratch()) {
2708 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.
getFunction())) {
2715 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2717 unsigned ReservedBufferReg =
TRI.reservedPrivateSegmentBufferReg(MF);
2727 Info.setScratchRSrcReg(ReservedBufferReg);
2746 if (!
MRI.isLiveIn(AMDGPU::SGPR32)) {
2747 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2754 for (
unsigned Reg : AMDGPU::SGPR_32RegClass) {
2755 if (!
MRI.isLiveIn(Reg)) {
2756 Info.setStackPtrOffsetReg(Reg);
2761 if (
Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2768 if (ST.getFrameLowering()->hasFP(MF)) {
2769 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2775 return !
Info->isEntryFunction();
2785 const MCPhysReg *IStart =
TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2794 if (AMDGPU::SReg_64RegClass.
contains(*
I))
2795 RC = &AMDGPU::SGPR_64RegClass;
2796 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
2797 RC = &AMDGPU::SGPR_32RegClass;
2803 Entry->addLiveIn(*
I);
2808 for (
auto *Exit : Exits)
2810 TII->get(TargetOpcode::COPY), *
I)
2828 Fn,
"unsupported non-compute shaders with HSA",
DL.getDebugLoc());
2847 !
Info->hasLDSKernelId() && !
Info->hasWorkItemIDX() &&
2848 !
Info->hasWorkItemIDY() && !
Info->hasWorkItemIDZ());
2856 !
Info->hasWorkGroupIDZ());
2875 if ((
Info->getPSInputAddr() & 0x7F) == 0 ||
2876 ((
Info->getPSInputAddr() & 0xF) == 0 &&
Info->isPSInputAllocated(11))) {
2879 Info->markPSInputAllocated(0);
2880 Info->markPSInputEnabled(0);
2891 unsigned PsInputBits =
Info->getPSInputAddr() &
Info->getPSInputEnable();
2892 if ((PsInputBits & 0x7F) == 0 ||
2893 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2896 }
else if (IsKernel) {
2899 Splits.
append(Ins.begin(), Ins.end());
2912 }
else if (!IsGraphics) {
2937 for (
unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2947 if (IsEntryFunc && VA.
isMemLoc()) {
2970 if (Arg.
isOrigArg() &&
Info->getArgInfo().PreloadKernArgs.count(i)) {
2974 int64_t OffsetDiff =
Offset - AlignDownOffset;
2981 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2992 NewArg = convertArgType(DAG, VT, MemVT,
DL, ArgVal,
2993 Ins[i].Flags.isSExt(), &Ins[i]);
3001 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3004 if (PreloadRegs.
size() == 1) {
3005 Register VReg =
MRI.getLiveInVirtReg(PreloadRegs[0]);
3010 TRI->getRegSizeInBits(*RC)));
3018 for (
auto Reg : PreloadRegs) {
3025 PreloadRegs.size()),
3042 NewArg = convertArgType(DAG, VT, MemVT,
DL, NewArg,
3043 Ins[i].Flags.isSExt(), &Ins[i]);
3055 "hidden argument in kernel signature was not preloaded",
3062 lowerKernargMemParameter(DAG, VT, MemVT,
DL, Chain,
Offset,
3063 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3068 dyn_cast<PointerType>(FType->
getParamType(Ins[i].getOrigArgIndex()));
3083 if (!IsEntryFunc && VA.
isMemLoc()) {
3084 SDValue Val = lowerStackParameter(DAG, VA,
DL, Chain, Arg);
3095 if (AMDGPU::VGPR_32RegClass.
contains(Reg))
3096 RC = &AMDGPU::VGPR_32RegClass;
3097 else if (AMDGPU::SGPR_32RegClass.
contains(Reg))
3098 RC = &AMDGPU::SGPR_32RegClass;
3158 Info->setBytesInStackArgArea(StackArgSize);
3160 return Chains.
empty() ? Chain
3177 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3183 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3184 for (
unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3185 if (CCInfo.
isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3208 bool IsWaveEnd =
Info->returnsVoid() && IsShader;
3226 for (
unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.
size();
I != E;
3227 ++
I, ++RealRVLocIdx) {
3231 SDValue Arg = OutVals[RealRVLocIdx];
3259 if (!
Info->isEntryFunction()) {
3265 if (AMDGPU::SReg_64RegClass.
contains(*
I))
3267 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
3283 return DAG.
getNode(Opc,
DL, MVT::Other, RetOps);
3366 auto &ArgUsageInfo =
3368 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3394 const auto [OutgoingArg, ArgRC, ArgTy] =
3399 const auto [IncomingArg, IncomingArgRC, Ty] =
3401 assert(IncomingArgRC == ArgRC);
3404 EVT ArgVT =
TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3412 InputReg = getImplicitArgPtr(DAG,
DL);
3414 std::optional<uint32_t> Id =
3416 if (Id.has_value()) {
3427 if (OutgoingArg->isRegister()) {
3428 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3429 if (!CCInfo.
AllocateReg(OutgoingArg->getRegister()))
3432 unsigned SpecialArgOffset =
3443 auto [OutgoingArg, ArgRC, Ty] =
3446 std::tie(OutgoingArg, ArgRC, Ty) =
3449 std::tie(OutgoingArg, ArgRC, Ty) =
3464 const bool NeedWorkItemIDX = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-x");
3465 const bool NeedWorkItemIDY = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-y");
3466 const bool NeedWorkItemIDZ = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-z");
3498 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3499 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3510 : IncomingArgY ? *IncomingArgY
3517 if (OutgoingArg->isRegister()) {
3519 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3560 if (Callee->isDivergent())
3567 const uint32_t *CallerPreserved =
TRI->getCallPreservedMask(MF, CallerCC);
3571 if (!CallerPreserved)
3574 bool CCMatch = CallerCC == CalleeCC;
3587 if (Arg.hasByValAttr())
3601 const uint32_t *CalleePreserved =
TRI->getCallPreservedMask(MF, CalleeCC);
3602 if (!
TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3611 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3624 for (
const auto &[CCVA, ArgVal] :
zip_equal(ArgLocs, OutVals)) {
3626 if (!CCVA.isRegLoc())
3631 if (ArgVal->
isDivergent() &&
TRI->isSGPRPhysReg(CCVA.getLocReg())) {
3633 dbgs() <<
"Cannot tail call due to divergent outgoing argument in "
3662 if (IsChainCallConv) {
3666 RequestedExec = CLI.
Args.back();
3667 assert(RequestedExec.
Node &&
"No node for EXEC");
3672 assert(CLI.
Outs.back().OrigArgIndex == 2 &&
"Unexpected last arg");
3673 CLI.
Outs.pop_back();
3677 assert(CLI.
Outs.back().OrigArgIndex == 2 &&
"Exec wasn't split up");
3678 CLI.
Outs.pop_back();
3683 "Haven't popped all the pieces of the EXEC mask");
3694 bool IsSibCall =
false;
3708 "unsupported call to variadic function ");
3716 "unsupported required tail call to function ");
3721 Outs, OutVals, Ins, DAG);
3725 "site marked musttail or on llvm.amdgcn.cs.chain");
3732 if (!TailCallOpt && IsTailCall)
3778 if (!IsSibCall || IsChainCallConv) {
3785 RegsToPass.emplace_back(IsChainCallConv
3786 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3787 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3794 const unsigned NumSpecialInputs = RegsToPass.size();
3796 MVT PtrVT = MVT::i32;
3799 for (
unsigned i = 0, e = ArgLocs.
size(); i != e; ++i) {
3827 RegsToPass.push_back(std::pair(VA.
getLocReg(), Arg));
3835 int32_t
Offset = LocMemOffset;
3842 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
3848 ? Flags.getNonZeroByValAlign()
3875 if (Outs[i].Flags.isByVal()) {
3877 DAG.
getConstant(Outs[i].Flags.getByValSize(),
DL, MVT::i32);
3880 Outs[i].Flags.getNonZeroByValAlign(),
3882 nullptr, std::nullopt, DstInfo,
3888 DAG.
getStore(Chain,
DL, Arg, DstAddr, DstInfo, Alignment);
3894 if (!MemOpChains.
empty())
3910 unsigned ArgIdx = 0;
3911 for (
auto [Reg, Val] : RegsToPass) {
3912 if (ArgIdx++ >= NumSpecialInputs &&
3913 (IsChainCallConv || !Val->
isDivergent()) &&
TRI->isSGPRPhysReg(Reg)) {
3939 if (IsTailCall && !IsSibCall) {
3944 std::vector<SDValue> Ops({Chain});
3950 Ops.push_back(Callee);
3967 Ops.push_back(Callee);
3978 if (IsChainCallConv)
3979 Ops.push_back(RequestedExec.
Node);
3983 for (
auto &[Reg, Val] : RegsToPass)
3987 const uint32_t *Mask =
TRI->getCallPreservedMask(MF, CallConv);
3988 assert(Mask &&
"Missing call preserved mask for calling convention");
3998 MVT::Glue, GlueOps),
4003 Ops.push_back(InGlue);
4020 return DAG.
getNode(OPC,
DL, MVT::Other, Ops);
4025 Chain = Call.getValue(0);
4026 InGlue = Call.getValue(1);
4028 uint64_t CalleePopBytes = NumBytes;
4049 EVT VT =
Op.getValueType();
4059 Align Alignment = cast<ConstantSDNode>(
Op.getOperand(2))->getAlignValue();
4063 "Stack grows upwards for AMDGPU");
4065 Chain = BaseAddr.getValue(1);
4067 if (Alignment > StackAlign) {
4070 uint64_t StackAlignMask = ScaledAlignment - 1;
4077 assert(
Size.getValueType() == MVT::i32 &&
"Size must be 32-bit");
4079 if (isa<ConstantSDNode>(
Size)) {
4110 if (
Op.getValueType() != MVT::i32)
4129 assert(
Op.getValueType() == MVT::i32);
4138 Op.getOperand(0), IntrinID, GetRoundBothImm);
4172 SDValue RoundModeTimesNumBits =
4192 TableEntry, EnumOffset);
4206 if (
auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4208 static_cast<uint32_t>(ConstMode->getZExtValue()),
4220 if (UseReducedTable) {
4226 SDValue RoundModeTimesNumBits =
4246 SDValue RoundModeTimesNumBits =
4255 NewMode = TruncTable;
4264 ReadFirstLaneID, NewMode);
4277 IntrinID, RoundBothImm, NewMode);
4283 if (
Op->isDivergent())
4302 SDValue Src =
Op.getOperand(IsStrict ? 1 : 0);
4303 EVT SrcVT = Src.getValueType();
4312 EVT DstVT =
Op.getValueType();
4321 if (
Op.getValueType() != MVT::i64)
4335 Op.getOperand(0), IntrinID, ModeHwRegImm);
4337 Op.getOperand(0), IntrinID, TrapHwRegImm);
4351 if (
Op.getOperand(1).getValueType() != MVT::i64)
4363 ReadFirstLaneID, NewModeReg);
4365 ReadFirstLaneID, NewTrapReg);
4367 unsigned ModeHwReg =
4370 unsigned TrapHwReg =
4378 IntrinID, ModeHwRegImm, NewModeReg);
4381 IntrinID, TrapHwRegImm, NewTrapReg);
4388 .
Case(
"m0", AMDGPU::M0)
4389 .
Case(
"exec", AMDGPU::EXEC)
4390 .
Case(
"exec_lo", AMDGPU::EXEC_LO)
4391 .
Case(
"exec_hi", AMDGPU::EXEC_HI)
4392 .
Case(
"flat_scratch", AMDGPU::FLAT_SCR)
4393 .
Case(
"flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4394 .
Case(
"flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4397 if (Reg == AMDGPU::NoRegister) {
4405 "\" for subtarget."));
4410 case AMDGPU::EXEC_LO:
4411 case AMDGPU::EXEC_HI:
4412 case AMDGPU::FLAT_SCR_LO:
4413 case AMDGPU::FLAT_SCR_HI:
4418 case AMDGPU::FLAT_SCR:
4437 MI.setDesc(
TII->getKillTerminatorFromPseudo(
MI.getOpcode()));
4446static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4468 auto Next = std::next(
I);
4481 return std::pair(LoopBB, RemainderBB);
4488 auto I =
MI.getIterator();
4489 auto E = std::next(
I);
4511 Src->setIsKill(
false);
4521 BuildMI(*LoopBB, LoopBB->begin(),
DL,
TII->get(AMDGPU::S_SETREG_IMM32_B32))
4527 Register Reg =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4530 BuildMI(*LoopBB,
I,
DL,
TII->get(AMDGPU::S_GETREG_B32), Reg)
4554 unsigned InitReg,
unsigned ResultReg,
unsigned PhiReg,
4555 unsigned InitSaveExecReg,
int Offset,
bool UseGPRIdxMode,
4564 Register PhiExec =
MRI.createVirtualRegister(BoolRC);
4565 Register NewExec =
MRI.createVirtualRegister(BoolRC);
4566 Register CurrentIdxReg =
MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4567 Register CondReg =
MRI.createVirtualRegister(BoolRC);
4575 BuildMI(LoopBB,
I,
DL,
TII->get(TargetOpcode::PHI), PhiExec)
4582 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4586 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4592 TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4593 : AMDGPU::S_AND_SAVEEXEC_B64),
4597 MRI.setSimpleHint(NewExec, CondReg);
4599 if (UseGPRIdxMode) {
4601 SGPRIdxReg = CurrentIdxReg;
4603 SGPRIdxReg =
MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4604 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4611 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4614 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4621 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4624 TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4625 : AMDGPU::S_XOR_B64_term),
4649 unsigned InitResultReg,
unsigned PhiReg,
int Offset,
4650 bool UseGPRIdxMode,
Register &SGPRIdxReg) {
4658 const auto *BoolXExecRC =
TRI->getWaveMaskRegClass();
4660 Register SaveExec =
MRI.createVirtualRegister(BoolXExecRC);
4661 Register TmpExec =
MRI.createVirtualRegister(BoolXExecRC);
4662 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4663 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4678 InitResultReg, DstReg, PhiReg, TmpExec,
4679 Offset, UseGPRIdxMode, SGPRIdxReg);
4685 LoopBB->removeSuccessor(RemainderBB);
4687 LoopBB->addSuccessor(LandingPad);
4698static std::pair<unsigned, int>
4702 int NumElts =
TRI.getRegSizeInBits(*SuperRC) / 32;
4707 return std::pair(AMDGPU::sub0,
Offset);
4721 assert(
Idx->getReg() != AMDGPU::NoRegister);
4745 return Idx->getReg();
4747 Register Tmp =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4764 Register SrcReg =
TII->getNamedOperand(
MI, AMDGPU::OpName::src)->getReg();
4765 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
4774 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4777 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4781 if (UseGPRIdxMode) {
4788 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
4801 MI.eraseFromParent();
4810 Register PhiReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4811 Register InitReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4817 UseGPRIdxMode, SGPRIdxReg);
4821 if (UseGPRIdxMode) {
4823 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
4825 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
4830 BuildMI(*LoopBB, InsPt,
DL,
TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4835 MI.eraseFromParent();
4852 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
4862 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4864 if (
Idx->getReg() == AMDGPU::NoRegister) {
4875 MI.eraseFromParent();
4880 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4884 if (UseGPRIdxMode) {
4888 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
4897 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
4898 TRI.getRegSizeInBits(*VecRC), 32,
false);
4904 MI.eraseFromParent();
4914 Register PhiReg =
MRI.createVirtualRegister(VecRC);
4918 UseGPRIdxMode, SGPRIdxReg);
4921 if (UseGPRIdxMode) {
4923 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
4925 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
4931 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
4932 TRI.getRegSizeInBits(*VecRC), 32,
false);
4933 BuildMI(*LoopBB, InsPt,
DL, MovRelDesc, Dst)
4939 MI.eraseFromParent();
4954 bool isSGPR =
TRI->isSGPRClass(
MRI.getRegClass(SrcReg));
4985 Register LoopIterator =
MRI.createVirtualRegister(WaveMaskRegClass);
4986 Register InitalValReg =
MRI.createVirtualRegister(DstRegClass);
4988 Register AccumulatorReg =
MRI.createVirtualRegister(DstRegClass);
4989 Register ActiveBitsReg =
MRI.createVirtualRegister(WaveMaskRegClass);
4990 Register NewActiveBitsReg =
MRI.createVirtualRegister(WaveMaskRegClass);
4992 Register FF1Reg =
MRI.createVirtualRegister(DstRegClass);
4993 Register LaneValueReg =
MRI.createVirtualRegister(DstRegClass);
4995 bool IsWave32 = ST.isWave32();
4996 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4997 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5002 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
5005 BuildMI(BB,
I,
DL,
TII->get(AMDGPU::S_MOV_B32), InitalValReg)
5013 I = ComputeLoop->end();
5015 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::PHI), AccumulatorReg)
5019 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::PHI), ActiveBitsReg)
5020 .
addReg(TmpSReg->getOperand(0).getReg())
5024 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5025 auto FF1 =
BuildMI(*ComputeLoop,
I,
DL,
TII->get(SFFOpc), FF1Reg)
5026 .
addReg(ActiveBits->getOperand(0).getReg());
5027 auto LaneValue =
BuildMI(*ComputeLoop,
I,
DL,
5028 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
5030 .
addReg(FF1->getOperand(0).getReg());
5031 auto NewAccumulator =
BuildMI(*ComputeLoop,
I,
DL,
TII->get(Opc), DstReg)
5033 .
addReg(LaneValue->getOperand(0).getReg());
5036 unsigned BITSETOpc =
5037 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5038 auto NewActiveBits =
5039 BuildMI(*ComputeLoop,
I,
DL,
TII->get(BITSETOpc), NewActiveBitsReg)
5040 .
addReg(FF1->getOperand(0).getReg())
5041 .
addReg(ActiveBits->getOperand(0).getReg());
5044 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5045 .addMBB(ComputeLoop);
5046 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5047 .addMBB(ComputeLoop);
5050 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5052 .
addReg(NewActiveBits->getOperand(0).getReg())
5054 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::S_CBRANCH_SCC1))
5059 MI.eraseFromParent();
5071 switch (
MI.getOpcode()) {
5072 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5074 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5076 case AMDGPU::S_UADDO_PSEUDO:
5077 case AMDGPU::S_USUBO_PSEUDO: {
5084 unsigned Opc = (
MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5086 : AMDGPU::S_SUB_I32;
5097 MI.eraseFromParent();
5100 case AMDGPU::S_ADD_U64_PSEUDO:
5101 case AMDGPU::S_SUB_U64_PSEUDO: {
5110 bool IsAdd = (
MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5112 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5122 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5123 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5126 MI,
MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5128 MI,
MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5131 MI,
MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5133 MI,
MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5135 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5136 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5149 MI.eraseFromParent();
5152 case AMDGPU::V_ADD_U64_PSEUDO:
5153 case AMDGPU::V_SUB_U64_PSEUDO: {
5159 bool IsAdd = (
MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5165 if (IsAdd && ST.hasLshlAddB64()) {
5171 TII->legalizeOperands(*
Add);
5172 MI.eraseFromParent();
5176 const auto *CarryRC =
TRI->getWaveMaskRegClass();
5178 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5179 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5181 Register CarryReg =
MRI.createVirtualRegister(CarryRC);
5182 Register DeadCarryReg =
MRI.createVirtualRegister(CarryRC);
5186 : &AMDGPU::VReg_64RegClass;
5189 : &AMDGPU::VReg_64RegClass;
5192 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5194 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5197 MI,
MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5199 MI,
MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5202 MI,
MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5204 MI,
MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5207 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5214 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5228 TII->legalizeOperands(*LoHalf);
5229 TII->legalizeOperands(*HiHalf);
5230 MI.eraseFromParent();
5233 case AMDGPU::S_ADD_CO_PSEUDO:
5234 case AMDGPU::S_SUB_CO_PSEUDO: {
5248 unsigned Opc = (
MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5249 ? AMDGPU::S_ADDC_U32
5250 : AMDGPU::S_SUBB_U32;
5252 Register RegOp0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5253 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5258 Register RegOp1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5259 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5263 Register RegOp2 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5265 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5271 unsigned WaveSize =
TRI->getRegSizeInBits(*Src2RC);
5272 assert(WaveSize == 64 || WaveSize == 32);
5274 if (WaveSize == 64) {
5275 if (ST.hasScalarCompareEq64()) {
5281 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5283 MII,
MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5285 MII,
MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5286 Register Src2_32 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5288 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::S_OR_B32), Src2_32)
5309 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5315 MI.eraseFromParent();
5318 case AMDGPU::SI_INIT_M0: {
5320 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5321 .
add(
MI.getOperand(0));
5322 MI.eraseFromParent();
5325 case AMDGPU::GET_GROUPSTATICSIZE: {
5330 .
add(
MI.getOperand(0))
5332 MI.eraseFromParent();
5335 case AMDGPU::GET_SHADERCYCLESHILO: {
5349 using namespace AMDGPU::Hwreg;
5350 Register RegHi1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5352 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5353 Register RegLo1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5355 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5356 Register RegHi2 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5358 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5362 Register RegLo =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5367 .
add(
MI.getOperand(0))
5372 MI.eraseFromParent();
5375 case AMDGPU::SI_INDIRECT_SRC_V1:
5376 case AMDGPU::SI_INDIRECT_SRC_V2:
5377 case AMDGPU::SI_INDIRECT_SRC_V4:
5378 case AMDGPU::SI_INDIRECT_SRC_V8:
5379 case AMDGPU::SI_INDIRECT_SRC_V9:
5380 case AMDGPU::SI_INDIRECT_SRC_V10:
5381 case AMDGPU::SI_INDIRECT_SRC_V11:
5382 case AMDGPU::SI_INDIRECT_SRC_V12:
5383 case AMDGPU::SI_INDIRECT_SRC_V16:
5384 case AMDGPU::SI_INDIRECT_SRC_V32:
5386 case AMDGPU::SI_INDIRECT_DST_V1:
5387 case AMDGPU::SI_INDIRECT_DST_V2:
5388 case AMDGPU::SI_INDIRECT_DST_V4:
5389 case AMDGPU::SI_INDIRECT_DST_V8:
5390 case AMDGPU::SI_INDIRECT_DST_V9:
5391 case AMDGPU::SI_INDIRECT_DST_V10:
5392 case AMDGPU::SI_INDIRECT_DST_V11:
5393 case AMDGPU::SI_INDIRECT_DST_V12:
5394 case AMDGPU::SI_INDIRECT_DST_V16:
5395 case AMDGPU::SI_INDIRECT_DST_V32:
5397 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5398 case AMDGPU::SI_KILL_I1_PSEUDO:
5400 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5409 Register SrcCond =
MI.getOperand(3).getReg();
5411 Register DstLo =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5412 Register DstHi =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5413 const auto *CondRC =
TRI->getWaveMaskRegClass();
5414 Register SrcCondCopy =
MRI.createVirtualRegister(CondRC);
5418 : &AMDGPU::VReg_64RegClass;
5421 : &AMDGPU::VReg_64RegClass;
5424 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5426 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5429 MI,
MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5431 MI,
MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5434 MI,
MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5436 MI,
MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5457 MI.eraseFromParent();
5460 case AMDGPU::SI_BR_UNDEF: {
5464 .
add(
MI.getOperand(0));
5466 MI.eraseFromParent();
5469 case AMDGPU::ADJCALLSTACKUP:
5470 case AMDGPU::ADJCALLSTACKDOWN: {
5477 case AMDGPU::SI_CALL_ISEL: {
5481 unsigned ReturnAddrReg =
TII->getRegisterInfo().getReturnAddressReg(*MF);
5484 MIB =
BuildMI(*BB,
MI,
DL,
TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5490 MI.eraseFromParent();
5493 case AMDGPU::V_ADD_CO_U32_e32:
5494 case AMDGPU::V_SUB_CO_U32_e32:
5495 case AMDGPU::V_SUBREV_CO_U32_e32: {
5498 unsigned Opc =
MI.getOpcode();
5500 bool NeedClampOperand =
false;
5501 if (
TII->pseudoToMCOpcode(Opc) == -1) {
5503 NeedClampOperand =
true;
5507 if (
TII->isVOP3(*
I)) {
5512 I.add(
MI.getOperand(1)).add(
MI.getOperand(2));
5513 if (NeedClampOperand)
5516 TII->legalizeOperands(*
I);
5518 MI.eraseFromParent();
5521 case AMDGPU::V_ADDC_U32_e32:
5522 case AMDGPU::V_SUBB_U32_e32:
5523 case AMDGPU::V_SUBBREV_U32_e32:
5526 TII->legalizeOperands(
MI);
5528 case AMDGPU::DS_GWS_INIT:
5529 case AMDGPU::DS_GWS_SEMA_BR:
5530 case AMDGPU::DS_GWS_BARRIER:
5531 TII->enforceOperandRCAlignment(
MI, AMDGPU::OpName::data0);
5533 case AMDGPU::DS_GWS_SEMA_V:
5534 case AMDGPU::DS_GWS_SEMA_P:
5535 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5543 case AMDGPU::S_SETREG_B32: {
5558 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5559 const unsigned SetMask = WidthMask <<
Offset;
5562 unsigned SetDenormOp = 0;
5563 unsigned SetRoundOp = 0;
5571 SetRoundOp = AMDGPU::S_ROUND_MODE;
5572 SetDenormOp = AMDGPU::S_DENORM_MODE;
5574 SetRoundOp = AMDGPU::S_ROUND_MODE;
5576 SetDenormOp = AMDGPU::S_DENORM_MODE;
5579 if (SetRoundOp || SetDenormOp) {
5582 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5583 unsigned ImmVal = Def->getOperand(1).getImm();
5597 MI.eraseFromParent();
5606 MI.setDesc(
TII->get(AMDGPU::S_SETREG_B32_mode));
5610 case AMDGPU::S_INVERSE_BALLOT_U32:
5611 case AMDGPU::S_INVERSE_BALLOT_U64:
5614 MI.setDesc(
TII->get(AMDGPU::COPY));
5616 case AMDGPU::ENDPGM_TRAP: {
5619 MI.setDesc(
TII->get(AMDGPU::S_ENDPGM));
5639 MI.eraseFromParent();
5642 case AMDGPU::SIMULATED_TRAP: {
5646 TII->insertSimulatedTrap(
MRI, *BB,
MI,
MI.getDebugLoc());
5647 MI.eraseFromParent();
5684 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5800 EVT VT =
N->getValueType(0);
5804 if (VT == MVT::f16) {
5820 unsigned Opc =
Op.getOpcode();
5821 EVT VT =
Op.getValueType();
5822 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5823 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5824 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5825 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5840 unsigned Opc =
Op.getOpcode();
5841 EVT VT =
Op.getValueType();
5842 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5843 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5844 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5845 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5853 DAG.
getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
Op->getFlags());
5855 DAG.
getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
Op->getFlags());
5862 unsigned Opc =
Op.getOpcode();
5863 EVT VT =
Op.getValueType();
5864 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5865 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5866 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5867 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5868 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5869 VT == MVT::v32bf16);
5874 : std::pair(Op0, Op0);
5883 DAG.
getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
Op->getFlags());
5885 DAG.
getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
Op->getFlags());
5891 switch (
Op.getOpcode()) {
5895 return LowerBRCOND(
Op, DAG);
5897 return LowerRETURNADDR(
Op, DAG);
5900 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
5901 "Load should return a value and a chain");
5905 EVT VT =
Op.getValueType();
5907 return lowerFSQRTF32(
Op, DAG);
5909 return lowerFSQRTF64(
Op, DAG);
5914 return LowerTrig(
Op, DAG);
5916 return LowerSELECT(
Op, DAG);
5918 return LowerFDIV(
Op, DAG);
5920 return LowerFFREXP(
Op, DAG);
5922 return LowerATOMIC_CMP_SWAP(
Op, DAG);
5924 return LowerSTORE(
Op, DAG);
5928 return LowerGlobalAddress(MFI,
Op, DAG);
5931 return LowerINTRINSIC_WO_CHAIN(
Op, DAG);
5933 return LowerINTRINSIC_W_CHAIN(
Op, DAG);
5935 return LowerINTRINSIC_VOID(
Op, DAG);
5937 return lowerADDRSPACECAST(
Op, DAG);
5939 return lowerINSERT_SUBVECTOR(
Op, DAG);
5941 return lowerINSERT_VECTOR_ELT(
Op, DAG);
5943 return lowerEXTRACT_VECTOR_ELT(
Op, DAG);
5945 return lowerVECTOR_SHUFFLE(
Op, DAG);
5947 return lowerSCALAR_TO_VECTOR(
Op, DAG);
5949 return lowerBUILD_VECTOR(
Op, DAG);
5952 return lowerFP_ROUND(
Op, DAG);
5954 return lowerTRAP(
Op, DAG);
5956 return lowerDEBUGTRAP(
Op, DAG);
5965 return lowerFMINNUM_FMAXNUM(
Op, DAG);
5968 return lowerFLDEXP(
Op, DAG);
5997 return lowerMUL(
Op, DAG);
6000 return lowerXMULO(
Op, DAG);
6003 return lowerXMUL_LOHI(
Op, DAG);
6036 EVT FittingLoadVT = LoadVT;
6068SDValue SITargetLowering::adjustLoadValueType(
unsigned Opcode,
MemSDNode *M,
6071 bool IsIntrinsic)
const {
6075 EVT LoadVT =
M->getValueType(0);
6077 EVT EquivLoadVT = LoadVT;
6095 M->getMemoryVT(),
M->getMemOperand());
6106 EVT LoadVT =
M->getValueType(0);
6112 assert(
M->getNumValues() == 2 ||
M->getNumValues() == 3);
6113 bool IsTFE =
M->getNumValues() == 3;
6126 return handleByteShortBufferLoads(DAG, LoadVT,
DL, Ops,
M->getMemOperand(),
6130 return getMemIntrinsicNode(Opc,
DL,
M->getVTList(), Ops, IntVT,
6131 M->getMemOperand(), DAG);
6136 SDValue MemNode = getMemIntrinsicNode(Opc,
DL, VTList, Ops, CastVT,
6137 M->getMemOperand(), DAG);
6145 EVT VT =
N->getValueType(0);
6146 unsigned CondCode =
N->getConstantOperandVal(3);
6157 EVT CmpVT =
LHS.getValueType();
6158 if (CmpVT == MVT::i16 && !TLI.
isTypeLegal(MVT::i16)) {
6159 unsigned PromoteOp =
6179 EVT VT =
N->getValueType(0);
6181 unsigned CondCode =
N->getConstantOperandVal(3);
6190 if (CmpVT == MVT::f16 && !TLI.
isTypeLegal(CmpVT)) {
6208 EVT VT =
N->getValueType(0);
6215 Src.getOperand(1), Src.getOperand(2));
6226 Exec = AMDGPU::EXEC_LO;
6228 Exec = AMDGPU::EXEC;
6245 EVT VT =
N->getValueType(0);
6247 unsigned IID =
N->getConstantOperandVal(0);
6248 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6249 IID == Intrinsic::amdgcn_permlanex16;
6250 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6251 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6255 unsigned SplitSize = 32;
6256 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6257 ST->hasDPALU_DPP() &&
6265 case Intrinsic::amdgcn_permlane16:
6266 case Intrinsic::amdgcn_permlanex16:
6267 case Intrinsic::amdgcn_update_dpp:
6272 case Intrinsic::amdgcn_writelane:
6275 case Intrinsic::amdgcn_readlane:
6276 case Intrinsic::amdgcn_set_inactive:
6277 case Intrinsic::amdgcn_set_inactive_chain_arg:
6278 case Intrinsic::amdgcn_mov_dpp8:
6281 case Intrinsic::amdgcn_readfirstlane:
6282 case Intrinsic::amdgcn_permlane64:
6292 if (
SDNode *GL =
N->getGluedNode()) {
6294 GL = GL->getOperand(0).getNode();
6304 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6305 IID == Intrinsic::amdgcn_mov_dpp8 ||
6306 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6307 Src1 =
N->getOperand(2);
6308 if (IID == Intrinsic::amdgcn_writelane ||
6309 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
6310 Src2 =
N->getOperand(3);
6313 if (ValSize == SplitSize) {
6323 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6328 if (IID == Intrinsic::amdgcn_writelane) {
6333 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6335 return IsFloat ? DAG.
getBitcast(VT, Trunc) : Trunc;
6338 if (ValSize % SplitSize != 0)
6342 EVT VT =
N->getValueType(0);
6346 unsigned NumOperands =
N->getNumOperands();
6348 SDNode *GL =
N->getGluedNode();
6353 for (
unsigned i = 0; i != NE; ++i) {
6354 for (
unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6356 SDValue Operand =
N->getOperand(j);
6386 if (SplitSize == 32) {
6388 return unrollLaneOp(LaneOp.
getNode());
6394 unsigned SubVecNumElt =
6398 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6399 for (
unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
6403 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
6408 if (IID == Intrinsic::amdgcn_writelane)
6413 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
6414 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6415 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6416 EltIdx += SubVecNumElt;
6430 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6433 if (IID == Intrinsic::amdgcn_writelane)
6436 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6444 switch (
N->getOpcode()) {
6456 unsigned IID =
N->getConstantOperandVal(0);
6458 case Intrinsic::amdgcn_make_buffer_rsrc:
6459 Results.push_back(lowerPointerAsRsrcIntrin(
N, DAG));
6461 case Intrinsic::amdgcn_cvt_pkrtz: {
6470 case Intrinsic::amdgcn_cvt_pknorm_i16:
6471 case Intrinsic::amdgcn_cvt_pknorm_u16:
6472 case Intrinsic::amdgcn_cvt_pk_i16:
6473 case Intrinsic::amdgcn_cvt_pk_u16: {
6479 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6481 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6483 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6488 EVT VT =
N->getValueType(0);
6497 case Intrinsic::amdgcn_s_buffer_load: {
6509 EVT VT =
Op.getValueType();
6510 assert(VT == MVT::i8 &&
"Expected 8-bit s_buffer_load intrinsics.\n");
6522 if (!
Offset->isDivergent()) {
6541 LoadVal = handleByteShortBufferLoads(DAG, VT,
DL, Ops, MMO);
6553 for (
unsigned I = 0;
I < Res.getNumOperands();
I++) {
6554 Results.push_back(Res.getOperand(
I));
6558 Results.push_back(Res.getValue(1));
6567 EVT VT =
N->getValueType(0);
6572 EVT SelectVT = NewVT;
6573 if (NewVT.
bitsLT(MVT::i32)) {
6576 SelectVT = MVT::i32;
6582 if (NewVT != SelectVT)
6588 if (
N->getValueType(0) != MVT::v2f16)
6600 if (
N->getValueType(0) != MVT::v2f16)
6612 if (
N->getValueType(0) != MVT::f16)
6627 if (U.get() !=
Value)
6630 if (U.getUser()->getOpcode() == Opcode)
6636unsigned SITargetLowering::isCFIntrinsic(
const SDNode *
Intr)
const {
6638 switch (
Intr->getConstantOperandVal(1)) {
6639 case Intrinsic::amdgcn_if:
6641 case Intrinsic::amdgcn_else:
6643 case Intrinsic::amdgcn_loop:
6645 case Intrinsic::amdgcn_end_cf:
6692 SDNode *
Intr = BRCOND.getOperand(1).getNode();
6705 assert(BR &&
"brcond missing unconditional branch user");
6706 Target = BR->getOperand(1);
6709 unsigned CFNode = isCFIntrinsic(
Intr);
6728 Ops.
append(
Intr->op_begin() + (HaveChain ? 2 : 1),
Intr->op_end());
6752 for (
unsigned i = 1, e =
Intr->getNumValues() - 1; i != e; ++i) {
6765 Intr->getOperand(0));
6771 MVT VT =
Op.getSimpleValueType();
6774 if (
Op.getConstantOperandVal(0) != 0)
6780 if (
Info->isEntryFunction())
6797 return Op.getValueType().bitsLE(VT)
6804 assert(
Op.getValueType() == MVT::f16 &&
6805 "Do not know how to custom lower FP_ROUND for non-f16 type");
6808 EVT SrcVT = Src.getValueType();
6809 if (SrcVT != MVT::f64)
6825 EVT VT =
Op.getValueType();
6828 bool IsIEEEMode =
Info->getMode().IEEE;
6837 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6845 EVT VT =
Op.getValueType();
6849 EVT ExpVT =
Exp.getValueType();
6850 if (ExpVT == MVT::i16)
6871 {
Op.getOperand(0),
Op.getOperand(1), TruncExp});
6878 switch (
Op->getOpcode()) {
6908 DAGCombinerInfo &DCI)
const {
6909 const unsigned Opc =
Op.getOpcode();
6917 :
Op->getOperand(0).getValueType();
6920 if (DCI.isBeforeLegalizeOps() ||
6924 auto &DAG = DCI.DAG;
6930 LHS =
Op->getOperand(1);
6931 RHS =
Op->getOperand(2);
6933 LHS =
Op->getOperand(0);
6934 RHS =
Op->getOperand(1);
6965 EVT VT =
Op.getValueType();
6971 assert(VT == MVT::i64 &&
"The following code is a special for s_mul_u64");
6998 if (
Op->isDivergent())
7011 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7013 DAG.
getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7016 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7018 DAG.
getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7024 EVT VT =
Op.getValueType();
7031 const APInt &
C = RHSC->getAPIntValue();
7033 if (
C.isPowerOf2()) {
7035 bool UseArithShift =
isSigned && !
C.isMinSignedValue();
7062 if (
Op->isDivergent()) {
7079 return lowerTrapEndpgm(
Op, DAG);
7082 : lowerTrapHsaQueuePtr(
Op, DAG);
7092SITargetLowering::loadImplicitKernelArgument(
SelectionDAG &DAG,
MVT VT,
7094 ImplicitParameter Param)
const {
7114 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
7120 if (UserSGPR == AMDGPU::NoRegister) {
7162 "debugtrap handler not supported",
7175SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
7179 ? AMDGPU::SRC_SHARED_BASE
7180 : AMDGPU::SRC_PRIVATE_BASE;
7203 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7212 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
7218 if (UserSGPR == AMDGPU::NoRegister) {
7248 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
7249 isa<BasicBlockSDNode>(Val))
7252 if (
auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7253 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7267 unsigned DestAS, SrcAS;
7269 bool IsNonNull =
false;
7270 if (
const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(
Op)) {
7271 SrcAS = ASC->getSrcAddressSpace();
7272 Src = ASC->getOperand(0);
7273 DestAS = ASC->getDestAddressSpace();
7276 Op.getConstantOperandVal(0) ==
7277 Intrinsic::amdgcn_addrspacecast_nonnull);
7278 Src =
Op->getOperand(1);
7279 SrcAS =
Op->getConstantOperandVal(2);
7280 DestAS =
Op->getConstantOperandVal(3);
7295 unsigned NullVal =
TM.getNullPointerValue(DestAS);
7309 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7317 unsigned NullVal =
TM.getNullPointerValue(SrcAS);
7329 Op.getValueType() == MVT::i64) {
7338 Src.getValueType() == MVT::i64)
7362 EVT InsVT =
Ins.getValueType();
7365 unsigned IdxVal =
Idx->getAsZExtVal();
7370 assert(InsNumElts % 2 == 0 &&
"expect legal vector types");
7375 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7377 MVT::i32, InsNumElts / 2);
7382 for (
unsigned I = 0;
I != InsNumElts / 2; ++
I) {
7384 if (InsNumElts == 2) {
7397 for (
unsigned I = 0;
I != InsNumElts; ++
I) {
7419 auto *KIdx = dyn_cast<ConstantSDNode>(
Idx);
7420 if (NumElts == 4 && EltSize == 16 && KIdx) {
7431 unsigned Idx = KIdx->getZExtValue();
7432 bool InsertLo =
Idx < 2;
7449 if (isa<ConstantSDNode>(
Idx))
7455 assert(VecSize <= 64 &&
"Expected target vector size to be <= 64 bits");
7461 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7490 EVT ResultVT =
Op.getValueType();
7503 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
7506 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7510 if (VecSize == 128) {
7518 }
else if (VecSize == 256) {
7521 for (
unsigned P = 0;
P < 4; ++
P) {
7527 Parts[0], Parts[1]));
7529 Parts[2], Parts[3]));
7535 for (
unsigned P = 0;
P < 8; ++
P) {
7542 Parts[0], Parts[1], Parts[2], Parts[3]));
7545 Parts[4], Parts[5], Parts[6], Parts[7]));
7548 EVT IdxVT =
Idx.getValueType();
7565 Src = DAG.
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7580 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7590 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7595 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
7596 !(Mask[Elt + 1] & 1);
7602 EVT ResultVT =
Op.getValueType();
7605 const int NewSrcNumElts = 2;
7607 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
7623 const bool ShouldUseConsecutiveExtract = EltVT.
getSizeInBits() == 16;
7645 if (ShouldUseConsecutiveExtract &&
7648 int VecIdx =
Idx < SrcNumElts ? 0 : 1;
7649 int EltIdx =
Idx < SrcNumElts ?
Idx :
Idx - SrcNumElts;
7661 if (Idx0 >= SrcNumElts) {
7666 if (Idx1 >= SrcNumElts) {
7671 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
7672 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
7680 int NewMaskIdx0 = Idx0 - AlignedIdx0;
7681 int NewMaskIdx1 = Idx1 - AlignedIdx1;
7686 if (SubVec0 != SubVec1) {
7687 NewMaskIdx1 += NewSrcNumElts;
7694 {NewMaskIdx0, NewMaskIdx1});
7699 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7700 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7701 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7702 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7721 EVT ResultVT =
Op.getValueType();
7737 EVT VT =
Op.getValueType();
7739 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
7774 for (
unsigned P = 0;
P < NumParts; ++
P) {
7776 PartVT, SL, {
Op.getOperand(
P * 2),
Op.getOperand(
P * 2 + 1)});
7809 assert(isInt<32>(
Offset + 4) &&
"32-bit offset is expected!");
7847 EVT PtrVT =
Op.getValueType();
7863 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
7934 SDValue Param = lowerKernargMemParameter(
7944 "non-hsa intrinsic with hsa target",
7953 "intrinsic not supported on subtarget",
7963 unsigned NumElts = Elts.
size();
7965 if (NumElts <= 12) {
7974 for (
unsigned i = 0; i < Elts.
size(); ++i) {
7980 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
7981 VecElts[i] = DAG.
getUNDEF(MVT::f32);
7990 EVT SrcVT = Src.getValueType();
8011 bool Unpacked,
bool IsD16,
int DMaskPop,
8012 int NumVDataDwords,
bool IsAtomicPacked16Bit,
8016 EVT ReqRetVT = ResultTypes[0];
8018 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
8019 ? (ReqRetNumElts + 1) / 2
8022 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
8033 if (DMaskPop > 0 &&
Data.getValueType() != MaskPopVT) {
8044 if (DataDwordVT.
isVector() && !IsAtomicPacked16Bit)
8046 NumDataDwords - MaskPopDwords);
8051 EVT LegalReqRetVT = ReqRetVT;
8053 if (!
Data.getValueType().isInteger())
8055 Data.getValueType().changeTypeToInteger(),
Data);
8076 if (Result->getNumValues() == 1)
8083 SDValue *LWE,
bool &IsTexFail) {
8084 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.
getNode());
8103 unsigned DimIdx,
unsigned EndIdx,
8104 unsigned NumGradients) {
8106 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
8114 if (((
I + 1) >= EndIdx) ||
8115 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
8116 I == DimIdx + NumGradients - 1))) {
8117 if (
Addr.getValueType() != MVT::i16)
8138 unsigned IntrOpcode =
Intr->BaseOpcode;
8149 int NumVDataDwords = 0;
8150 bool AdjustRetType =
false;
8151 bool IsAtomicPacked16Bit =
false;
8154 const unsigned ArgOffset = WithChain ? 2 : 1;
8157 unsigned DMaskLanes = 0;
8159 if (BaseOpcode->Atomic) {
8160 VData =
Op.getOperand(2);
8162 IsAtomicPacked16Bit =
8163 (
Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8164 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8167 if (BaseOpcode->AtomicX2) {
8174 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8175 DMask = Is64Bit ? 0xf : 0x3;
8176 NumVDataDwords = Is64Bit ? 4 : 2;
8178 DMask = Is64Bit ? 0x3 : 0x1;
8179 NumVDataDwords = Is64Bit ? 2 : 1;
8182 DMask =
Op->getConstantOperandVal(ArgOffset +
Intr->DMaskIndex);
8185 if (BaseOpcode->Store) {
8186 VData =
Op.getOperand(2);
8194 VData = handleD16VData(VData, DAG,
true);
8198 }
else if (!BaseOpcode->NoReturn) {
8211 (!LoadVT.
isVector() && DMaskLanes > 1))
8219 NumVDataDwords = (DMaskLanes + 1) / 2;
8221 NumVDataDwords = DMaskLanes;
8223 AdjustRetType =
true;
8227 unsigned VAddrEnd = ArgOffset +
Intr->VAddrEnd;
8232 Op.getOperand(ArgOffset +
Intr->GradientStart).getSimpleValueType();
8234 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8235 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8237 VAddrVT =
Op.getOperand(ArgOffset +
Intr->CoordStart).getSimpleValueType();
8239 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8240 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8243 for (
unsigned I =
Intr->VAddrStart; I < Intr->GradientStart;
I++) {
8244 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
8245 assert(
I ==
Intr->BiasIndex &&
"Got unexpected 16-bit extra argument");
8250 {
Op.getOperand(ArgOffset +
I), DAG.
getUNDEF(MVT::f16)});
8254 "Bias needs to be converted to 16 bit in A16 mode");
8259 if (BaseOpcode->Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
8263 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
8264 "require 16 bit args for both gradients and addresses");
8269 if (!
ST->hasA16()) {
8270 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
8271 "support 16 bit addresses\n");
8281 if (BaseOpcode->Gradients && IsG16 &&
ST->hasG16()) {
8285 IntrOpcode = G16MappingInfo->
G16;
8293 ArgOffset +
Intr->GradientStart,
8294 ArgOffset +
Intr->CoordStart,
Intr->NumGradients);
8296 for (
unsigned I = ArgOffset +
Intr->GradientStart;
8297 I < ArgOffset + Intr->CoordStart;
I++)
8304 ArgOffset +
Intr->CoordStart, VAddrEnd,
8308 for (
unsigned I = ArgOffset +
Intr->CoordStart;
I < VAddrEnd;
I++)
8326 const unsigned NSAMaxSize =
ST->getNSAMaxSize(BaseOpcode->Sampler);
8327 const bool HasPartialNSAEncoding =
ST->hasPartialNSAEncoding();
8328 const bool UseNSA =
ST->hasNSAEncoding() &&
8329 VAddrs.
size() >=
ST->getNSAThreshold(MF) &&
8330 (VAddrs.
size() <= NSAMaxSize || HasPartialNSAEncoding);
8331 const bool UsePartialNSA =
8332 UseNSA && HasPartialNSAEncoding && VAddrs.
size() > NSAMaxSize;
8335 if (UsePartialNSA) {
8337 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8338 }
else if (!UseNSA) {
8345 if (!BaseOpcode->Sampler) {
8349 Op.getConstantOperandVal(ArgOffset +
Intr->UnormIndex);
8351 Unorm = UnormConst ? True : False;
8356 SDValue TexFail =
Op.getOperand(ArgOffset +
Intr->TexFailCtrlIndex);
8357 bool IsTexFail =
false;
8358 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8369 NumVDataDwords += 1;
8370 AdjustRetType =
true;
8375 if (AdjustRetType) {
8378 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8381 if (isa<MemSDNode>(
Op))
8387 MVT::i32, NumVDataDwords)
8390 ResultTypes[0] = NewVT;
8391 if (ResultTypes.size() == 3) {
8395 ResultTypes.erase(&ResultTypes[1]);
8399 unsigned CPol =
Op.getConstantOperandVal(ArgOffset +
Intr->CachePolicyIndex);
8400 if (BaseOpcode->Atomic)
8407 if (BaseOpcode->Store || BaseOpcode->Atomic)
8409 if (UsePartialNSA) {
8418 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
8421 if (BaseOpcode->Sampler) {
8430 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8434 ST->hasFeature(AMDGPU::FeatureR128A16)
8444 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8448 if (BaseOpcode->HasD16)
8450 if (isa<MemSDNode>(
Op))
8453 int NumVAddrDwords =
8459 NumVDataDwords, NumVAddrDwords);
8460 }
else if (IsGFX11Plus) {
8462 UseNSA ? AMDGPU::MIMGEncGfx11NSA
8463 : AMDGPU::MIMGEncGfx11Default,
8464 NumVDataDwords, NumVAddrDwords);
8465 }
else if (IsGFX10Plus) {
8467 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8468 : AMDGPU::MIMGEncGfx10Default,
8469 NumVDataDwords, NumVAddrDwords);
8473 NumVDataDwords, NumVAddrDwords);
8476 "requested image instruction is not supported on this GPU");
8481 NumVDataDwords, NumVAddrDwords);
8484 NumVDataDwords, NumVAddrDwords);
8490 if (
auto *
MemOp = dyn_cast<MemSDNode>(
Op)) {
8495 if (BaseOpcode->AtomicX2) {
8500 if (BaseOpcode->NoReturn)
8504 NumVDataDwords, IsAtomicPacked16Bit,
DL);
8522 if (!
Offset->isDivergent()) {
8567 return handleByteShortBufferLoads(DAG, VT,
DL, Ops, MMO);
8571 unsigned NumLoads = 1;
8577 if (NumElts == 8 || NumElts == 16) {
8578 NumLoads = NumElts / 4;
8586 setBufferOffsets(
Offset, DAG, &Ops[3],
8587 NumLoads > 1 ?
Align(16 * NumLoads) :
Align(4));
8590 for (
unsigned i = 0; i < NumLoads; ++i) {
8596 if (NumElts == 8 || NumElts == 16)
8648 EVT VT =
Op.getValueType();
8650 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
8654 switch (IntrinsicID) {
8655 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8658 return getPreloadedValue(DAG, *MFI, VT,
8661 case Intrinsic::amdgcn_dispatch_ptr:
8662 case Intrinsic::amdgcn_queue_ptr: {
8665 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
8671 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
8674 return getPreloadedValue(DAG, *MFI, VT, RegID);
8676 case Intrinsic::amdgcn_implicitarg_ptr: {
8678 return getImplicitArgPtr(DAG,
DL);
8679 return getPreloadedValue(DAG, *MFI, VT,
8682 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8688 return getPreloadedValue(DAG, *MFI, VT,
8691 case Intrinsic::amdgcn_dispatch_id: {
8694 case Intrinsic::amdgcn_rcp:
8696 case Intrinsic::amdgcn_rsq:
8698 case Intrinsic::amdgcn_rsq_legacy:
8702 case Intrinsic::amdgcn_rcp_legacy:
8706 case Intrinsic::amdgcn_rsq_clamp: {
8720 case Intrinsic::r600_read_ngroups_x:
8724 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8727 case Intrinsic::r600_read_ngroups_y:
8731 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8734 case Intrinsic::r600_read_ngroups_z:
8738 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8741 case Intrinsic::r600_read_global_size_x:
8745 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8748 case Intrinsic::r600_read_global_size_y:
8752 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8755 case Intrinsic::r600_read_global_size_z:
8759 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8762 case Intrinsic::r600_read_local_size_x:
8766 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8768 case Intrinsic::r600_read_local_size_y:
8772 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8774 case Intrinsic::r600_read_local_size_z:
8778 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8780 case Intrinsic::amdgcn_workgroup_id_x:
8781 return getPreloadedValue(DAG, *MFI, VT,
8783 case Intrinsic::amdgcn_workgroup_id_y:
8784 return getPreloadedValue(DAG, *MFI, VT,
8786 case Intrinsic::amdgcn_workgroup_id_z:
8787 return getPreloadedValue(DAG, *MFI, VT,
8789 case Intrinsic::amdgcn_wave_id:
8790 return lowerWaveID(DAG,
Op);
8791 case Intrinsic::amdgcn_lds_kernel_id: {
8793 return getLDSKernelId(DAG,
DL);
8794 return getPreloadedValue(DAG, *MFI, VT,
8797 case Intrinsic::amdgcn_workitem_id_x:
8798 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
8799 case Intrinsic::amdgcn_workitem_id_y:
8800 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
8801 case Intrinsic::amdgcn_workitem_id_z:
8802 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
8803 case Intrinsic::amdgcn_wavefrontsize:
8806 case Intrinsic::amdgcn_s_buffer_load: {
8807 unsigned CPol =
Op.getConstantOperandVal(3);
8814 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
8815 Op.getOperand(3), DAG);
8817 case Intrinsic::amdgcn_fdiv_fast:
8818 return lowerFDIV_FAST(
Op, DAG);
8819 case Intrinsic::amdgcn_sin:
8822 case Intrinsic::amdgcn_cos:
8825 case Intrinsic::amdgcn_mul_u24:
8828 case Intrinsic::amdgcn_mul_i24:
8832 case Intrinsic::amdgcn_log_clamp: {
8838 case Intrinsic::amdgcn_fract:
8841 case Intrinsic::amdgcn_class:
8844 case Intrinsic::amdgcn_div_fmas:
8846 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
8848 case Intrinsic::amdgcn_div_fixup:
8850 Op.getOperand(2),
Op.getOperand(3));
8852 case Intrinsic::amdgcn_div_scale: {
8865 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
8868 Denominator, Numerator);
8870 case Intrinsic::amdgcn_icmp: {
8872 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
8873 Op.getConstantOperandVal(2) == 0 &&
8878 case Intrinsic::amdgcn_fcmp: {
8881 case Intrinsic::amdgcn_ballot:
8883 case Intrinsic::amdgcn_fmed3:
8885 Op.getOperand(2),
Op.getOperand(3));
8886 case Intrinsic::amdgcn_fdot2:
8888 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
8889 case Intrinsic::amdgcn_fmul_legacy:
8892 case Intrinsic::amdgcn_sffbh:
8894 case Intrinsic::amdgcn_sbfe:
8896 Op.getOperand(2),
Op.getOperand(3));
8897 case Intrinsic::amdgcn_ubfe:
8899 Op.getOperand(2),
Op.getOperand(3));
8900 case Intrinsic::amdgcn_cvt_pkrtz:
8901 case Intrinsic::amdgcn_cvt_pknorm_i16:
8902 case Intrinsic::amdgcn_cvt_pknorm_u16:
8903 case Intrinsic::amdgcn_cvt_pk_i16:
8904 case Intrinsic::amdgcn_cvt_pk_u16: {
8906 EVT VT =
Op.getValueType();
8909 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8911 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8913 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8915 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8921 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
8924 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
8927 case Intrinsic::amdgcn_fmad_ftz:
8929 Op.getOperand(2),
Op.getOperand(3));
8931 case Intrinsic::amdgcn_if_break:
8933 Op->getOperand(1),
Op->getOperand(2)),
8936 case Intrinsic::amdgcn_groupstaticsize: {
8948 case Intrinsic::amdgcn_is_shared:
8949 case Intrinsic::amdgcn_is_private: {
8951 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
8954 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8962 case Intrinsic::amdgcn_perm:
8964 Op.getOperand(2),
Op.getOperand(3));
8965 case Intrinsic::amdgcn_reloc_constant: {
8969 auto *RelocSymbol = cast<GlobalVariable>(
8975 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8976 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8977 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8978 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8979 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8980 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8981 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8982 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8983 if (
Op.getOperand(4).getValueType() == MVT::i32)
8989 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
8990 Op.getOperand(3), IndexKeyi32);
8992 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8993 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8994 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8995 if (
Op.getOperand(6).getValueType() == MVT::i32)
9001 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9002 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
9003 IndexKeyi32, Op.getOperand(7)});
9005 case Intrinsic::amdgcn_addrspacecast_nonnull:
9006 return lowerADDRSPACECAST(
Op, DAG);
9007 case Intrinsic::amdgcn_readlane:
9008 case Intrinsic::amdgcn_readfirstlane:
9009 case Intrinsic::amdgcn_writelane:
9010 case Intrinsic::amdgcn_permlane16:
9011 case Intrinsic::amdgcn_permlanex16:
9012 case Intrinsic::amdgcn_permlane64:
9013 case Intrinsic::amdgcn_set_inactive:
9014 case Intrinsic::amdgcn_set_inactive_chain_arg:
9015 case Intrinsic::amdgcn_mov_dpp8:
9016 case Intrinsic::amdgcn_update_dpp:
9021 return lowerImage(
Op, ImageDimIntr, DAG,
false);
9032 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
9038 unsigned NewOpcode)
const {
9042 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9043 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9057 auto *
M = cast<MemSDNode>(
Op);
9061 M->getMemOperand());
9066 unsigned NewOpcode)
const {
9070 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9071 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
9085 auto *
M = cast<MemSDNode>(
Op);
9089 M->getMemOperand());
9094 unsigned IntrID =
Op.getConstantOperandVal(1);
9098 case Intrinsic::amdgcn_ds_ordered_add:
9099 case Intrinsic::amdgcn_ds_ordered_swap: {
9104 unsigned IndexOperand =
M->getConstantOperandVal(7);
9105 unsigned WaveRelease =
M->getConstantOperandVal(8);
9106 unsigned WaveDone =
M->getConstantOperandVal(9);
9108 unsigned OrderedCountIndex = IndexOperand & 0x3f;
9109 IndexOperand &= ~0x3f;
9110 unsigned CountDw = 0;
9113 CountDw = (IndexOperand >> 24) & 0xf;
9114 IndexOperand &= ~(0xf << 24);
9116 if (CountDw < 1 || CountDw > 4) {
9118 "ds_ordered_count: dword count must be between 1 and 4");
9125 if (WaveDone && !WaveRelease)
9128 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
9129 unsigned ShaderType =
9131 unsigned Offset0 = OrderedCountIndex << 2;
9132 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (
Instruction << 4);
9135 Offset1 |= (CountDw - 1) << 6;
9138 Offset1 |= ShaderType << 2;
9140 unsigned Offset = Offset0 | (Offset1 << 8);
9147 M->getVTList(), Ops,
M->getMemoryVT(),
9148 M->getMemOperand());
9150 case Intrinsic::amdgcn_raw_buffer_load:
9151 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9152 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9153 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9154 case Intrinsic::amdgcn_raw_buffer_load_format:
9155 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9156 const bool IsFormat =
9157 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9158 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9160 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9161 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
9174 auto *
M = cast<MemSDNode>(
Op);
9175 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9177 case Intrinsic::amdgcn_struct_buffer_load:
9178 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9179 case Intrinsic::amdgcn_struct_buffer_load_format:
9180 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9181 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9182 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9183 const bool IsFormat =
9184 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9185 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9187 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9188 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9201 return lowerIntrinsicLoad(cast<MemSDNode>(
Op), IsFormat, DAG, Ops);
9203 case Intrinsic::amdgcn_raw_tbuffer_load:
9204 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9206 EVT LoadVT =
Op.getValueType();
9207 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9208 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
9227 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
9230 case Intrinsic::amdgcn_struct_tbuffer_load:
9231 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9233 EVT LoadVT =
Op.getValueType();
9234 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9235 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9254 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
9257 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9258 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9260 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9261 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9262 return lowerStructBufferAtomicIntrin(
Op, DAG,
9264 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9265 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9267 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9268 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9269 return lowerStructBufferAtomicIntrin(
Op, DAG,
9271 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9272 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9274 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9275 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9276 return lowerStructBufferAtomicIntrin(
Op, DAG,
9278 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9279 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9281 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9282 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9284 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9285 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9287 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9288 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9290 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9291 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9293 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9294 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9296 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9297 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9299 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9300 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9302 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9303 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9305 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9306 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9308 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9309 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9311 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9312 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9314 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9315 return lowerRawBufferAtomicIntrin(
Op, DAG,
9317 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9318 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9319 return lowerStructBufferAtomicIntrin(
Op, DAG,
9321 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9322 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9324 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9325 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9327 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9328 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9329 return lowerStructBufferAtomicIntrin(
Op, DAG,
9331 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9332 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9333 return lowerStructBufferAtomicIntrin(
Op, DAG,
9335 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9336 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9337 return lowerStructBufferAtomicIntrin(
Op, DAG,
9339 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9340 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9341 return lowerStructBufferAtomicIntrin(
Op, DAG,
9343 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9344 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9346 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9347 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9349 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9350 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9352 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9353 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9355 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9356 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9358 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9359 return lowerStructBufferAtomicIntrin(
Op, DAG,
9362 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9363 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9364 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
9365 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
9379 EVT VT =
Op.getValueType();
9380 auto *
M = cast<MemSDNode>(
Op);
9383 Op->getVTList(), Ops, VT,
9384 M->getMemOperand());
9386 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9387 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9388 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
9389 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(6), DAG);
9403 EVT VT =
Op.getValueType();
9404 auto *
M = cast<MemSDNode>(
Op);
9407 Op->getVTList(), Ops, VT,
9408 M->getMemOperand());
9410 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9412 SDValue NodePtr =
M->getOperand(2);
9413 SDValue RayExtent =
M->getOperand(3);
9414 SDValue RayOrigin =
M->getOperand(4);
9416 SDValue RayInvDir =
M->getOperand(6);
9434 const unsigned NumVDataDwords = 4;
9435 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9436 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9440 const unsigned BaseOpcodes[2][2] = {
9441 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9442 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9443 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9447 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9448 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9449 : AMDGPU::MIMGEncGfx10NSA,
9450 NumVDataDwords, NumVAddrDwords);
9454 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9455 : AMDGPU::MIMGEncGfx10Default,
9456 NumVDataDwords, NumVAddrDwords);
9462 auto packLanes = [&DAG, &Ops, &
DL](
SDValue Op,
bool IsAligned) {
9465 if (Lanes[0].getValueSizeInBits() == 32) {
9466 for (
unsigned I = 0;
I < 3; ++
I)
9485 if (UseNSA && IsGFX11Plus) {
9493 for (
unsigned I = 0;
I < 3; ++
I) {
9496 {DirLanes[I], InvDirLanes[I]})));
9511 packLanes(RayOrigin,
true);
9512 packLanes(RayDir,
true);
9513 packLanes(RayInvDir,
false);
9518 if (NumVAddrDwords > 12) {
9538 case Intrinsic::amdgcn_global_atomic_fmin_num:
9539 case Intrinsic::amdgcn_global_atomic_fmax_num:
9540 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9541 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9548 unsigned Opcode = 0;
9550 case Intrinsic::amdgcn_global_atomic_fmin_num:
9551 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9555 case Intrinsic::amdgcn_global_atomic_fmax_num:
9556 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9564 Ops,
M->getMemOperand());
9566 case Intrinsic::amdgcn_s_get_barrier_state:
9567 case Intrinsic::amdgcn_s_get_named_barrier_state: {
9572 if (isa<ConstantSDNode>(
Op->getOperand(2))) {
9573 uint64_t BarID = cast<ConstantSDNode>(
Op->getOperand(2))->getZExtValue();
9574 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
9575 BarID = (BarID >> 4) & 0x3F;
9576 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9581 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9582 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
9602 return lowerImage(
Op, ImageDimIntr, DAG,
true);
9610SDValue SITargetLowering::getMemIntrinsicNode(
unsigned Opcode,
const SDLoc &
DL,
9620 bool IsTFE = VTList.
NumVTs == 3;
9623 unsigned NumOpDWords = NumValueDWords + 1;
9628 SDValue Op = getMemIntrinsicNode(Opcode,
DL, OpDWordsVTList, Ops,
9629 OpDWordsVT, OpDWordsMMO, DAG);
9644 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9650 WidenedMemVT, WidenedMMO);
9660 bool ImageStore)
const {
9695 for (
unsigned I = 0;
I < Elts.
size() / 2;
I += 1) {
9701 if ((NumElements % 2) == 1) {
9703 unsigned I = Elts.
size() / 2;
9719 if (NumElements == 3) {
9740 unsigned IntrinsicID =
Op.getConstantOperandVal(1);
9743 switch (IntrinsicID) {
9744 case Intrinsic::amdgcn_exp_compr: {
9748 "intrinsic not supported on subtarget",
DL.getDebugLoc());
9771 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9774 case Intrinsic::amdgcn_s_barrier:
9775 case Intrinsic::amdgcn_s_barrier_signal:
9776 case Intrinsic::amdgcn_s_barrier_wait: {
9779 unsigned WGSize =
ST.getFlatWorkGroupSizes(MF.
getFunction()).second;
9780 if (WGSize <=
ST.getWavefrontSize()) {
9783 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
9784 return Op.getOperand(0);
9787 MVT::Other,
Op.getOperand(0)),
9792 if (
ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
9798 MVT::Other, K,
Op.getOperand(0)),
9810 case Intrinsic::amdgcn_struct_tbuffer_store:
9811 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9815 VData = handleD16VData(VData, DAG);
9816 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9817 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
9835 M->getMemoryVT(),
M->getMemOperand());
9838 case Intrinsic::amdgcn_raw_tbuffer_store:
9839 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9843 VData = handleD16VData(VData, DAG);
9844 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9845 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9863 M->getMemoryVT(),
M->getMemOperand());
9866 case Intrinsic::amdgcn_raw_buffer_store:
9867 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9868 case Intrinsic::amdgcn_raw_buffer_store_format:
9869 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9870 const bool IsFormat =
9871 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9872 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9879 VData = handleD16VData(VData, DAG);
9889 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9890 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9910 return handleByteShortBufferStores(DAG, VDataVT,
DL, Ops, M);
9913 M->getMemoryVT(),
M->getMemOperand());
9916 case Intrinsic::amdgcn_struct_buffer_store:
9917 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9918 case Intrinsic::amdgcn_struct_buffer_store_format:
9919 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9920 const bool IsFormat =
9921 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9922 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9930 VData = handleD16VData(VData, DAG);
9940 auto Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9941 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
9962 return handleByteShortBufferStores(DAG, VDataType,
DL, Ops, M);
9965 M->getMemoryVT(),
M->getMemOperand());
9967 case Intrinsic::amdgcn_raw_buffer_load_lds:
9968 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9969 case Intrinsic::amdgcn_struct_buffer_load_lds:
9970 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9974 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9975 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9976 unsigned OpOffset = HasVIndex ? 1 : 0;
9977 SDValue VOffset =
Op.getOperand(5 + OpOffset);
9979 unsigned Size =
Op->getConstantOperandVal(4);
9985 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9986 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9987 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9988 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9991 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9992 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9993 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9994 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9997 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9998 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9999 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
10000 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
10005 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
10006 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
10007 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
10008 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
10013 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
10014 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
10015 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
10016 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
10024 if (HasVIndex && HasVOffset)
10028 else if (HasVIndex)
10030 else if (HasVOffset)
10033 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10038 unsigned Aux =
Op.getConstantOperandVal(8 + OpOffset);
10050 auto *
M = cast<MemSDNode>(
Op);
10077 case Intrinsic::amdgcn_global_load_lds: {
10079 unsigned Size =
Op->getConstantOperandVal(4);
10084 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
10087 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
10090 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
10095 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
10100 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10104 auto *
M = cast<MemSDNode>(
Op);
10117 if (
LHS->isDivergent())
10121 RHS.getOperand(0).getValueType() == MVT::i32) {
10124 VOffset =
RHS.getOperand(0);
10129 if (!
Addr->isDivergent()) {
10146 LoadPtrI.
Offset =
Op->getConstantOperandVal(5);
10166 case Intrinsic::amdgcn_end_cf:
10168 Op->getOperand(2), Chain),
10170 case Intrinsic::amdgcn_s_barrier_init:
10171 case Intrinsic::amdgcn_s_barrier_signal_var: {
10178 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10179 ? AMDGPU::S_BARRIER_INIT_M0
10180 : AMDGPU::S_BARRIER_SIGNAL_M0;
10195 constexpr unsigned ShAmt = 16;
10207 case Intrinsic::amdgcn_s_barrier_join: {
10214 if (isa<ConstantSDNode>(BarOp)) {
10215 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
10216 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10219 unsigned BarID = (BarVal >> 4) & 0x3F;
10224 Opc = AMDGPU::S_BARRIER_JOIN_M0;
10240 case Intrinsic::amdgcn_s_prefetch_data: {
10243 return Op.getOperand(0);
10246 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10248 Chain, bufferRsrcPtrToVector(
Op.getOperand(2), DAG),
10255 Op->getVTList(), Ops,
M->getMemoryVT(),
10256 M->getMemOperand());
10261 return lowerImage(
Op, ImageDimIntr, DAG,
true);
10274std::pair<SDValue, SDValue>
10281 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10284 C1 = cast<ConstantSDNode>(N0.
getOperand(1));
10298 unsigned Overflow = ImmOffset & ~MaxImm;
10299 ImmOffset -= Overflow;
10300 if ((int32_t)Overflow < 0) {
10301 Overflow += ImmOffset;
10306 auto OverflowVal = DAG.
getConstant(Overflow,
DL, MVT::i32);
10310 SDValue Ops[] = {N0, OverflowVal};
10325void SITargetLowering::setBufferOffsets(
SDValue CombinedOffset,
10327 Align Alignment)
const {
10330 if (
auto *
C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10333 if (
TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10344 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10346 TII->splitMUBUFOffset(
Offset, SOffset, ImmOffset, Alignment)) {
10363SDValue SITargetLowering::bufferRsrcPtrToVector(
SDValue MaybePointer,
10366 return MaybePointer;
10380 SDValue NumRecords =
Op->getOperand(3);
10383 auto [LowHalf, HighHalf] = DAG.
SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10386 std::optional<uint32_t> ConstStride = std::nullopt;
10387 if (
auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10388 ConstStride = ConstNode->getZExtValue();
10391 if (!ConstStride || *ConstStride != 0) {
10394 ShiftedStride = DAG.
getConstant(*ConstStride << 16, Loc, MVT::i32);
10405 NewHighHalf, NumRecords, Flags);
10415 bool IsTFE)
const {
10425 SDValue Op = getMemIntrinsicNode(Opc,
DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10453 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10457 Ops[1] = BufferStoreExt;
10462 M->getMemOperand());
10487 DAGCombinerInfo &DCI)
const {
10503 if ((MemVT.
isSimple() && !DCI.isAfterLegalizeDAG()) ||
10510 "unexpected vector extload");
10523 "unexpected fp extload");
10541 DCI.AddToWorklist(Cvt.
getNode());
10546 DCI.AddToWorklist(Cvt.
getNode());
10557 if (
Info.isEntryFunction())
10558 return Info.getUserSGPRInfo().hasFlatScratchInit();
10566 EVT MemVT =
Load->getMemoryVT();
10579 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10607 assert(
Op.getValueType().getVectorElementType() == MVT::i32 &&
10608 "Custom lowering for non-i32 vectors hasn't been implemented.");
10611 unsigned AS =
Load->getAddressSpace();
10635 Alignment >=
Align(4) && NumElements < 32) {
10649 if (NumElements > 4)
10668 if (NumElements > 2)
10673 if (NumElements > 4)
10685 auto Flags =
Load->getMemOperand()->getFlags();
10687 Load->getAlign(), Flags, &
Fast) &&
10696 MemVT, *
Load->getMemOperand())) {
10705 EVT VT =
Op.getValueType();
10742 EVT VT =
Op.getValueType();
10745 bool AllowInaccurateRcp =
10752 if (!AllowInaccurateRcp && VT != MVT::f16)
10755 if (CLHS->isExactlyValue(1.0)) {
10772 if (CLHS->isExactlyValue(-1.0)) {
10781 if (!AllowInaccurateRcp && (VT != MVT::f16 || !
Flags.hasAllowReciprocal()))
10795 EVT VT =
Op.getValueType();
10798 bool AllowInaccurateDiv =
10800 if (!AllowInaccurateDiv)
10821 return DAG.
getNode(Opcode, SL, VT,
A,
B, Flags);
10835 return DAG.
getNode(Opcode, SL, VTList,
10844 return DAG.
getNode(Opcode, SL, VT, {
A,
B,
C}, Flags);
10858 return DAG.
getNode(Opcode, SL, VTList,
10864 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
10865 return FastLowered;
10885 unsigned FMADOpCode =
10895 SDValue Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10897 Quot = DAG.
getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot,
Op->getFlags());
10898 Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10921 const APFloat K0Val(0x1p+96f);
10924 const APFloat K1Val(0x1p-32f);
10951 assert(ST->hasDenormModeInst() &&
"Requires S_DENORM_MODE");
10952 uint32_t DPDenormModeDefault =
Info->getMode().fpDenormModeDPValue();
10953 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10958 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
10959 return FastLowered;
10966 Flags.setNoFPExcept(
true);
10987 using namespace AMDGPU::Hwreg;
10988 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10996 const bool HasDynamicDenormals =
11002 if (!PreservesDenormals) {
11010 if (HasDynamicDenormals) {
11014 SavedDenormMode =
SDValue(GetReg, 0);
11022 const SDValue EnableDenormValue =
11029 const SDValue EnableDenormValue =
11031 EnableDenorm = DAG.
getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
11032 {EnableDenormValue,
BitField, Glue});
11042 ApproxRcp, One, NegDivScale0, Flags);
11045 ApproxRcp, Fma0, Flags);
11051 NumeratorScaled,
Mul, Flags);
11057 NumeratorScaled, Fma3, Flags);
11059 if (!PreservesDenormals) {
11067 DisableDenormValue, Fma4.
getValue(2))
11070 assert(HasDynamicDenormals == (
bool)SavedDenormMode);
11071 const SDValue DisableDenormValue =
11072 HasDynamicDenormals
11077 AMDGPU::S_SETREG_B32, SL, MVT::Other,
11088 {Fma4, Fma1, Fma3, Scale},
Flags);
11094 if (
SDValue FastLowered = lowerFastUnsafeFDIV64(
Op, DAG))
11095 return FastLowered;
11163 EVT VT =
Op.getValueType();
11165 if (VT == MVT::f32)
11166 return LowerFDIV32(
Op, DAG);
11168 if (VT == MVT::f64)
11169 return LowerFDIV64(
Op, DAG);
11171 if (VT == MVT::f16)
11172 return LowerFDIV16(
Op, DAG);
11181 EVT ResultExpVT =
Op->getValueType(1);
11182 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11212 if (VT == MVT::i1) {
11216 Store->getBasePtr(), MVT::i1,
Store->getMemOperand());
11220 Store->getValue().getValueType().getScalarType() == MVT::i32);
11222 unsigned AS =
Store->getAddressSpace();
11241 if (NumElements > 4)
11248 VT, *
Store->getMemOperand()))
11258 if (NumElements > 2)
11262 if (NumElements > 4 ||
11271 auto Flags =
Store->getMemOperand()->getFlags();
11306 MVT VT =
Op.getValueType().getSimpleVT();
11477 EVT VT =
Op.getValueType();
11494 switch (
Op.getOpcode()) {
11521 EVT VT =
Op.getValueType();
11529 Op->getVTList(), Ops, VT,
11538SITargetLowering::performUCharToFloatCombine(
SDNode *
N,
11539 DAGCombinerInfo &DCI)
const {
11540 EVT VT =
N->getValueType(0);
11542 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11549 EVT SrcVT = Src.getValueType();
11555 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11558 DCI.AddToWorklist(Cvt.
getNode());
11561 if (ScalarVT != MVT::f32) {
11573 DAGCombinerInfo &DCI)
const {
11574 SDValue MagnitudeOp =
N->getOperand(0);
11575 SDValue SignOp =
N->getOperand(1);
11631SDValue SITargetLowering::performSHLPtrCombine(
SDNode *
N,
unsigned AddrSpace,
11633 DAGCombinerInfo &DCI)
const {
11663 AM.HasBaseReg =
true;
11664 AM.BaseOffs =
Offset.getSExtValue();
11669 EVT VT =
N->getValueType(0);
11675 Flags.setNoUnsignedWrap(
11676 N->getFlags().hasNoUnsignedWrap() &&
11686 switch (
N->getOpcode()) {
11697 DAGCombinerInfo &DCI)
const {
11706 SDValue NewPtr = performSHLPtrCombine(
Ptr.getNode(),
N->getAddressSpace(),
11707 N->getMemoryVT(), DCI);
11711 NewOps[PtrIdx] = NewPtr;
11720 return (Opc ==
ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11721 (Opc ==
ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11730SDValue SITargetLowering::splitBinaryBitConstantOp(
11731 DAGCombinerInfo &DCI,
const SDLoc &SL,
unsigned Opc,
SDValue LHS,
11751 if (V.getValueType() != MVT::i1)
11753 switch (V.getOpcode()) {
11772 if (!(
C & 0x000000ff))
11773 ZeroByteMask |= 0x000000ff;
11774 if (!(
C & 0x0000ff00))
11775 ZeroByteMask |= 0x0000ff00;
11776 if (!(
C & 0x00ff0000))
11777 ZeroByteMask |= 0x00ff0000;
11778 if (!(
C & 0xff000000))
11779 ZeroByteMask |= 0xff000000;
11780 uint32_t NonZeroByteMask = ~ZeroByteMask;
11781 if ((NonZeroByteMask &
C) != NonZeroByteMask)
11794 assert(V.getValueSizeInBits() == 32);
11796 if (V.getNumOperands() != 2)
11805 switch (V.getOpcode()) {
11810 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11815 return (0x03020100 & ~ConstMask) | ConstMask;
11822 return uint32_t((0x030201000c0c0c0cull <<
C) >> 32);
11828 return uint32_t(0x0c0c0c0c03020100ull >>
C);
11835 DAGCombinerInfo &DCI)
const {
11836 if (DCI.isBeforeLegalize())
11840 EVT VT =
N->getValueType(0);
11845 if (VT == MVT::i64 && CRHS) {
11851 if (CRHS && VT == MVT::i32) {
11860 if (
auto *CShift = dyn_cast<ConstantSDNode>(
LHS->getOperand(1))) {
11861 unsigned Shift = CShift->getZExtValue();
11863 unsigned Offset = NB + Shift;
11864 if ((
Offset & (Bits - 1)) == 0) {
11882 isa<ConstantSDNode>(
LHS.getOperand(2))) {
11888 Sel = (
LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11903 if (
Y.getOpcode() !=
ISD::FABS ||
Y.getOperand(0) !=
X ||
11908 if (
X !=
LHS.getOperand(1))
11913 dyn_cast<ConstantFPSDNode>(
RHS.getOperand(1));
11946 (
RHS.getOperand(0) ==
LHS.getOperand(0) &&
11947 LHS.getOperand(0) ==
LHS.getOperand(1))) {
11949 unsigned NewMask = LCC ==
ISD::SETO ?
Mask->getZExtValue() & ~OrdMask
11950 :
Mask->getZExtValue() & OrdMask;
11971 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11974 if (LHSMask != ~0u && RHSMask != ~0u) {
11977 if (LHSMask > RHSMask) {
11984 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11985 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11988 if (!(LHSUsedLanes & RHSUsedLanes) &&
11991 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11998 for (
unsigned I = 0;
I < 32;
I += 8) {
12000 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
12001 Mask &= (0x0c <<
I) & 0xffffffff;
12059static const std::optional<ByteProvider<SDValue>>
12061 unsigned Depth = 0) {
12064 return std::nullopt;
12066 if (
Op.getValueSizeInBits() < 8)
12067 return std::nullopt;
12069 if (
Op.getValueType().isVector())
12072 switch (
Op->getOpcode()) {
12083 auto *VTSign = cast<VTSDNode>(
Op->getOperand(1));
12084 NarrowVT = VTSign->getVT();
12087 return std::nullopt;
12090 if (SrcIndex >= NarrowByteWidth)
12091 return std::nullopt;
12097 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12099 return std::nullopt;
12101 uint64_t BitShift = ShiftOp->getZExtValue();
12103 if (BitShift % 8 != 0)
12104 return std::nullopt;
12106 SrcIndex += BitShift / 8;
12124static const std::optional<ByteProvider<SDValue>>
12126 unsigned StartingIndex = 0) {
12130 return std::nullopt;
12132 unsigned BitWidth =
Op.getScalarValueSizeInBits();
12134 return std::nullopt;
12136 return std::nullopt;
12138 bool IsVec =
Op.getValueType().isVector();
12139 switch (
Op.getOpcode()) {
12142 return std::nullopt;
12147 return std::nullopt;
12151 return std::nullopt;
12154 if (!
LHS->isConstantZero() && !
RHS->isConstantZero())
12155 return std::nullopt;
12156 if (!
LHS ||
LHS->isConstantZero())
12158 if (!
RHS ||
RHS->isConstantZero())
12160 return std::nullopt;
12165 return std::nullopt;
12167 auto *BitMaskOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12169 return std::nullopt;
12171 uint32_t BitMask = BitMaskOp->getZExtValue();
12173 uint32_t IndexMask = 0xFF << (Index * 8);
12175 if ((IndexMask & BitMask) != IndexMask) {
12178 if (IndexMask & BitMask)
12179 return std::nullopt;
12188 return std::nullopt;
12191 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(2));
12192 if (!ShiftOp ||
Op.getValueType().isVector())
12193 return std::nullopt;
12195 uint64_t BitsProvided =
Op.getValueSizeInBits();
12196 if (BitsProvided % 8 != 0)
12197 return std::nullopt;
12199 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
12201 return std::nullopt;
12203 uint64_t ConcatSizeInBytes = BitsProvided / 4;
12204 uint64_t ByteShift = BitShift / 8;
12206 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12207 uint64_t BytesProvided = BitsProvided / 8;
12208 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
12209 NewIndex %= BytesProvided;
12216 return std::nullopt;
12218 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12220 return std::nullopt;
12222 uint64_t BitShift = ShiftOp->getZExtValue();
12224 return std::nullopt;
12226 auto BitsProvided =
Op.getScalarValueSizeInBits();
12227 if (BitsProvided % 8 != 0)
12228 return std::nullopt;
12230 uint64_t BytesProvided = BitsProvided / 8;
12231 uint64_t ByteShift = BitShift / 8;
12236 return BytesProvided - ByteShift > Index
12244 return std::nullopt;
12246 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12248 return std::nullopt;
12250 uint64_t BitShift = ShiftOp->getZExtValue();
12251 if (BitShift % 8 != 0)
12252 return std::nullopt;
12253 uint64_t ByteShift = BitShift / 8;
12259 return Index < ByteShift
12262 Depth + 1, StartingIndex);
12271 return std::nullopt;
12278 auto *VTSign = cast<VTSDNode>(
Op->getOperand(1));
12279 NarrowBitWidth = VTSign->getVT().getSizeInBits();
12281 if (NarrowBitWidth % 8 != 0)
12282 return std::nullopt;
12283 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12285 if (Index >= NarrowByteWidth)
12287 ? std::optional<ByteProvider<SDValue>>(
12295 return std::nullopt;
12299 if (NarrowByteWidth >= Index) {
12304 return std::nullopt;
12311 return std::nullopt;
12315 auto *L = cast<LoadSDNode>(
Op.getNode());
12317 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12318 if (NarrowBitWidth % 8 != 0)
12319 return std::nullopt;
12320 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12325 if (Index >= NarrowByteWidth) {
12327 ? std::optional<ByteProvider<SDValue>>(
12332 if (NarrowByteWidth > Index) {
12336 return std::nullopt;
12341 return std::nullopt;
12344 Depth + 1, StartingIndex);
12348 auto *IdxOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12350 return std::nullopt;
12351 auto VecIdx = IdxOp->getZExtValue();
12352 auto ScalarSize =
Op.getScalarValueSizeInBits();
12353 if (ScalarSize < 32)
12354 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12356 StartingIndex, Index);
12361 return std::nullopt;
12363 auto *PermMask = dyn_cast<ConstantSDNode>(
Op->getOperand(2));
12365 return std::nullopt;
12368 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12369 if (IdxMask > 0x07 && IdxMask != 0x0c)
12370 return std::nullopt;
12372 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12373 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12375 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
12381 return std::nullopt;
12396 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
12400 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12403 auto MemVT = L->getMemoryVT();
12406 return L->getMemoryVT().getSizeInBits() == 16;
12416 int Low8 = Mask & 0xff;
12417 int Hi8 = (Mask & 0xff00) >> 8;
12419 assert(Low8 < 8 && Hi8 < 8);
12421 bool IsConsecutive = (Hi8 - Low8 == 1);
12426 bool Is16Aligned = !(Low8 % 2);
12428 return IsConsecutive && Is16Aligned;
12436 int Low16 = PermMask & 0xffff;
12437 int Hi16 = (PermMask & 0xffff0000) >> 16;
12447 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12449 if (!OtherOpIs16Bit)
12457 unsigned DWordOffset) {
12460 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12462 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12467 if (Src.getValueType().isVector()) {
12468 auto ScalarTySize = Src.getScalarValueSizeInBits();
12469 auto ScalarTy = Src.getValueType().getScalarType();
12470 if (ScalarTySize == 32) {
12474 if (ScalarTySize > 32) {
12477 DAG.
getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12478 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12485 assert(ScalarTySize < 32);
12486 auto NumElements =
TypeSize / ScalarTySize;
12487 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12488 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12489 auto NumElementsIn32 = 32 / ScalarTySize;
12490 auto NumAvailElements = DWordOffset < Trunc32Elements
12492 : NumElements - NormalizedTrunc;
12505 auto ShiftVal = 32 * DWordOffset;
12513 [[maybe_unused]]
EVT VT =
N->getValueType(0);
12518 for (
int i = 0; i < 4; i++) {
12520 std::optional<ByteProvider<SDValue>>
P =
12523 if (!
P ||
P->isConstantZero())
12528 if (PermNodes.
size() != 4)
12531 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12532 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12534 for (
size_t i = 0; i < PermNodes.
size(); i++) {
12535 auto PermOp = PermNodes[i];
12538 int SrcByteAdjust = 4;
12542 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12543 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12545 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12546 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12550 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12551 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12554 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12556 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12559 SDValue Op = *PermNodes[FirstSrc.first].Src;
12561 assert(
Op.getValueSizeInBits() == 32);
12565 int Low16 = PermMask & 0xffff;
12566 int Hi16 = (PermMask & 0xffff0000) >> 16;
12568 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12569 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12572 if (WellFormedLow && WellFormedHi)
12576 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src :
Op;
12585 assert(
Op.getValueType().isByteSized() &&
12603 DAGCombinerInfo &DCI)
const {
12608 EVT VT =
N->getValueType(0);
12609 if (VT == MVT::i1) {
12614 if (Src !=
RHS.getOperand(0))
12619 if (!CLHS || !CRHS)
12623 static const uint32_t MaxMask = 0x3ff;
12638 isa<ConstantSDNode>(
LHS.getOperand(2))) {
12643 Sel |=
LHS.getConstantOperandVal(2);
12652 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12656 auto usesCombinedOperand = [](
SDNode *OrUse) {
12659 !OrUse->getValueType(0).isVector())
12663 for (
auto *VUser : OrUse->users()) {
12664 if (!VUser->getValueType(0).isVector())
12671 if (VUser->getOpcode() == VectorwiseOp)
12677 if (!
any_of(
N->users(), usesCombinedOperand))
12683 if (LHSMask != ~0u && RHSMask != ~0u) {
12686 if (LHSMask > RHSMask) {
12693 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12694 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12697 if (!(LHSUsedLanes & RHSUsedLanes) &&
12700 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12702 LHSMask &= ~RHSUsedLanes;
12703 RHSMask &= ~LHSUsedLanes;
12705 LHSMask |= LHSUsedLanes & 0x04040404;
12715 if (LHSMask == ~0u || RHSMask == ~0u) {
12721 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12736 if (SrcVT == MVT::i32) {
12741 DCI.AddToWorklist(LowOr.
getNode());
12742 DCI.AddToWorklist(HiBits.getNode());
12750 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(
N->getOperand(1));
12753 N->getOperand(0), CRHS))
12761 DAGCombinerInfo &DCI)
const {
12762 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
12771 EVT VT =
N->getValueType(0);
12772 if (CRHS && VT == MVT::i64) {
12794 LHS->getOperand(0), FNegLHS, FNegRHS);
12803 DAGCombinerInfo &DCI)
const {
12808 EVT VT =
N->getValueType(0);
12809 if (VT != MVT::i32)
12813 if (Src.getValueType() != MVT::i16)
12820SITargetLowering::performSignExtendInRegCombine(
SDNode *
N,
12821 DAGCombinerInfo &DCI)
const {
12823 auto *VTSign = cast<VTSDNode>(
N->getOperand(1));
12828 VTSign->getVT() == MVT::i8) ||
12830 VTSign->getVT() == MVT::i16))) {
12832 "s_buffer_load_{u8, i8} are supported "
12833 "in GFX12 (or newer) architectures.");
12834 EVT VT = Src.getValueType();
12839 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12845 auto *
M = cast<MemSDNode>(Src);
12846 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12847 Opc,
DL, ResList, Ops,
M->getMemoryVT(),
M->getMemOperand());
12852 VTSign->getVT() == MVT::i8) ||
12854 VTSign->getVT() == MVT::i16)) &&
12856 auto *
M = cast<MemSDNode>(Src);
12857 SDValue Ops[] = {Src.getOperand(0),
12863 Src.getOperand(6), Src.getOperand(7)};
12866 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
12870 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
12871 Opc,
SDLoc(
N), ResList, Ops,
M->getMemoryVT(),
M->getMemOperand());
12872 return DCI.DAG.getMergeValues(
12879 DAGCombinerInfo &DCI)
const {
12887 if (
N->getOperand(0).isUndef())
12894 DAGCombinerInfo &DCI)
const {
12895 EVT VT =
N->getValueType(0);
12920 unsigned MaxDepth)
const {
12921 unsigned Opcode =
Op.getOpcode();
12925 if (
auto *CFP = dyn_cast<ConstantFPSDNode>(
Op)) {
12926 const auto &
F = CFP->getValueAPF();
12927 if (
F.isNaN() &&
F.isSignaling())
12929 if (!
F.isDenormal())
12992 if (
Op.getValueType() == MVT::i32) {
12997 if (
auto *
RHS = dyn_cast<ConstantSDNode>(
Op.getOperand(1))) {
12998 if (
RHS->getZExtValue() == 0xffff0000) {
13008 return Op.getValueType().getScalarType() != MVT::f16;
13076 if (
Op.getValueType() == MVT::i16) {
13087 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
13089 switch (IntrinsicID) {
13090 case Intrinsic::amdgcn_cvt_pkrtz:
13091 case Intrinsic::amdgcn_cubeid:
13092 case Intrinsic::amdgcn_frexp_mant:
13093 case Intrinsic::amdgcn_fdot2:
13094 case Intrinsic::amdgcn_rcp:
13095 case Intrinsic::amdgcn_rsq:
13096 case Intrinsic::amdgcn_rsq_clamp:
13097 case Intrinsic::amdgcn_rcp_legacy:
13098 case Intrinsic::amdgcn_rsq_legacy:
13099 case Intrinsic::amdgcn_trig_preop:
13100 case Intrinsic::amdgcn_log:
13101 case Intrinsic::amdgcn_exp2:
13102 case Intrinsic::amdgcn_sqrt:
13120 unsigned MaxDepth)
const {
13123 unsigned Opcode =
MI->getOpcode();
13125 if (Opcode == AMDGPU::G_FCANONICALIZE)
13128 std::optional<FPValueAndVReg> FCR;
13131 if (FCR->Value.isSignaling())
13133 if (!FCR->Value.isDenormal())
13144 case AMDGPU::G_FADD:
13145 case AMDGPU::G_FSUB:
13146 case AMDGPU::G_FMUL:
13147 case AMDGPU::G_FCEIL:
13148 case AMDGPU::G_FFLOOR:
13149 case AMDGPU::G_FRINT:
13150 case AMDGPU::G_FNEARBYINT:
13151 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13152 case AMDGPU::G_INTRINSIC_TRUNC:
13153 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13154 case AMDGPU::G_FMA:
13155 case AMDGPU::G_FMAD:
13156 case AMDGPU::G_FSQRT:
13157 case AMDGPU::G_FDIV:
13158 case AMDGPU::G_FREM:
13159 case AMDGPU::G_FPOW:
13160 case AMDGPU::G_FPEXT:
13161 case AMDGPU::G_FLOG:
13162 case AMDGPU::G_FLOG2:
13163 case AMDGPU::G_FLOG10:
13164 case AMDGPU::G_FPTRUNC:
13165 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13166 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13167 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13168 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13169 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13171 case AMDGPU::G_FNEG:
13172 case AMDGPU::G_FABS:
13173 case AMDGPU::G_FCOPYSIGN:
13175 case AMDGPU::G_FMINNUM:
13176 case AMDGPU::G_FMAXNUM:
13177 case AMDGPU::G_FMINNUM_IEEE:
13178 case AMDGPU::G_FMAXNUM_IEEE:
13179 case AMDGPU::G_FMINIMUM:
13180 case AMDGPU::G_FMAXIMUM: {
13188 case AMDGPU::G_BUILD_VECTOR:
13193 case AMDGPU::G_INTRINSIC:
13194 case AMDGPU::G_INTRINSIC_CONVERGENT:
13196 case Intrinsic::amdgcn_fmul_legacy:
13197 case Intrinsic::amdgcn_fmad_ftz:
13198 case Intrinsic::amdgcn_sqrt:
13199 case Intrinsic::amdgcn_fmed3:
13200 case Intrinsic::amdgcn_sin:
13201 case Intrinsic::amdgcn_cos:
13202 case Intrinsic::amdgcn_log:
13203 case Intrinsic::amdgcn_exp2:
13204 case Intrinsic::amdgcn_log_clamp:
13205 case Intrinsic::amdgcn_rcp:
13206 case Intrinsic::amdgcn_rcp_legacy:
13207 case Intrinsic::amdgcn_rsq:
13208 case Intrinsic::amdgcn_rsq_clamp:
13209 case Intrinsic::amdgcn_rsq_legacy:
13210 case Intrinsic::amdgcn_div_scale:
13211 case Intrinsic::amdgcn_div_fmas:
13212 case Intrinsic::amdgcn_div_fixup:
13213 case Intrinsic::amdgcn_fract:
13214 case Intrinsic::amdgcn_cvt_pkrtz:
13215 case Intrinsic::amdgcn_cubeid:
13216 case Intrinsic::amdgcn_cubema:
13217 case Intrinsic::amdgcn_cubesc:
13218 case Intrinsic::amdgcn_cubetc:
13219 case Intrinsic::amdgcn_frexp_mant:
13220 case Intrinsic::amdgcn_fdot2:
13221 case Intrinsic::amdgcn_trig_preop:
13240 if (
C.isDenormal()) {
13254 if (
C.isSignaling()) {
13273 return Op.isUndef() || isa<ConstantFPSDNode>(
Op);
13277SITargetLowering::performFCanonicalizeCombine(
SDNode *
N,
13278 DAGCombinerInfo &DCI)
const {
13281 EVT VT =
N->getValueType(0);
13290 EVT VT =
N->getValueType(0);
13291 return getCanonicalConstantFP(DAG,
SDLoc(
N), VT, CFP->getValueAPF());
13307 EVT EltVT =
Lo.getValueType();
13310 for (
unsigned I = 0;
I != 2; ++
I) {
13314 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
13315 }
else if (
Op.isUndef()) {
13327 if (isa<ConstantFPSDNode>(NewElts[1]))
13328 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
13334 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
13386 if (!MinK || !MaxK)
13399 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->
hasMed3_16()))
13400 return DAG.
getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13441 if (
Info->getMode().DX10Clamp) {
13450 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->
hasMed3_16())) {
13482 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.
hasMin3Max3_16());
13491 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.
hasMin3Max3_16());
13500 DAGCombinerInfo &DCI)
const {
13503 EVT VT =
N->getValueType(0);
13504 unsigned Opc =
N->getOpcode();
13533 if (
SDValue Med3 = performIntMed3ImmCombine(
13538 if (
SDValue Med3 = performIntMed3ImmCombine(
13544 if (
SDValue Med3 = performIntMed3ImmCombine(
13549 if (
SDValue Med3 = performIntMed3ImmCombine(
13559 (VT == MVT::f32 || VT == MVT::f64 ||
13563 if (
SDValue Res = performFPMed3ImmCombine(DAG,
SDLoc(
N), Op0, Op1))
13574 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13575 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13584 DAGCombinerInfo &DCI)
const {
13585 EVT VT =
N->getValueType(0);
13608 if (
Info->getMode().DX10Clamp) {
13611 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13614 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13617 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13628 DAGCombinerInfo &DCI)
const {
13632 return DCI.DAG.getUNDEF(
N->getValueType(0));
13640 bool IsDivergentIdx,
13645 unsigned VecSize = EltSize * NumElem;
13648 if (VecSize <= 64 && EltSize < 32)
13657 if (IsDivergentIdx)
13661 unsigned NumInsts = NumElem +
13662 ((EltSize + 31) / 32) * NumElem ;
13667 return NumInsts <= 16;
13672 return NumInsts <= 15;
13679 if (isa<ConstantSDNode>(
Idx))
13693SITargetLowering::performExtractVectorEltCombine(
SDNode *
N,
13694 DAGCombinerInfo &DCI)
const {
13700 EVT ResVT =
N->getValueType(0);
13719 if (Vec.
hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13747 DCI.AddToWorklist(Elt0.
getNode());
13748 DCI.AddToWorklist(Elt1.
getNode());
13770 if (!DCI.isBeforeLegalize())
13776 auto *
Idx = dyn_cast<ConstantSDNode>(
N->getOperand(1));
13777 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.
isByteSized() &&
13778 VecSize > 32 && VecSize % 32 == 0 &&
Idx) {
13781 unsigned BitIndex =
Idx->getZExtValue() * VecEltSize;
13782 unsigned EltIdx = BitIndex / 32;
13783 unsigned LeftoverBitIdx = BitIndex % 32;
13787 DCI.AddToWorklist(Cast.
getNode());
13791 DCI.AddToWorklist(Elt.
getNode());
13794 DCI.AddToWorklist(Srl.
getNode());
13798 DCI.AddToWorklist(Trunc.
getNode());
13800 if (VecEltVT == ResVT) {
13812SITargetLowering::performInsertVectorEltCombine(
SDNode *
N,
13813 DAGCombinerInfo &DCI)
const {
13827 EVT IdxVT =
Idx.getValueType();
13844 Src.getOperand(0).getValueType() == MVT::f16) {
13845 return Src.getOperand(0);
13848 if (
auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13849 APFloat Val = CFP->getValueAPF();
13850 bool LosesInfo =
true;
13860 DAGCombinerInfo &DCI)
const {
13862 "combine only useful on gfx8");
13864 SDValue TruncSrc =
N->getOperand(0);
13865 EVT VT =
N->getValueType(0);
13866 if (VT != MVT::f16)
13904unsigned SITargetLowering::getFusedOpcode(
const SelectionDAG &DAG,
13906 const SDNode *N1)
const {
13911 if (((VT == MVT::f32 &&
13913 (VT == MVT::f16 && Subtarget->
hasMadF16() &&
13933 EVT VT =
N->getValueType(0);
13934 if (VT != MVT::i32 && VT != MVT::i64)
13940 unsigned Opc =
N->getOpcode();
13963 return DAG.
getNode(Opc, SL, VT, Add1, Op2);
13995 if (!Const ||
Hi_32(Const->getZExtValue()) !=
uint32_t(-1))
14014 DAGCombinerInfo &DCI)
const {
14018 EVT VT =
N->getValueType(0);
14028 if (!
N->isDivergent() && Subtarget->
hasSMulHi())
14032 if (NumBits <= 32 || NumBits > 64)
14044 unsigned NumUsers = 0;
14072 bool MulSignedLo =
false;
14073 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
14082 if (VT != MVT::i64) {
14105 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
14107 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
14108 auto [AccumLo, AccumHi] = DAG.
SplitScalar(Accum, SL, MVT::i32, MVT::i32);
14110 if (!MulLHSUnsigned32) {
14117 if (!MulRHSUnsigned32) {
14128 if (VT != MVT::i64)
14134SITargetLowering::foldAddSub64WithZeroLowBitsTo32(
SDNode *
N,
14135 DAGCombinerInfo &DCI)
const {
14137 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14161 DAG.
getNode(
N->getOpcode(), SL, MVT::i32,
Hi, ConstHi32,
N->getFlags());
14172static std::optional<ByteProvider<SDValue>>
14175 if (!Byte0 || Byte0->isConstantZero()) {
14176 return std::nullopt;
14179 if (Byte1 && !Byte1->isConstantZero()) {
14180 return std::nullopt;
14186 unsigned FirstCs =
First & 0x0c0c0c0c;
14187 unsigned SecondCs = Second & 0x0c0c0c0c;
14188 unsigned FirstNoCs =
First & ~0x0c0c0c0c;
14189 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14191 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14192 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14193 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14194 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14196 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
14220 for (
int BPI = 0; BPI < 2; BPI++) {
14223 BPP = {Src1, Src0};
14225 unsigned ZeroMask = 0x0c0c0c0c;
14226 unsigned FMask = 0xFF << (8 * (3 - Step));
14228 unsigned FirstMask =
14229 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14230 unsigned SecondMask =
14231 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14235 int FirstGroup = -1;
14236 for (
int I = 0;
I < 2;
I++) {
14238 auto MatchesFirst = [&BPP](
DotSrc &IterElt) {
14239 return IterElt.SrcOp == *BPP.first.Src &&
14240 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
14250 if (FirstGroup != -1) {
14252 auto MatchesSecond = [&BPP](
DotSrc &IterElt) {
14253 return IterElt.SrcOp == *BPP.second.Src &&
14254 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
14260 Srcs.
push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
14268 unsigned ZeroMask = 0x0c0c0c0c;
14269 unsigned FMask = 0xFF << (8 * (3 - Step));
14273 ((Src0.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14277 ((Src1.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14286 if (Srcs.
size() == 1) {
14287 auto *Elt = Srcs.
begin();
14291 if (Elt->PermMask == 0x3020100)
14298 auto *FirstElt = Srcs.
begin();
14299 auto *SecondElt = std::next(FirstElt);
14306 auto FirstMask = FirstElt->PermMask;
14307 auto SecondMask = SecondElt->PermMask;
14309 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
14310 unsigned FirstPlusFour = FirstMask | 0x04040404;
14313 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
14325 FirstElt = std::next(SecondElt);
14326 if (FirstElt == Srcs.
end())
14329 SecondElt = std::next(FirstElt);
14332 if (SecondElt == Srcs.
end()) {
14338 DAG.
getConstant(FirstElt->PermMask, SL, MVT::i32)));
14344 return Perms.
size() == 2
14350 for (
auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14351 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14352 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14353 EntryMask += ZeroMask;
14358 auto Opcode =
Op.getOpcode();
14364static std::optional<bool>
14375 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14378 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14380 assert(!(S0IsUnsigned && S0IsSigned));
14381 assert(!(S1IsUnsigned && S1IsSigned));
14389 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14395 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14396 return std::nullopt;
14408 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14409 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14414 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14420 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14421 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14422 return std::nullopt;
14428 DAGCombinerInfo &DCI)
const {
14430 EVT VT =
N->getValueType(0);
14437 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
14442 if (
SDValue V = reassociateScalarOps(
N, DAG)) {
14446 if (VT == MVT::i64) {
14447 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
14454 std::optional<bool> IsSigned;
14460 int ChainLength = 0;
14461 for (
int I = 0;
I < 4;
I++) {
14462 auto MulIdx =
isMul(LHS) ? 0 :
isMul(RHS) ? 1 : -1;
14465 auto Src0 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14468 auto Src1 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14473 TempNode->getOperand(MulIdx), *Src0, *Src1,
14474 TempNode->getOperand(MulIdx)->getOperand(0),
14475 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14479 IsSigned = *IterIsSigned;
14480 if (*IterIsSigned != *IsSigned)
14483 auto AddIdx = 1 - MulIdx;
14486 if (
I == 2 &&
isMul(TempNode->getOperand(AddIdx))) {
14487 Src2s.
push_back(TempNode->getOperand(AddIdx));
14497 TempNode->getOperand(AddIdx), *Src0, *Src1,
14498 TempNode->getOperand(AddIdx)->getOperand(0),
14499 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14503 if (*IterIsSigned != *IsSigned)
14507 ChainLength =
I + 2;
14511 TempNode = TempNode->getOperand(AddIdx);
14513 ChainLength =
I + 1;
14514 if (TempNode->getNumOperands() < 2)
14516 LHS = TempNode->getOperand(0);
14517 RHS = TempNode->getOperand(1);
14520 if (ChainLength < 2)
14526 if (ChainLength < 4) {
14536 bool UseOriginalSrc =
false;
14537 if (ChainLength == 4 && Src0s.
size() == 1 && Src1s.
size() == 1 &&
14538 Src0s.
begin()->PermMask == Src1s.
begin()->PermMask &&
14539 Src0s.
begin()->SrcOp.getValueSizeInBits() >= 32 &&
14540 Src1s.
begin()->SrcOp.getValueSizeInBits() >= 32) {
14542 auto Src0Mask = Src0s.
begin()->PermMask;
14543 SrcBytes.
push_back(Src0Mask & 0xFF000000);
14544 bool UniqueEntries =
true;
14545 for (
auto I = 1;
I < 4;
I++) {
14546 auto NextByte = Src0Mask & (0xFF << ((3 -
I) * 8));
14549 UniqueEntries =
false;
14555 if (UniqueEntries) {
14556 UseOriginalSrc =
true;
14558 auto *FirstElt = Src0s.
begin();
14562 auto *SecondElt = Src1s.
begin();
14564 SecondElt->DWordOffset);
14573 if (!UseOriginalSrc) {
14580 DAG.
getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14583 : Intrinsic::amdgcn_udot4,
14593 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14598 unsigned Opc =
LHS.getOpcode();
14603 Opc =
RHS.getOpcode();
14610 auto Cond =
RHS.getOperand(0);
14618 return DAG.
getNode(Opc, SL, VTList, Args);
14632 DAGCombinerInfo &DCI)
const {
14634 EVT VT =
N->getValueType(0);
14636 if (VT == MVT::i64) {
14637 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
14641 if (VT != MVT::i32)
14650 unsigned Opc =
RHS.getOpcode();
14657 auto Cond =
RHS.getOperand(0);
14665 return DAG.
getNode(Opc, SL, VTList, Args);
14680SITargetLowering::performAddCarrySubCarryCombine(
SDNode *
N,
14681 DAGCombinerInfo &DCI)
const {
14683 if (
N->getValueType(0) != MVT::i32)
14694 unsigned LHSOpc =
LHS.getOpcode();
14695 unsigned Opc =
N->getOpcode();
14705 DAGCombinerInfo &DCI)
const {
14710 EVT VT =
N->getValueType(0);
14722 if (
A ==
LHS.getOperand(1)) {
14723 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
14724 if (FusedOp != 0) {
14726 return DAG.
getNode(FusedOp, SL, VT,
A, Two, RHS);
14734 if (
A ==
RHS.getOperand(1)) {
14735 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
14736 if (FusedOp != 0) {
14738 return DAG.
getNode(FusedOp, SL, VT,
A, Two, LHS);
14747 DAGCombinerInfo &DCI)
const {
14753 EVT VT =
N->getValueType(0);
14766 if (
A ==
LHS.getOperand(1)) {
14767 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
14768 if (FusedOp != 0) {
14772 return DAG.
getNode(FusedOp, SL, VT,
A, Two, NegRHS);
14781 if (
A ==
RHS.getOperand(1)) {
14782 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
14783 if (FusedOp != 0) {
14785 return DAG.
getNode(FusedOp, SL, VT,
A, NegTwo, LHS);
14794 DAGCombinerInfo &DCI)
const {
14797 EVT VT =
N->getValueType(0);
14811 bool IsNegative =
false;
14812 if (CLHS->isExactlyValue(1.0) ||
14813 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14829 DAGCombinerInfo &DCI)
const {
14831 EVT VT =
N->getValueType(0);
14845 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
14860 if (ScalarVT == MVT::f32 &&
14866 if (TrueNodeExpVal == INT_MIN)
14869 if (FalseNodeExpVal == INT_MIN)
14889 DAGCombinerInfo &DCI)
const {
14891 EVT VT =
N->getValueType(0);
14912 (
N->getFlags().hasAllowContract() &&
14913 FMA->getFlags().hasAllowContract())) {
14947 if (Vec1 == Vec2 || Vec3 == Vec4)
14953 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
14962 DAGCombinerInfo &DCI)
const {
14968 EVT VT =
LHS.getValueType();
14971 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14973 CRHS = dyn_cast<ConstantSDNode>(LHS);
14997 return LHS.getOperand(0);
15003 isa<ConstantSDNode>(
LHS.getOperand(1)) &&
15004 isa<ConstantSDNode>(
LHS.getOperand(2)) &&
15005 LHS.getConstantOperandVal(1) !=
LHS.getConstantOperandVal(2) &&
15012 const APInt &CT =
LHS.getConstantOperandAPInt(1);
15013 const APInt &CF =
LHS.getConstantOperandAPInt(2);
15021 return LHS.getOperand(0);
15025 if (VT != MVT::f32 && VT != MVT::f64 &&
15041 const unsigned IsInfMask =
15043 const unsigned IsFiniteMask =
15057SITargetLowering::performCvtF32UByteNCombine(
SDNode *
N,
15058 DAGCombinerInfo &DCI)
const {
15076 if (
auto *
C = dyn_cast<ConstantSDNode>(Shift.
getOperand(1))) {
15080 unsigned ShiftOffset = 8 *
Offset;
15082 ShiftOffset -=
C->getZExtValue();
15084 ShiftOffset +=
C->getZExtValue();
15086 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
15088 MVT::f32, Shifted);
15099 DCI.AddToWorklist(
N);
15106 return DAG.
getNode(
N->getOpcode(), SL, MVT::f32, DemandedSrc);
15112 DAGCombinerInfo &DCI)
const {
15122 return DCI.DAG.getConstantFP(Zero,
SDLoc(
N),
N->getValueType(0));
15125 APFloat One(
F.getSemantics(),
"1.0");
15127 return DCI.DAG.getConstantFP(One,
SDLoc(
N),
N->getValueType(0));
15134 switch (
N->getOpcode()) {
15150 if (
auto Res = promoteUniformOpToI32(
SDValue(
N, 0), DCI))
15160 switch (
N->getOpcode()) {
15162 return performAddCombine(
N, DCI);
15164 return performSubCombine(
N, DCI);
15167 return performAddCarrySubCarryCombine(
N, DCI);
15169 return performFAddCombine(
N, DCI);
15171 return performFSubCombine(
N, DCI);
15173 return performFDivCombine(
N, DCI);
15175 return performFMulCombine(
N, DCI);
15177 return performSetCCCombine(
N, DCI);
15190 return performMinMaxCombine(
N, DCI);
15192 return performFMACombine(
N, DCI);
15194 return performAndCombine(
N, DCI);
15196 return performOrCombine(
N, DCI);
15199 if (
N->getValueType(0) == MVT::i32 &&
N->isDivergent() &&
15200 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
15206 return performXorCombine(
N, DCI);
15208 return performZeroExtendCombine(
N, DCI);
15210 return performSignExtendInRegCombine(
N, DCI);
15212 return performClassCombine(
N, DCI);
15214 return performFCanonicalizeCombine(
N, DCI);
15216 return performRcpCombine(
N, DCI);
15231 return performUCharToFloatCombine(
N, DCI);
15233 return performFCopySignCombine(
N, DCI);
15238 return performCvtF32UByteNCombine(
N, DCI);
15240 return performFMed3Combine(
N, DCI);
15242 return performCvtPkRTZCombine(
N, DCI);
15244 return performClampCombine(
N, DCI);
15247 EVT VT =
N->getValueType(0);
15250 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
15253 EVT EltVT = Src.getValueType();
15254 if (EltVT != MVT::i16)
15264 return performExtractVectorEltCombine(
N, DCI);
15266 return performInsertVectorEltCombine(
N, DCI);
15268 return performFPRoundCombine(
N, DCI);
15270 if (
SDValue Widened = widenLoad(cast<LoadSDNode>(
N), DCI))
15276 if (
MemSDNode *MemNode = dyn_cast<MemSDNode>(
N))
15277 return performMemSDNodeCombine(MemNode, DCI);
15308 unsigned Opcode =
Node->getMachineOpcode();
15311 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
15312 if (D16Idx >= 0 &&
Node->getConstantOperandVal(D16Idx))
15317 unsigned DmaskIdx =
15318 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
15319 unsigned OldDmask =
Node->getConstantOperandVal(DmaskIdx);
15320 unsigned NewDmask = 0;
15321 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
15322 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
15323 bool UsesTFC = ((int(TFEIdx) >= 0 &&
Node->getConstantOperandVal(TFEIdx)) ||
15324 (
int(LWEIdx) >= 0 &&
Node->getConstantOperandVal(LWEIdx)))
15327 unsigned TFCLane = 0;
15328 bool HasChain =
Node->getNumValues() > 1;
15330 if (OldDmask == 0) {
15338 TFCLane = OldBitsSet;
15345 if (
Use.getResNo() != 0)
15351 if (!
User->isMachineOpcode() ||
15352 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
15364 if (UsesTFC && Lane == TFCLane) {
15369 for (
unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
15371 Dmask &= ~(1 << Comp);
15379 NewDmask |= 1 << Comp;
15384 bool NoChannels = !NewDmask;
15391 if (OldBitsSet == 1)
15397 if (NewDmask == OldDmask)
15406 unsigned NewChannels = BitsSet + UsesTFC;
15410 assert(NewOpcode != -1 &&
15411 NewOpcode !=
static_cast<int>(
Node->getMachineOpcode()) &&
15412 "failed to find equivalent MIMG op");
15420 MVT SVT =
Node->getValueType(0).getVectorElementType().getSimpleVT();
15422 MVT ResultVT = NewChannels == 1
15425 : NewChannels == 5 ? 8
15439 if (NewChannels == 1) {
15449 for (
unsigned i = 0,
Idx = AMDGPU::sub0; i < 5; ++i) {
15454 if (i || !NoChannels)
15459 if (NewUser !=
User) {
15469 Idx = AMDGPU::sub1;
15472 Idx = AMDGPU::sub2;
15475 Idx = AMDGPU::sub3;
15478 Idx = AMDGPU::sub4;
15489 Op =
Op.getOperand(0);
15491 return isa<FrameIndexSDNode>(
Op);
15501 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15502 SDValue SrcVal = Node->getOperand(2);
15510 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15512 SDNode *Glued = Node->getGluedNode();
15514 Node->getOperand(0), SL, VReg, SrcVal,
15520 return ToResultReg.
getNode();
15525 for (
unsigned i = 0; i < Node->getNumOperands(); ++i) {
15533 Node->getOperand(i).getValueType(),
15534 Node->getOperand(i)),
15546 unsigned Opcode = Node->getMachineOpcode();
15548 if (
TII->isImage(Opcode) && !
TII->get(Opcode).mayStore() &&
15549 !
TII->isGather4(Opcode) &&
15551 return adjustWritemask(Node, DAG);
15554 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
15560 case AMDGPU::V_DIV_SCALE_F32_e64:
15561 case AMDGPU::V_DIV_SCALE_F64_e64: {
15565 SDValue Src0 = Node->getOperand(1);
15566 SDValue Src1 = Node->getOperand(3);
15567 SDValue Src2 = Node->getOperand(5);
15571 (Src0 == Src1 || Src0 == Src2))
15627 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::vdata);
15628 unsigned InitIdx = 0;
15630 if (
TII->isImage(
MI)) {
15638 unsigned TFEVal = TFE ? TFE->
getImm() : 0;
15639 unsigned LWEVal = LWE ? LWE->
getImm() : 0;
15640 unsigned D16Val = D16 ? D16->getImm() : 0;
15642 if (!TFEVal && !LWEVal)
15653 assert(MO_Dmask &&
"Expected dmask operand in instruction");
15655 unsigned dmask = MO_Dmask->
getImm();
15662 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15668 TRI.getRegSizeInBits(*
TII->getOpRegClass(
MI, DstIdx)) / 32;
15669 if (DstSize < InitIdx)
15672 InitIdx =
TRI.getRegSizeInBits(*
TII->getOpRegClass(
MI, DstIdx)) / 32;
15680 Register PrevDst =
MRI.cloneVirtualRegister(
MI.getOperand(DstIdx).getReg());
15681 unsigned NewDst = 0;
15690 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15691 NewDst =
MRI.createVirtualRegister(
TII->getOpRegClass(
MI, DstIdx));
15711 MI.tieOperands(DstIdx,
MI.getNumOperands() - 1);
15724 if (
TII->isVOP3(
MI.getOpcode())) {
15726 TII->legalizeOperandsVOP3(
MRI,
MI);
15731 if (!
MI.getDesc().operands().empty()) {
15732 unsigned Opc =
MI.getOpcode();
15733 bool HasAGPRs =
Info->mayNeedAGPRs();
15735 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15737 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15738 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15741 if ((
I == Src2Idx) && (HasAGPRs))
15744 if (!
Op.isReg() || !
Op.getReg().isVirtual())
15746 auto *RC =
TRI->getRegClassForReg(
MRI,
Op.getReg());
15747 if (!
TRI->hasAGPRs(RC))
15749 auto *Src =
MRI.getUniqueVRegDef(
Op.getReg());
15750 if (!Src || !Src->isCopy() ||
15751 !
TRI->isSGPRReg(
MRI, Src->getOperand(1).getReg()))
15753 auto *NewRC =
TRI->getEquivalentVGPRClass(RC);
15757 MRI.setRegClass(
Op.getReg(), NewRC);
15760 if (
TII->isMAI(
MI)) {
15765 int Src0Idx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
15766 AMDGPU::OpName::scale_src0);
15767 if (Src0Idx != -1) {
15768 int Src1Idx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
15769 AMDGPU::OpName::scale_src1);
15770 if (
TII->usesConstantBus(
MRI,
MI, Src0Idx) &&
15771 TII->usesConstantBus(
MRI,
MI, Src1Idx))
15772 TII->legalizeOpWithMove(
MI, Src1Idx);
15780 if (
auto *Src2 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src2)) {
15781 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15782 auto *RC =
TRI->getRegClassForReg(
MRI, Src2->getReg());
15783 if (
TRI->isVectorSuperClass(RC)) {
15784 auto *NewRC =
TRI->getEquivalentAGPRClass(RC);
15785 MRI.setRegClass(Src2->getReg(), NewRC);
15786 if (Src2->isTied())
15787 MRI.setRegClass(
MI.getOperand(0).getReg(), NewRC);
15796 if (
TII->isImage(
MI))
15797 TII->enforceOperandRCAlignment(
MI, AMDGPU::OpName::vaddr);
15871std::pair<unsigned, const TargetRegisterClass *>
15878 if (Constraint.
size() == 1) {
15880 switch (Constraint[0]) {
15887 RC = &AMDGPU::SReg_32RegClass;
15890 RC = &AMDGPU::SGPR_64RegClass;
15895 return std::pair(0U,
nullptr);
15902 RC = &AMDGPU::VGPR_32RegClass;
15907 return std::pair(0U,
nullptr);
15916 RC = &AMDGPU::AGPR_32RegClass;
15921 return std::pair(0U,
nullptr);
15930 return std::pair(0U, RC);
15935 if (
RegName.consume_front(
"v")) {
15936 RC = &AMDGPU::VGPR_32RegClass;
15937 }
else if (
RegName.consume_front(
"s")) {
15938 RC = &AMDGPU::SGPR_32RegClass;
15939 }
else if (
RegName.consume_front(
"a")) {
15940 RC = &AMDGPU::AGPR_32RegClass;
15945 if (
RegName.consume_front(
"[")) {
15956 return std::pair(0U,
nullptr);
15959 RC =
TRI->getVGPRClassForBitWidth(Width);
15961 RC =
TRI->getSGPRClassForBitWidth(Width);
15963 RC =
TRI->getAGPRClassForBitWidth(Width);
15965 Reg =
TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15970 return std::pair(0U,
nullptr);
15972 return std::pair(Reg, RC);
15978 return std::pair(0U,
nullptr);
15980 if (!
Failed && Idx < RC->getNumRegs())
15988 Ret.second =
TRI->getPhysRegBaseClass(Ret.first);
15994 if (Constraint.
size() == 1) {
15995 switch (Constraint[0]) {
16005 }
else if (Constraint ==
"DA" || Constraint ==
"DB") {
16013 if (Constraint.
size() == 1) {
16014 switch (Constraint[0]) {
16031 Val = Val & maskTrailingOnes<uint64_t>(
Size);
16038 std::vector<SDValue> &Ops,
16053 unsigned Size =
Op.getScalarValueSizeInBits();
16061 Val =
C->getSExtValue();
16065 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
16071 if (
Op.getOperand(0).isUndef() ||
Op.getOperand(1).isUndef())
16074 Val =
C->getSExtValue();
16078 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
16088 if (Constraint.
size() == 1) {
16089 switch (Constraint[0]) {
16093 return isInt<16>(Val);
16097 return isInt<32>(Val);
16104 }
else if (Constraint.
size() == 2) {
16105 if (Constraint ==
"DA") {
16106 int64_t HiBits =
static_cast<int32_t
>(Val >> 32);
16107 int64_t LoBits =
static_cast<int32_t
>(Val);
16111 if (Constraint ==
"DB") {
16119 unsigned MaxSize)
const {
16120 unsigned Size = std::min<unsigned>(
Op.getScalarValueSizeInBits(), MaxSize);
16123 MVT VT =
Op.getSimpleValueType();
16148 switch (UnalignedClassID) {
16149 case AMDGPU::VReg_64RegClassID:
16150 return AMDGPU::VReg_64_Align2RegClassID;
16151 case AMDGPU::VReg_96RegClassID:
16152 return AMDGPU::VReg_96_Align2RegClassID;
16153 case AMDGPU::VReg_128RegClassID:
16154 return AMDGPU::VReg_128_Align2RegClassID;
16155 case AMDGPU::VReg_160RegClassID:
16156 return AMDGPU::VReg_160_Align2RegClassID;
16157 case AMDGPU::VReg_192RegClassID:
16158 return AMDGPU::VReg_192_Align2RegClassID;
16159 case AMDGPU::VReg_224RegClassID:
16160 return AMDGPU::VReg_224_Align2RegClassID;
16161 case AMDGPU::VReg_256RegClassID:
16162 return AMDGPU::VReg_256_Align2RegClassID;
16163 case AMDGPU::VReg_288RegClassID:
16164 return AMDGPU::VReg_288_Align2RegClassID;
16165 case AMDGPU::VReg_320RegClassID:
16166 return AMDGPU::VReg_320_Align2RegClassID;
16167 case AMDGPU::VReg_352RegClassID:
16168 return AMDGPU::VReg_352_Align2RegClassID;
16169 case AMDGPU::VReg_384RegClassID:
16170 return AMDGPU::VReg_384_Align2RegClassID;
16171 case AMDGPU::VReg_512RegClassID:
16172 return AMDGPU::VReg_512_Align2RegClassID;
16173 case AMDGPU::VReg_1024RegClassID:
16174 return AMDGPU::VReg_1024_Align2RegClassID;
16175 case AMDGPU::AReg_64RegClassID:
16176 return AMDGPU::AReg_64_Align2RegClassID;
16177 case AMDGPU::AReg_96RegClassID:
16178 return AMDGPU::AReg_96_Align2RegClassID;
16179 case AMDGPU::AReg_128RegClassID:
16180 return AMDGPU::AReg_128_Align2RegClassID;
16181 case AMDGPU::AReg_160RegClassID:
16182 return AMDGPU::AReg_160_Align2RegClassID;
16183 case AMDGPU::AReg_192RegClassID:
16184 return AMDGPU::AReg_192_Align2RegClassID;
16185 case AMDGPU::AReg_256RegClassID:
16186 return AMDGPU::AReg_256_Align2RegClassID;
16187 case AMDGPU::AReg_512RegClassID:
16188 return AMDGPU::AReg_512_Align2RegClassID;
16189 case AMDGPU::AReg_1024RegClassID:
16190 return AMDGPU::AReg_1024_Align2RegClassID;
16206 if (
Info->isEntryFunction()) {
16213 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
16215 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
16216 :
TRI->getAlignedHighSGPRForRC(MF, 2,
16217 &AMDGPU::SGPR_64RegClass);
16218 Info->setSGPRForEXECCopy(SReg);
16221 Info->getStackPtrOffsetReg()));
16222 if (
Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
16223 MRI.replaceRegWith(AMDGPU::SP_REG,
Info->getStackPtrOffsetReg());
16227 if (
Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
16228 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG,
Info->getScratchRSrcReg());
16230 if (
Info->getFrameOffsetReg() != AMDGPU::FP_REG)
16231 MRI.replaceRegWith(AMDGPU::FP_REG,
Info->getFrameOffsetReg());
16233 Info->limitOccupancy(MF);
16235 if (ST.isWave32() && !MF.
empty()) {
16236 for (
auto &
MBB : MF) {
16237 for (
auto &
MI :
MBB) {
16238 TII->fixImplicitOperands(
MI);
16248 if (ST.needsAlignedVGPRs()) {
16249 for (
unsigned I = 0, E =
MRI.getNumVirtRegs();
I != E; ++
I) {
16255 if (NewClassID != -1)
16256 MRI.setRegClass(Reg,
TRI->getRegClass(NewClassID));
16265 const APInt &DemandedElts,
16267 unsigned Depth)
const {
16269 unsigned Opc =
Op.getOpcode();
16272 unsigned IID =
Op.getConstantOperandVal(0);
16274 case Intrinsic::amdgcn_mbcnt_lo:
16275 case Intrinsic::amdgcn_mbcnt_hi: {
16281 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
16291 Op, Known, DemandedElts, DAG,
Depth);
16306 unsigned MaxValue =
16315 switch (
MI->getOpcode()) {
16316 case AMDGPU::G_INTRINSIC:
16317 case AMDGPU::G_INTRINSIC_CONVERGENT: {
16320 case Intrinsic::amdgcn_workitem_id_x:
16323 case Intrinsic::amdgcn_workitem_id_y:
16326 case Intrinsic::amdgcn_workitem_id_z:
16329 case Intrinsic::amdgcn_mbcnt_lo:
16330 case Intrinsic::amdgcn_mbcnt_hi: {
16342 case Intrinsic::amdgcn_groupstaticsize: {
16353 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
16356 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
16359 case AMDGPU::G_AMDGPU_SMED3:
16360 case AMDGPU::G_AMDGPU_UMED3: {
16361 auto [Dst, Src0, Src1, Src2] =
MI->getFirst4Regs();
16388 unsigned Depth)
const {
16390 if (
auto *GI = dyn_cast<GIntrinsic>(
MI)) {
16396 if (
MaybeAlign RetAlign = Attrs.getRetAlignment())
16423 if (Header->getAlignment() != PrefAlign)
16424 return Header->getAlignment();
16426 unsigned LoopSize = 0;
16434 LoopSize +=
TII->getInstSizeInBytes(
MI);
16435 if (LoopSize > 192)
16440 if (LoopSize <= 64)
16443 if (LoopSize <= 128)
16444 return CacheLineAlign;
16450 auto I = Exit->getFirstNonDebugInstr();
16451 if (
I != Exit->end() &&
I->getOpcode() == AMDGPU::S_INST_PREFETCH)
16452 return CacheLineAlign;
16461 if (PreTerm == Pre->
begin() ||
16462 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
16466 auto ExitHead = Exit->getFirstNonDebugInstr();
16467 if (ExitHead == Exit->end() ||
16468 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
16473 return CacheLineAlign;
16481 N =
N->getOperand(0).getNode();
16491 switch (
N->getOpcode()) {
16499 if (Reg.isPhysical() ||
MRI.isLiveIn(Reg))
16500 return !
TRI->isSGPRReg(
MRI, Reg);
16502 if (
const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
16506 return !
TRI->isSGPRReg(
MRI, Reg);
16510 unsigned AS = L->getAddressSpace();
16541 if (
auto *
A = dyn_cast<AtomicSDNode>(
N)) {
16543 return A->readMem() &&
A->writeMem();
16578 unsigned Depth)
const {
16583 if (
Info->getMode().DX10Clamp)
16595 if (RMW->
hasMetadata(
"amdgpu.ignore.denormal.mode"))
16615 <<
"Hardware instruction generated for atomic "
16617 <<
" operation at memory scope " << MemScope;
16621 if (
auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16622 Type *EltTy = VT->getElementType();
16623 return VT->getNumElements() == 2 &&
16642 if (
auto *
IT = dyn_cast<IntegerType>(Ty)) {
16643 unsigned BW =
IT->getBitWidth();
16644 return BW == 32 || BW == 64;
16656 if (
PointerType *PT = dyn_cast<PointerType>(Ty)) {
16658 unsigned BW =
DL.getPointerSizeInBits(PT->getAddressSpace());
16659 return BW == 32 || BW == 64;
16666 return VT->getNumElements() == 2 &&
16667 VT->getElementType()->getPrimitiveSizeInBits() == 16;
16677 bool HasSystemScope) {
16684 if (HasSystemScope) {
16691 return RMW->
hasMetadata(
"amdgpu.no.fine.grained.memory");
16704 const MDNode *NoaliasAddrSpaceMD =
16705 I->getMetadata(LLVMContext::MD_noalias_addrspace);
16706 if (!NoaliasAddrSpaceMD)
16709 for (
unsigned I = 0, E = NoaliasAddrSpaceMD->
getNumOperands() / 2;
I != E;
16711 auto *
Low = mdconst::extract<ConstantInt>(
16714 auto *
High = mdconst::extract<ConstantInt>(
16736 DL.getTypeSizeInBits(RMW->
getType()) == 64 &&
16749 bool HasSystemScope =
16936 if (HasSystemScope)
16988 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16989 return Subtarget->
isWave64() ? &AMDGPU::SReg_64RegClass
16990 : &AMDGPU::SReg_32RegClass;
16991 if (!
TRI->isSGPRClass(RC) && !isDivergent)
16992 return TRI->getEquivalentSGPRClass(RC);
16993 if (
TRI->isSGPRClass(RC) && isDivergent)
16994 return TRI->getEquivalentVGPRClass(RC);
17006 unsigned WaveSize) {
17011 if (!
IT ||
IT->getBitWidth() != WaveSize)
17014 if (!isa<Instruction>(V))
17016 if (!Visited.
insert(V).second)
17018 bool Result =
false;
17019 for (
const auto *U : V->users()) {
17020 if (
const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
17021 if (V == U->getOperand(1)) {
17022 switch (Intrinsic->getIntrinsicID()) {
17026 case Intrinsic::amdgcn_if_break:
17027 case Intrinsic::amdgcn_if:
17028 case Intrinsic::amdgcn_else:
17033 if (V == U->getOperand(0)) {
17034 switch (Intrinsic->getIntrinsicID()) {
17038 case Intrinsic::amdgcn_end_cf:
17039 case Intrinsic::amdgcn_loop:
17045 Result =
hasCFUser(U, Visited, WaveSize);
17054 const Value *V)
const {
17055 if (
const CallInst *CI = dyn_cast<CallInst>(V)) {
17056 if (CI->isInlineAsm()) {
17065 for (
auto &TC : TargetConstraints) {
17107 return MRI.hasOneNonDBGUse(N0);
17114 if (
I.getMetadata(
"amdgpu.noclobber"))
17116 if (
I.getMetadata(
"amdgpu.last.use"))
17126 if (!Def->isMachineOpcode())
17136 if (
II.isCompare() &&
II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
17137 PhysReg = AMDGPU::SCC;
17139 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
17148 if (!
I->hasOneUse())
17154 switch (
I->getOpcode()) {
17155 case Instruction::FMul: {
17156 if (
User->getOpcode() != Instruction::FSub &&
17157 User->getOpcode() != Instruction::FAdd)
17162 return ((!
I->hasAllowContract() || !
User->hasAllowContract()) &&
17221 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
17232 Alignment = RMW->getAlign();
17247 RMW->getType()->isFloatTy();
17250 bool ReturnValueIsUsed = !AI->
use_empty();
17259 if (FullFlatEmulation) {
17270 std::prev(BB->
end())->eraseFromParent();
17273 Value *LoadedShared =
nullptr;
17274 if (FullFlatEmulation) {
17276 Intrinsic::amdgcn_is_shared, {}, {
Addr},
nullptr,
"is.shared");
17277 Builder.
CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
17285 LoadedShared = Clone;
17292 Intrinsic::amdgcn_is_private, {}, {
Addr},
nullptr,
"is.private");
17300 Value *LoadedPrivate;
17303 RMW->getType(), CastToPrivate, RMW->getAlign(),
"loaded.private");
17306 LoadedPrivate, RMW->getValOperand());
17310 auto [ResultLoad, Equal] =
17325 if (FullFlatEmulation) {
17335 if (!FullFlatEmulation) {
17340 MDNode *RangeNotPrivate =
17343 LoadedGlobal->
setMetadata(LLVMContext::MD_noalias_addrspace,
17351 if (ReturnValueIsUsed) {
17354 if (FullFlatEmulation)
17369 if (
const auto *ConstVal = dyn_cast<Constant>(AI->
getValOperand());
17370 ConstVal && ConstVal->isNullValue()) {
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
iv Induction Variable Users
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
static bool isUndef(const MachineInstr &MI)
unsigned const TargetRegisterInfo * TRI
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metrics from passes.
#define STATISTIC(VARNAME, DESC)
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
bool hasCvtPkF16F32Inst() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-generated.
bool hasBF16ConversionInsts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to use 'custom' lowering.
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array, into the specified DAG.
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was registered to use 'custom' lowering for that result type.
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location, and, if it is, stores a new value there.
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
An instruction that atomically reads a memory location, combines it with another value, and then stores the result back.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set, or Regs.size() if they are all allocated.
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
bool isFPPredicate() const
bool isIntPredicate() const
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly not equal, like -0.0 and 0.0.
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment padding.
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this function.
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMemoryAtomicFaddF32DenormalSupport() const
bool hasD16Images() const
bool hasMinimum3Maximum3F32() const
bool useVGPRIndexMode() const
bool hasAtomicDsPkAdd16Insts() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool hasAtomicFMinFMaxF64FlatInsts() const
bool hasDot7Insts() const
bool hasApertureRegs() const
bool hasFlatInstOffsets() const
bool hasAtomicFMinFMaxF32FlatInsts() const
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (enable) bits.
bool hasGFX90AInsts() const
bool hasBCNT(unsigned Size) const
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
bool hasMultiDwordFlatScratchAddressing() const
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
const SIInstrInfo * getInstrInfo() const override
bool hasDot1Insts() const
bool hasAtomicFaddRtnInsts() const
Align getStackAlignment() const
bool hasScalarSubwordLoads() const
bool enableFlatScratch() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
bool supportsGetDoorbellID() const
bool hasFlatAtomicFaddF32Inst() const
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMinimum3Maximum3PKF16() const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
const SIFrameLowering * getFrameLowering() const override
bool hasMinimum3Maximum3F16() const
bool hasAtomicFMinFMaxF32GlobalInsts() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
bool getScalarizeGlobalBehavior() const
bool hasScalarSMulU64() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
bool hasAtomicFMinFMaxF64GlobalInsts() const
bool hasUnalignedScratchAccessEnabled() const
bool hasAtomicFlatPkAdd16Insts() const
bool hasUnalignedBufferAccessEnabled() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasImageGather4D16Bug() const
bool hasDot10Insts() const
bool supportsMinMaxDenormModes() const
bool hasAtomicFaddInsts() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
bool hasAtomicBufferPkAddBF16Inst() const
bool hasAtomicFaddNoRtnInsts() const
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDot8Insts() const
bool hasDS96AndDS128() const
bool useFlatForGlobal() const
Generation getGeneration() const
bool hasAtomicBufferGlobalPkAddF16Insts() const
bool hasScalarAddSub64() const
bool hasUnpackedD16VMem() const
bool hasAtomicGlobalPkAddBF16Inst() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
bool hasPackedTID() const
bool hasAddNoCarry() const
bool hasGWSAutoReplay() const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
int64_t getOffset() const
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
BasicBlock::iterator GetInsertPoint() const
BasicBlock * GetInsertBlock() const
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
LLVMContext & getContext() const
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
std::optional< StringRef > getSyncScopeName(SyncScope::ID Id) const
getSyncScopeName - Returns the name of a SyncScope::ID registered with LLVMContext,...
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
unsigned getNumOperands() const
Return number of MDNode operands.
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before 'Where'.
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual register for it.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
A Module instance is used to store all the information related to an LLVM module.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node N can be combined with a neighboring fmul/fadd to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, tests whether Op is known never to be any NaN; if SNaN is true, tests only whether Op is known never to be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
StringRef - Represent a constant reference to a string, i.e.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
constexpr size_t size() const
size - Get the string size.
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
A Use represents the edge between a Value definition and its users.
User * getUser() const
Returns the User that contains this Use.
unsigned getOperandNo() const
Return the operand # of this use in its User.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr bool isZero() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
@ SMULO
Same for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
Function * getDeclarationIfExists(Module *M, ID id, ArrayRef< Type * > Tys, FunctionType *FT=nullptr)
This version supports overloaded intrinsics.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ System
Synchronized with respect to all concurrently executing threads.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
int popcount(T Value) noexcept
Count the number of set bits in a value.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
testing::Matcher< const detail::ErrorHolder & > Failed()
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
bool isUnknown() const
Returns true if we don't know any bits.
void resetAll()
Resets the known state of all bits.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
This class contains a discriminated union of information about pointers in memory operands, and is used when the instruction selector needs to know about the value being loaded or stored.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise, pass NaN through.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const