#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"
    cl::desc("Do not align and prefetch loops"),
    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),
    cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
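// SITargetLowering constructor fragments: the calls below register Custom,
// Expand, and Legal operation actions per value type, keyed off subtarget
// features such as true16, packed FP32, VOP3P, and bf16 support.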
  if (Subtarget->has16BitInsts()) {
    if (Subtarget->useRealTrue16Insts()) {

      {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
       MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
       MVT::i1, MVT::v32i32},
      {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
       MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
       MVT::i1, MVT::v32i32},

      ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
      ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
      ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
      ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
      ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,

      {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

      {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
       MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
      {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
       MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
       MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
      {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
       MVT::v3i16, MVT::v4i16, MVT::Other},
      {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);

       {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
        MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
        MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
        MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
        MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {

      {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
       MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},

  if (Subtarget->hasPkMovB32()) {

      {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
       MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
      {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
      {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
       MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
       MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
       MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

  if (Subtarget->hasSMemRealTime() ||
  if (Subtarget->has16BitInsts()) {
  if (Subtarget->hasMadMacF32Insts())
  if (!Subtarget->hasBFI())
  if (!Subtarget->hasBCNT(32))
  if (!Subtarget->hasBCNT(64))
  if (Subtarget->hasFFBH())
  if (Subtarget->hasFFBL())
  if (Subtarget->hasBFE())
  if (Subtarget->hasIntClamp())
  if (Subtarget->hasAddNoCarry())

      {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
      {MVT::f32, MVT::f64}, Custom);
      {MVT::f32, MVT::f64}, Legal);

  if (Subtarget->haveRoundOpsF64())
  if (Subtarget->has16BitInsts()) {

      ISD::FSIN, ISD::FROUND},

  if (Subtarget->hasBF16TransInsts())

      {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
       MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
       MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
      {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
       MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
       MVT::v32f16, MVT::v32bf16},

      {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
      {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
      {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

      {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
       MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

  if (Subtarget->hasVOP3PInsts()) {

      {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
      {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
       MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
       MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})

      {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
      {MVT::v2f16, MVT::v4f16}, Custom);

  if (Subtarget->hasPackedFP32Ops()) {
      {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
  if (Subtarget->has16BitInsts()) {

      {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
       MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
       MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
       MVT::v32f16, MVT::v32bf16},

  if (Subtarget->hasVectorMulU64())
  else if (Subtarget->hasScalarSMulU64())
  if (Subtarget->hasMad64_32())
  if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
  if (Subtarget->hasIEEEMinimumMaximumInsts()) {
      {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
  if (Subtarget->hasMinimum3Maximum3F32())
  if (Subtarget->hasMinimum3Maximum3PKF16()) {
    if (!Subtarget->hasMinimum3Maximum3F16())
  if (Subtarget->hasVOP3PInsts()) {
      {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
  if (Subtarget->hasIntMinMax64())

      {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
       MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
      {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
       MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
       MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
       MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
      {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
       MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
       MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
       MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

  if (Subtarget->hasBF16ConversionInsts()) {
  if (Subtarget->hasBF16PackedInsts()) {
  if (Subtarget->hasBF16TransInsts()) {
  if (Subtarget->hasCvtPkF16F32Inst()) {
      {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
  if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())

      ISD::ATOMIC_CMP_SWAP,
      ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
      ISD::ATOMIC_LOAD_ADD,
      ISD::ATOMIC_LOAD_SUB,
      ISD::ATOMIC_LOAD_AND,
      ISD::ATOMIC_LOAD_OR,
      ISD::ATOMIC_LOAD_XOR,
      ISD::ATOMIC_LOAD_NAND,
      ISD::ATOMIC_LOAD_MIN,
      ISD::ATOMIC_LOAD_MAX,
      ISD::ATOMIC_LOAD_UMIN,
      ISD::ATOMIC_LOAD_UMAX,
      ISD::ATOMIC_LOAD_FADD,
      ISD::ATOMIC_LOAD_FMIN,
      ISD::ATOMIC_LOAD_FMAX,
      ISD::ATOMIC_LOAD_UINC_WRAP,
      ISD::ATOMIC_LOAD_UDEC_WRAP,

static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
                                       EVT DestVT, EVT SrcVT) const {
         ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
            (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
          (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&

                                       LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
         SrcTy.getScalarSizeInBits() == 16 &&

  if (Subtarget->has16BitInsts()) {
      return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
    return VT.isInteger() ? MVT::i32 : MVT::f32;

  return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
  if (Size == 16 && Subtarget->has16BitInsts())
    return (NumElts + 1) / 2;

  return NumElts * ((Size + 31) / 32);

    unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (Size == 16 && Subtarget->has16BitInsts()) {
    if (ScalarVT == MVT::bf16) {
      RegisterVT = MVT::i32;
      IntermediateVT = MVT::v2bf16;
      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;
    NumIntermediates = (NumElts + 1) / 2;
    return NumIntermediates;

    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

    RegisterVT = MVT::i16;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

  RegisterVT = MVT::i32;
  IntermediateVT = RegisterVT;
  NumIntermediates = NumElts * ((Size + 31) / 32);
  return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
                               unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

                               unsigned MaxNumLanes) {
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

    return MVT::amdgpuBufferFatPointer;
      DL.getPointerSizeInBits(AS) == 192)
    return MVT::amdgpuBufferStridedPointer;
          DL.getPointerSizeInBits(AS) == 160) ||
          DL.getPointerSizeInBits(AS) == 192))

  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
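// getTgtMemIntrinsic: for each AMDGPU memory intrinsic, fill in the
// IntrinsicInfo record (opcode, memVT, pointer value, alignment, and
// MachineMemOperand flags) that the DAG builder attaches to the resulting
// memory node.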
                                          unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

    if (RsrcIntr->IsImage) {
    Info.ptrVal = RsrcArg;

    bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
      if (RsrcIntr->IsImage) {
        unsigned MaxNumLanes = 4;
                                std::numeric_limits<unsigned>::max());
      if (RsrcIntr->IsImage) {
      if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
        Info.memVT = MVT::i32;

  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
                            std::numeric_limits<unsigned>::max());
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
    Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
    Info.memVT = MVT::i64;
  case Intrinsic::amdgcn_global_atomic_csub: {
  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
        MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
                       ->getElementType(0));
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_atomic_cond_sub_u32: {
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64: {
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.memVT = MVT::i32;
    Info.align = Align(4);
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds: {
  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
    Info.memVT = MVT::i32;
    Info.align = Align(4);
  case Intrinsic::amdgcn_s_prefetch_data:
  case Intrinsic::amdgcn_flat_prefetch:
  case Intrinsic::amdgcn_global_prefetch: {
  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
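// getAddrModeArguments: report the pointer operand and the accessed type for
// an intrinsic so addressing-mode queries can reason about it. Most of these
// intrinsics take the pointer as operand 0; the load-to-LDS style intrinsics
// take it as operand 1.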
                                            Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_global_atomic_csub:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
    Ptr = II->getArgOperand(0);
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds:
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
    Ptr = II->getArgOperand(1);
  AccessTy = II->getType();
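// isLegalFlatAddressingMode: without flat instruction offsets only a plain
// register pointer is accepted (no scale, no immediate); otherwise the
// immediate offset must satisfy isLegalFLATOffset for the address space.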
                                              unsigned AddrSpace) const {
  if (!Subtarget->hasFlatInstOffsets()) {

  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
                                  AM.BaseOffs, AddrSpace, FlatVariant));

  if (Subtarget->hasFlatGlobalInsts())

  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {

  return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
  if (AM.HasBaseReg) {

    return isLegalMUBUFAddressingMode(AM);
    if (!Subtarget->hasScalarSubwordLoads()) {
      if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)

  return Subtarget->enableFlatScratch()
             : isLegalMUBUFAddressingMode(AM);
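// allowsMisalignedMemoryAccessesImpl: decide, per address space and access
// size, whether an under-aligned access is legal and how fast it is. DS (LDS)
// accesses are the strictest case: the required alignment depends on the
// access size and on subtarget features such as unaligned DS access,
// ds96/ds128 support, and the LDS misaligned bug.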
    unsigned Size, unsigned AddrSpace, Align Alignment,

    if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))

    Align RequiredAlignment(
    if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
        Alignment < RequiredAlignment)

      if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))

      RequiredAlignment = Align(4);
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
        *IsFast = (Alignment >= RequiredAlignment) ? 64
                  : (Alignment < Align(4))         ? 32

      if (!Subtarget->hasDS96AndDS128())
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
        *IsFast = (Alignment >= RequiredAlignment) ? 96
                  : (Alignment < Align(4))         ? 32

      if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())

      RequiredAlignment = Align(8);
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
        *IsFast = (Alignment >= RequiredAlignment) ? 128
                  : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||
           Subtarget->hasUnalignedDSAccessEnabled();

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;
    return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();

    return Alignment >= Align(4) ||
           Subtarget->hasUnalignedBufferAccessEnabled();

  if (!Subtarget->hasRelaxedBufferOOBMode() &&

  return Size >= 32 && Alignment >= Align(4);

                                                  unsigned *IsFast) const {
                                          Alignment, Flags, IsFast);

    const AttributeList &FuncAttributes) const {
  if (Op.size() >= 16 &&
  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
                                           unsigned DestAS) const {
      Subtarget->hasGloballyAddressableScratch()) {

                                               unsigned Index) const {
  if (Subtarget->has16BitInsts() && VT == MVT::i16) {

  auto [InputPtrReg, RC, ArgTy] =
      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                         const SDLoc &SL) const {
                                          const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(
    int64_t OffsetDiff = Offset - AlignDownOffset;
    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);

      ExtType, SL, VA.getLocVT(), Chain, FIN,
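// getPreloadedValue: pick the argument descriptor and register class for a
// preloaded shader/kernel input; with architected SGPRs the workgroup IDs
// come from fixed registers, so local ArgDescriptors are substituted for the
// ones recorded in the function info.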
SDValue SITargetLowering::getPreloadedValue(
  const ArgDescriptor *Reg = nullptr;
  const TargetRegisterClass *RC;

  const ArgDescriptor WorkGroupIDX =
  const ArgDescriptor WorkGroupIDZ =
  if (Subtarget->hasArchitectedSGPRs() &&
      Reg = &WorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &WorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &WorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;

  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
           "vector type argument should have been split");
      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
               "unexpected vector split in ps argument type");
      Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);
  if (Info.hasWorkItemIDX()) {
        (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;

  if (Info.hasWorkItemIDY()) {
    assert(Info.hasWorkItemIDX());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDY(
      unsigned Reg = AMDGPU::VGPR1;

  if (Info.hasWorkItemIDZ()) {
    assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDZ(
      unsigned Reg = AMDGPU::VGPR2;

  if (RegIdx == ArgVGPRs.size()) {
  unsigned Reg = ArgVGPRs[RegIdx];

                             unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())
  unsigned Reg = ArgSGPRs[RegIdx];
  const unsigned Mask = 0x3ff;
  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);
  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);
  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;

  auto &ArgInfo = Info.getArgInfo();
  if (Info.hasImplicitArgPtr())
  if (Info.hasWorkGroupIDX())
  if (Info.hasWorkGroupIDY())
  if (Info.hasWorkGroupIDZ())
  if (Info.hasLDSKernelId())

    Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);

    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);

    Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
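// allocatePreloadKernArgSGPRs: walk the explicit kernel arguments in order
// and, while the inreg preload sequence is unbroken, assign each argument the
// user SGPRs it will be preloaded into, adding padding SGPRs when the kernarg
// offset skips ahead; hidden arguments are first re-based to the implicit-arg
// pointer alignment.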
  unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();

  bool InPreloadSequence = true;
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    unsigned ArgIdx = Arg.getArgNo();
    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
                      Subtarget->getAlignmentForImplicitArgPtr()) -
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        ArgOffset += ImplicitArgOffset;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
  if (Info.hasLDSKernelId()) {
    Register Reg = Info.addLDSKernelId();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

                                           bool IsShader) const {
  bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
  if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      Register Reg = Info.addReservedUserSGPR();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      Register Reg = Info.addWorkGroupIDX();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDY()) {
      Register Reg = Info.addWorkGroupIDY();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDZ()) {
      Register Reg = Info.addWorkGroupIDZ();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {
    Register Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

  assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
         Info.getNumPreloadedSGPRs() >= 16);
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      Info.setScratchRSrcReg(ReservedBufferReg);

  if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
    Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

    for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
      if (!MRI.isLiveIn(Reg)) {
        Info.setStackPtrOffsetReg(Reg);

    if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);

  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);

    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)
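// LowerFormalArguments: materialize incoming arguments for kernels, graphics
// shaders, and callable functions. Kernel arguments come either from
// preloaded user SGPRs or from kernarg-segment loads, pixel-shader inputs are
// gated by PSInputAddr/PSInputEna, and ordinary functions take arguments in
// registers or on the stack.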
  bool IsError = false;

        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));

          !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
          !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());

  if (!Subtarget->enableFlatScratch())
      !Subtarget->hasArchitectedSGPRs())
    assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
           !Info->hasWorkGroupIDZ());

  bool IsWholeWaveFunc = Info->isWholeWaveFunction();

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    if (Subtarget->isAmdPalOS()) {
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
  } else if (IsKernel) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
    Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),

    if (IsKernel && Subtarget->hasKernargPreload())
  } else if (!IsGraphics) {
    if (!Subtarget->enableFlatScratch())

    Info->setNumWaveDispatchSGPRs(
    Info->setNumWaveDispatchVGPRs(
  } else if (Info->getNumKernargPreloadedSGPRs()) {
    Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());

  if (IsWholeWaveFunc) {
        {MVT::i1, MVT::Other}, Chain);

  for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;

    if (IsEntryFunc && VA.isMemLoc()) {
      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
          int64_t OffsetDiff = Offset - AlignDownOffset;
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];

          ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
                    TRI->getRegSizeInBits(*RC)));

            for (auto Reg : PreloadRegs) {
                                       PreloadRegs.size()),

          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

              "hidden argument in kernel signature was not preloaded",

            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);

    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

    if (AMDGPU::VGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::VGPR_32RegClass;
    else if (AMDGPU::SGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::SGPR_32RegClass;

      Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain
                                          const Type *RetTy) const {
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);

    unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
    unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
    for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
      if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  Info->setIfReturnsVoid(Outs.empty());
  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    SDValue Arg = OutVals[RealRVLocIdx];
                        ReadFirstLane, Arg);

  if (!Info->isEntryFunction()) {
      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))
    auto &ArgUsageInfo =
    CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

    const auto [OutgoingArg, ArgRC, ArgTy] =
    const auto [IncomingArg, IncomingArgRC, Ty] =
    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;

      InputReg = getImplicitArgPtr(DAG, DL);
      std::optional<uint32_t> Id =
      if (Id.has_value()) {

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      unsigned SpecialArgOffset =

  auto [OutgoingArg, ArgRC, Ty] =
    std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

    if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
             NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
             NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
                         : IncomingArgY ? *IncomingArgY

  if (OutgoingArg->isRegister()) {
    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);

  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
    if (!CCVA.isRegLoc())
    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
      dbgs() << "Cannot tail call due to divergent outgoing argument in "
enum ChainCallArgIdx {

  bool UsesDynamicVGPRs = false;
  if (IsChainCallConv) {
    auto RequestedExecIt =
          return Arg.OrigArgIndex == 2;
    assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");

    size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
    CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());

           "Haven't popped all the special args");

        CLI.Args[ChainCallArgIdx::Exec];
    if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))

            ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
        ChainCallSpecialArgs.push_back(Arg.Node);

    PushNodeOrTargetConstant(RequestedExecArg);

    if (FlagsValue.isZero()) {
      if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
                             "no additional args allowed if flags == 0");
      if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
      if (!Subtarget->isWave32()) {
            CLI, InVals, "dynamic VGPR mode is only supported for wave32");

      UsesDynamicVGPRs = true;
      std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
                    CLI.Args.end(), PushNodeOrTargetConstant);

  bool IsSibCall = false;

                              "unsupported call to variadic function ");

                            "unsupported required tail call to function ");

        Outs, OutVals, Ins, DAG);

           "site marked musttail or on llvm.amdgcn.cs.chain");

  if (!TailCallOpt && IsTailCall)

  auto *TRI = Subtarget->getRegisterInfo();

  if (!IsSibCall || IsChainCallConv) {
    if (!Subtarget->enableFlatScratch()) {
      RegsToPass.emplace_back(IsChainCallConv
                                  ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                  : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  const unsigned NumSpecialInputs = RegsToPass.size();

  MVT PtrVT = MVT::i32;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
      int32_t Offset = LocMemOffset;
      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
                                 ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
                          Outs[i].Flags.getNonZeroByValAlign(),
                          nullptr, std::nullopt, DstInfo,

            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

    TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops({Chain});
    Ops.push_back(Callee);
    Ops.push_back(Callee);

  if (IsChainCallConv)

  for (auto &[Reg, Val] : RegsToPass)

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

                                     MVT::Glue, GlueOps),

    Ops.push_back(InGlue);

  if (Info->isWholeWaveFunction())

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
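// lowerDYNAMIC_STACKALLOC: the AMDGPU stack grows upward and the stack
// pointer is a per-wave byte offset, so the per-lane size is scaled by the
// wavefront size (shift by getWavefrontSizeLog2) and the base address is
// re-aligned when the requested alignment exceeds the default stack
// alignment.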
  EVT VT = Op.getValueType();

         "Stack grows upwards for AMDGPU");

  Chain = BaseAddr.getValue(1);
  if (Alignment > StackAlign) {
                                 << Subtarget->getWavefrontSizeLog2();
    uint64_t StackAlignMask = ScaledAlignment - 1;

  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");

      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));

  if (Op.getValueType() != MVT::i32)

  assert(Op.getValueType() == MVT::i32);
      Op.getOperand(0), IntrinID, GetRoundBothImm);

  SDValue RoundModeTimesNumBits =
                                   TableEntry, EnumOffset);

          static_cast<uint32_t>(ConstMode->getZExtValue()),

  if (UseReducedTable) {
    SDValue RoundModeTimesNumBits =
    SDValue RoundModeTimesNumBits =
    NewMode = TruncTable;

                          ReadFirstLaneID, NewMode);
                      IntrinID, RoundBothImm, NewMode);

  if (Op->isDivergent() &&
      (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))

  if (Subtarget->hasSafeSmemPrefetch())

  if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();

  EVT DstVT = Op.getValueType();

  return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);

  if (Op.getValueType() != MVT::i64)

      Op.getOperand(0), IntrinID, ModeHwRegImm);
      Op.getOperand(0), IntrinID, TrapHwRegImm);

  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);

  if (Op.getOperand(1).getValueType() != MVT::i64)

                                ReadFirstLaneID, NewModeReg);
                                ReadFirstLaneID, NewTrapReg);

  unsigned ModeHwReg =
  unsigned TrapHwReg =

                     IntrinID, ModeHwRegImm, NewModeReg);
                     IntrinID, TrapHwRegImm, NewTrapReg);
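// getRegisterByName: named physical registers exposed to read_register and
// friends (m0, exec, exec_lo/exec_hi, flat_scratch and its halves);
// flat_scratch names are rejected on subtargets without a flat scratch
// register.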
          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (!Subtarget->hasFlatScrRegister() &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
                             "\" for subtarget."));

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
  case AMDGPU::FLAT_SCR:

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
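// The helpers below implement the waterfall loop used for indirect vector
// indexing with a divergent index: V_READFIRSTLANE_B32 picks one lane's
// index, V_CMP_EQ_U32 plus S_AND_SAVEEXEC masks execution to the lanes that
// share that value, the body runs with the now-uniform index in M0 (or in an
// SGPR when GPR-index mode is available), and the loop repeats until EXEC is
// exhausted.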
static std::pair<MachineBasicBlock *, MachineBasicBlock *>

  auto Next = std::next(I);

  MBB.addSuccessor(LoopBB);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

  Src->setIsKill(false);

  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)

    unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
    unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
      MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                 : AMDGPU::S_AND_SAVEEXEC_B64),

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;
      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
          TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                 : AMDGPU::S_XOR_B64_term),

    unsigned InitResultReg, unsigned PhiReg, int Offset,
    bool UseGPRIdxMode, Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);

  LoopBB->removeSuccessor(RemainderBB);
  LoopBB->addSuccessor(LandingPad);
static std::pair<unsigned, int>
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

    return std::pair(AMDGPU::sub0, Offset);

  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

                                      UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();

  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {
    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(VecRC);

                                      UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)

  MI.eraseFromParent();
  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
  if (ST.hasScalarAddSub64()) {
    unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  MI.eraseFromParent();
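// Identity values for the wave-reduction pseudos: min reductions start from
// the type's maximum, max reductions from the minimum, and add/sub/or/xor
// start from zero, so inactive lanes cannot affect the result.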
  case AMDGPU::S_MIN_U32:
    return std::numeric_limits<uint32_t>::max();
  case AMDGPU::S_MIN_I32:
    return std::numeric_limits<int32_t>::max();
  case AMDGPU::S_MAX_U32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_MAX_I32:
    return std::numeric_limits<int32_t>::min();
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_AND_B32:
    return std::numeric_limits<uint32_t>::max();
      "Unexpected opcode in getIdentityValueFor32BitWaveReduction");

  case AMDGPU::V_CMP_LT_U64_e64:
    return std::numeric_limits<uint64_t>::max();
  case AMDGPU::V_CMP_LT_I64_e64:
    return std::numeric_limits<int64_t>::max();
  case AMDGPU::V_CMP_GT_U64_e64:
    return std::numeric_limits<uint64_t>::min();
  case AMDGPU::V_CMP_GT_I64_e64:
    return std::numeric_limits<int64_t>::min();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO:
    return std::numeric_limits<uint64_t>::min();
      "Unexpected opcode in getIdentityValueFor64BitWaveReduction");

  return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
         Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
         Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
         Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
         Opc == AMDGPU::S_XOR_B32;
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));

    case AMDGPU::S_MIN_U32:
    case AMDGPU::S_MIN_I32:
    case AMDGPU::S_MAX_U32:
    case AMDGPU::S_MAX_I32:
    case AMDGPU::S_AND_B32:
    case AMDGPU::S_OR_B32: {
    case AMDGPU::V_CMP_LT_U64_e64:
    case AMDGPU::V_CMP_LT_I64_e64:
    case AMDGPU::V_CMP_GT_U64_e64:
    case AMDGPU::V_CMP_GT_I64_e64: {
    case AMDGPU::S_XOR_B32:
    case AMDGPU::S_ADD_I32:
    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::S_SUB_I32:
    case AMDGPU::S_SUB_U64_PSEUDO: {
      Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

      bool IsWave32 = ST.isWave32();
      unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
      unsigned BitCountOpc =
          IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;

      auto NewAccumulator =

      case AMDGPU::S_XOR_B32: {
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            .addReg(NewAccumulator->getOperand(0).getReg())
      case AMDGPU::S_SUB_I32: {
        Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
            .addReg(NewAccumulator->getOperand(0).getReg());
      case AMDGPU::S_ADD_I32: {
            .addReg(NewAccumulator->getOperand(0).getReg());
      case AMDGPU::S_ADD_U64_PSEUDO:
      case AMDGPU::S_SUB_U64_PSEUDO: {
        Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

            TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);

        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
              .addReg(NewAccumulator->getOperand(0).getReg())

        Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
                                 : NewAccumulator->getOperand(0).getReg();

        Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;

        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {

        BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
  Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
  Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
  Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
  Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
  Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
  Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

  bool IsWave32 = ST.isWave32();
  unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)

  I = ComputeLoop->begin();
      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)

  I = ComputeLoop->end();

  unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;

    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),

        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

        TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
        MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
        MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);

    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),

    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                             TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)

    case AMDGPU::V_CMP_GT_I64_e64:
    case AMDGPU::V_CMP_GT_U64_e64:
    case AMDGPU::V_CMP_LT_I64_e64:
    case AMDGPU::V_CMP_LT_U64_e64: {
      Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
          MRI.createVirtualRegister(WaveMaskRegClass);
          TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
      Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
                            VregClass, AMDGPU::sub0, VSubRegClass);
                            VregClass, AMDGPU::sub1, VSubRegClass);
      BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),

          .addReg(LaneValue->getOperand(0).getReg())
          .addReg(AccumulatorVReg);

      unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
      BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)

      NewAccumulator = BuildMI(*ComputeLoop, I, DL,
                               TII->get(AMDGPU::S_CSELECT_B64), DstReg)
                           .addReg(LaneValue->getOperand(0).getReg())

    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::S_SUB_U64_PSEUDO: {
          .addReg(LaneValue->getOperand(0).getReg());

  unsigned BITSETOpc =
      IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
  BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)

  ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);

  unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
      .addReg(NewActiveBitsReg)
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))

  MI.eraseFromParent();
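// EmitInstrWithCustomInserter: expand target pseudos that need custom
// MachineInstr-level lowering, including the WAVE_REDUCE_* pseudos above,
// 64-bit scalar and vector add/sub with carry, SI_INIT_M0, the indirect
// source/destination pseudos, kill pseudos, call pseudos, and S_SETREG_B32
// floating-point mode updates.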
  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       : AMDGPU::S_SUB_I32;

    MI.eraseFromParent();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {
  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {
    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);

    if (ST.hasAddSubU64Insts()) {
                   TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
                                  : AMDGPU::V_SUB_U64_e64),
      TII->legalizeOperands(*I);
      MI.eraseFromParent();

    if (IsAdd && ST.hasLshlAddU64Inst()) {
      TII->legalizeOperands(*Add);
      MI.eraseFromParent();

    const auto *CarryRC = TRI->getWaveMaskRegClass();

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);

                                         : &AMDGPU::VReg_64RegClass;
                                         : &AMDGPU::VReg_64RegClass;

        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;

    TII->legalizeOperands(*LoHalf);
    TII->legalizeOperands(*HiHalf);
    MI.eraseFromParent();
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;

      Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)

      Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)

    Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)

    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
    assert(WaveSize == 64 || WaveSize == 32);

    if (WaveSize == 64) {
      if (ST.hasScalarCompareEq64()) {
            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)

        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;

    MI.eraseFromParent();

  case AMDGPU::SI_INIT_M0: {
            TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
    MI.eraseFromParent();

  case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
            TII->get(AMDGPU::S_CMP_EQ_U32))

  case AMDGPU::GET_GROUPSTATICSIZE: {
        .add(MI.getOperand(0))
    MI.eraseFromParent();

  case AMDGPU::GET_SHADERCYCLESHILO: {
    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .add(MI.getOperand(0))
    MI.eraseFromParent();
6032 case AMDGPU::SI_INDIRECT_SRC_V1:
6033 case AMDGPU::SI_INDIRECT_SRC_V2:
6034 case AMDGPU::SI_INDIRECT_SRC_V4:
6035 case AMDGPU::SI_INDIRECT_SRC_V8:
6036 case AMDGPU::SI_INDIRECT_SRC_V9:
6037 case AMDGPU::SI_INDIRECT_SRC_V10:
6038 case AMDGPU::SI_INDIRECT_SRC_V11:
6039 case AMDGPU::SI_INDIRECT_SRC_V12:
6040 case AMDGPU::SI_INDIRECT_SRC_V16:
6041 case AMDGPU::SI_INDIRECT_SRC_V32:
6043 case AMDGPU::SI_INDIRECT_DST_V1:
6044 case AMDGPU::SI_INDIRECT_DST_V2:
6045 case AMDGPU::SI_INDIRECT_DST_V4:
6046 case AMDGPU::SI_INDIRECT_DST_V8:
6047 case AMDGPU::SI_INDIRECT_DST_V9:
6048 case AMDGPU::SI_INDIRECT_DST_V10:
6049 case AMDGPU::SI_INDIRECT_DST_V11:
6050 case AMDGPU::SI_INDIRECT_DST_V12:
6051 case AMDGPU::SI_INDIRECT_DST_V16:
6052 case AMDGPU::SI_INDIRECT_DST_V32:
6054 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6055 case AMDGPU::SI_KILL_I1_PSEUDO:
6057 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6066 Register SrcCond = MI.getOperand(3).getReg();
6068 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6069 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6070 const auto *CondRC = TRI->getWaveMaskRegClass();
6071 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6075 : &AMDGPU::VReg_64RegClass;
6078 : &AMDGPU::VReg_64RegClass;
6081 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6083 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6086 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6088 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6091 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6093 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6114 MI.eraseFromParent();
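// Note: the 64-bit select pseudo is lowered as two 32-bit selects on the
// sub0/sub1 halves extracted above, both predicated on the same copied wave
// mask (SrcCondCopy), with the halves then recombined into the 64-bit result.
// The exact 32-bit opcode used (V_CNDMASK_B32) is implied by the surrounding
// code rather than shown in this excerpt.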
6117 case AMDGPU::SI_BR_UNDEF: {
6121 .
add(
MI.getOperand(0));
6123 MI.eraseFromParent();
6126 case AMDGPU::ADJCALLSTACKUP:
6127 case AMDGPU::ADJCALLSTACKDOWN: {
6134 case AMDGPU::SI_CALL_ISEL: {
6138 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6141 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6147 MI.eraseFromParent();
6150 case AMDGPU::V_ADD_CO_U32_e32:
6151 case AMDGPU::V_SUB_CO_U32_e32:
6152 case AMDGPU::V_SUBREV_CO_U32_e32: {
6155 unsigned Opc = MI.getOpcode();
6157 bool NeedClampOperand = false;
6158 if (TII->pseudoToMCOpcode(Opc) == -1) {
6160 NeedClampOperand = true;
6164 if (TII->isVOP3(*I)) {
6169 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6170 if (NeedClampOperand)
6173 TII->legalizeOperands(*I);
6175 MI.eraseFromParent();
6178 case AMDGPU::V_ADDC_U32_e32:
6179 case AMDGPU::V_SUBB_U32_e32:
6180 case AMDGPU::V_SUBBREV_U32_e32:
6183 TII->legalizeOperands(MI);
6185 case AMDGPU::DS_GWS_INIT:
6186 case AMDGPU::DS_GWS_SEMA_BR:
6187 case AMDGPU::DS_GWS_BARRIER:
6188 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
6190 case AMDGPU::DS_GWS_SEMA_V:
6191 case AMDGPU::DS_GWS_SEMA_P:
6192 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6200 case AMDGPU::S_SETREG_B32: {
6216 const unsigned SetMask = WidthMask << Offset;
6219 unsigned SetDenormOp = 0;
6220 unsigned SetRoundOp = 0;
6228 SetRoundOp = AMDGPU::S_ROUND_MODE;
6229 SetDenormOp = AMDGPU::S_DENORM_MODE;
6231 SetRoundOp = AMDGPU::S_ROUND_MODE;
6233 SetDenormOp = AMDGPU::S_DENORM_MODE;
6236 if (SetRoundOp || SetDenormOp) {
6239 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6240 unsigned ImmVal = Def->getOperand(1).getImm();
6254 MI.eraseFromParent();
6263 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6267 case AMDGPU::S_INVERSE_BALLOT_U32:
6268 case AMDGPU::S_INVERSE_BALLOT_U64:
6271 MI.setDesc(TII->get(AMDGPU::COPY));
6273 case AMDGPU::ENDPGM_TRAP: {
6276 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6296 MI.eraseFromParent();
6299 case AMDGPU::SIMULATED_TRAP: {
6300 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6303 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6304 MI.eraseFromParent();
6307 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6308 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6314 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6315 Register OriginalExec = Setup->getOperand(0).getReg();
6317 MI.getOperand(0).setReg(OriginalExec);
6354 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6358 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6385 if (!Subtarget->hasMadMacF32Insts())
6386 return Subtarget->hasFastFMAF32();
6392 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6395 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6411 switch (Ty.getScalarSizeInBits()) {
6429 if (Ty.getScalarSizeInBits() == 16)
6431 if (Ty.getScalarSizeInBits() == 32)
6432 return Subtarget->hasMadMacF32Insts() &&
6442 EVT VT = N->getValueType(0);
6444 return Subtarget->hasMadMacF32Insts() &&
6446 if (VT == MVT::f16) {
6447 return Subtarget->hasMadF16() &&
6462 unsigned Opc = Op.getOpcode();
6463 EVT VT = Op.getValueType();
6464 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
6465 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
6466 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6467 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
6482 unsigned Opc = Op.getOpcode();
6483 EVT VT = Op.getValueType();
6484 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6485 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6486 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6487 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6488 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6489 VT == MVT::v32bf16);
6497 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6499 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6506 unsigned Opc = Op.getOpcode();
6507 EVT VT = Op.getValueType();
6508 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6509 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6510 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6511 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6512 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6513 VT == MVT::v32bf16);
6518 : std::pair(Op0, Op0);
6527 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6529 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6535 switch (Op.getOpcode()) {
6539 return LowerBRCOND(Op, DAG);
6541 return LowerRETURNADDR(Op, DAG);
6544 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6545 "Load should return a value and a chain");
6549 EVT VT = Op.getValueType();
6551 return lowerFSQRTF32(Op, DAG);
6553 return lowerFSQRTF64(Op, DAG);
6558 return LowerTrig(Op, DAG);
6560 return LowerSELECT(Op, DAG);
6562 return LowerFDIV(Op, DAG);
6564 return LowerFFREXP(Op, DAG);
6565 case ISD::ATOMIC_CMP_SWAP:
6566 return LowerATOMIC_CMP_SWAP(Op, DAG);
6568 return LowerSTORE(Op, DAG);
6572 return LowerGlobalAddress(MFI, Op, DAG);
6575 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6577 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6579 return LowerINTRINSIC_VOID(Op, DAG);
6580 case ISD::ADDRSPACECAST:
6581 return lowerADDRSPACECAST(Op, DAG);
6583 return lowerINSERT_SUBVECTOR(Op, DAG);
6585 return lowerINSERT_VECTOR_ELT(Op, DAG);
6587 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6589 return lowerVECTOR_SHUFFLE(Op, DAG);
6591 return lowerSCALAR_TO_VECTOR(Op, DAG);
6593 return lowerBUILD_VECTOR(Op, DAG);
6596 return lowerFP_ROUND(Op, DAG);
6598 return lowerTRAP(Op, DAG);
6599 case ISD::DEBUGTRAP:
6600 return lowerDEBUGTRAP(Op, DAG);
6609 return lowerFMINNUM_FMAXNUM(Op, DAG);
6610 case ISD::FMINIMUMNUM:
6611 case ISD::FMAXIMUMNUM:
6612 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6615 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6618 return lowerFLDEXP(Op, DAG);
6635 case ISD::FMINNUM_IEEE:
6636 case ISD::FMAXNUM_IEEE:
6643 return lowerFCOPYSIGN(Op, DAG);
6645 return lowerMUL(Op, DAG);
6648 return lowerXMULO(Op, DAG);
6651 return lowerXMUL_LOHI(Op, DAG);
6652 case ISD::DYNAMIC_STACKALLOC:
6654 case ISD::STACKSAVE:
6658 case ISD::SET_ROUNDING:
6662 case ISD::FP_EXTEND:
6665 case ISD::GET_FPENV:
6667 case ISD::SET_FPENV:
6684 EVT FittingLoadVT = LoadVT;
6709 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6713 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6716 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6719 bool IsIntrinsic) const {
6722 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6723 EVT LoadVT = M->getValueType(0);
6725 EVT EquivLoadVT = LoadVT;
6739 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6743 M->getMemoryVT(), M->getMemOperand());
6754 EVT LoadVT = M->getValueType(0);
6760 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6761 bool IsTFE = M->getNumValues() == 3;
6774 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6778 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6779 M->getMemOperand(), DAG);
6783 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6785 M->getMemOperand(), DAG);
6793 EVT VT = N->getValueType(0);
6794 unsigned CondCode = N->getConstantOperandVal(3);
6805 EVT CmpVT = LHS.getValueType();
6806 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6807 unsigned PromoteOp =
6827 EVT VT = N->getValueType(0);
6829 unsigned CondCode = N->getConstantOperandVal(3);
6838 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6839 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6840 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6856 EVT VT = N->getValueType(0);
6863 Src.getOperand(1), Src.getOperand(2));
6874 Exec = AMDGPU::EXEC_LO;
6876 Exec = AMDGPU::EXEC;
6893 EVT VT = N->getValueType(0);
6895 unsigned IID = N->getConstantOperandVal(0);
6896 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6897 IID == Intrinsic::amdgcn_permlanex16;
6898 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6899 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6903 unsigned SplitSize = 32;
6904 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6905 ST->hasDPALU_DPP() &&
6913 case Intrinsic::amdgcn_permlane16:
6914 case Intrinsic::amdgcn_permlanex16:
6915 case Intrinsic::amdgcn_update_dpp:
6920 case Intrinsic::amdgcn_writelane:
6923 case Intrinsic::amdgcn_readlane:
6924 case Intrinsic::amdgcn_set_inactive:
6925 case Intrinsic::amdgcn_set_inactive_chain_arg:
6926 case Intrinsic::amdgcn_mov_dpp8:
6929 case Intrinsic::amdgcn_readfirstlane:
6930 case Intrinsic::amdgcn_permlane64:
6940 if (SDNode *GL = N->getGluedNode()) {
6941 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6942 GL = GL->getOperand(0).getNode();
6943 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6952 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6953 IID == Intrinsic::amdgcn_mov_dpp8 ||
6954 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6955 Src1 = N->getOperand(2);
6956 if (IID == Intrinsic::amdgcn_writelane ||
6957 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
6958 Src2 = N->getOperand(3);
6961 if (ValSize == SplitSize) {
6971 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6976 if (IID == Intrinsic::amdgcn_writelane) {
6981 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6983 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6986 if (ValSize % SplitSize != 0)
6990 EVT VT = N->getValueType(0);
6994 unsigned NumOperands = N->getNumOperands();
6996 SDNode *GL = N->getGluedNode();
7001 for (unsigned i = 0; i != NE; ++i) {
7002 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7004 SDValue Operand = N->getOperand(j);
7019 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7034 if (SplitSize == 32) {
7036 return unrollLaneOp(LaneOp.getNode());
7042 unsigned SubVecNumElt =
7046 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7047 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7051 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7056 if (IID == Intrinsic::amdgcn_writelane)
7061 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7062 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7063 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7064 EltIdx += SubVecNumElt;
7078 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7081 if (IID == Intrinsic::amdgcn_writelane)
7084 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7092 switch (N->getOpcode()) {
7104 unsigned IID = N->getConstantOperandVal(0);
7106 case Intrinsic::amdgcn_make_buffer_rsrc:
7107 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7109 case Intrinsic::amdgcn_cvt_pkrtz: {
7115 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7118 case Intrinsic::amdgcn_cvt_pknorm_i16:
7119 case Intrinsic::amdgcn_cvt_pknorm_u16:
7120 case Intrinsic::amdgcn_cvt_pk_i16:
7121 case Intrinsic::amdgcn_cvt_pk_u16: {
7127 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7129 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7131 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7136 EVT VT = N->getValueType(0);
7141 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7145 case Intrinsic::amdgcn_s_buffer_load: {
7151 if (!Subtarget->hasScalarSubwordLoads())
7157 EVT VT = Op.getValueType();
7158 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7170 if (!Offset->isDivergent()) {
7189 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7194 case Intrinsic::amdgcn_dead: {
7195 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7206 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7207 Results.push_back(Res.getOperand(I));
7211 Results.push_back(Res.getValue(1));
7220 EVT VT =
N->getValueType(0);
7225 EVT SelectVT = NewVT;
7226 if (NewVT.
bitsLT(MVT::i32)) {
7229 SelectVT = MVT::i32;
7235 if (NewVT != SelectVT)
7241 if (
N->getValueType(0) != MVT::v2f16)
7245 SDValue BC = DAG.
getNode(ISD::BITCAST, SL, MVT::i32,
N->getOperand(0));
7253 if (
N->getValueType(0) != MVT::v2f16)
7257 SDValue BC = DAG.
getNode(ISD::BITCAST, SL, MVT::i32,
N->getOperand(0));
7265 if (
N->getValueType(0) != MVT::f16)
7280 if (U.get() !=
Value)
7283 if (U.getUser()->getOpcode() == Opcode)
7289unsigned SITargetLowering::isCFIntrinsic(
const SDNode *Intr)
const {
7292 case Intrinsic::amdgcn_if:
7294 case Intrinsic::amdgcn_else:
7296 case Intrinsic::amdgcn_loop:
7298 case Intrinsic::amdgcn_end_cf:
7318 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7345 SDNode *Intr = BRCOND.getOperand(1).getNode();
7358 assert(BR &&
"brcond missing unconditional branch user");
7362 unsigned CFNode = isCFIntrinsic(Intr);
7382 Ops.push_back(Target);
7405 for (
unsigned i = 1, e = Intr->
getNumValues() - 1; i != e; ++i) {
7424 MVT VT =
Op.getSimpleValueType();
7427 if (
Op.getConstantOperandVal(0) != 0)
7431 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7433 if (
Info->isEntryFunction())
7450 return Op.getValueType().bitsLE(VT)
7458 EVT DstVT =
Op.getValueType();
7465 unsigned Opc =
Op.getOpcode();
7477 EVT SrcVT = Src.getValueType();
7478 EVT DstVT =
Op.getValueType();
7481 assert(Subtarget->hasCvtPkF16F32Inst() &&
"support v_cvt_pk_f16_f32");
7484 return SrcVT == MVT::v2f32 ?
Op : splitFP_ROUNDVectorOp(
Op, DAG);
7491 if (DstVT == MVT::f16) {
7496 if (!Subtarget->has16BitInsts()) {
7499 return DAG.
getNode(ISD::BITCAST,
DL, MVT::f16, Trunc);
7501 if (
Op->getFlags().hasApproximateFuncs()) {
7508 return DAG.
getNode(ISD::BITCAST,
DL, MVT::f16, Trunc);
7512 "custom lower FP_ROUND for f16 or bf16");
7513 assert(Subtarget->hasBF16ConversionInsts() &&
"f32 -> bf16 is legal");
7526 EVT VT =
Op.getValueType();
7528 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7529 bool IsIEEEMode =
Info->getMode().IEEE;
7538 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7545SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(
SDValue Op,
7547 EVT VT =
Op.getValueType();
7549 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7550 bool IsIEEEMode =
Info->getMode().IEEE;
7555 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7563 EVT VT =
Op.getValueType();
7567 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7568 !Subtarget->hasMinimum3Maximum3F16() &&
7569 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7570 "should not need to widen f16 minimum/maximum to v2f16");
7584 DAG.
getNode(
Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7592 EVT VT = Op.getValueType();
7596 EVT ExpVT = Exp.getValueType();
7597 if (ExpVT == MVT::i16)
7618 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7621 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
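// Note: lowerFLDEXP rebuilds the node with the original value operand(s) and an
// adjusted exponent (TruncExp) so the exponent has the width the selected
// instruction expects. A conceptual sketch of the adjustment, assumed rather
// than shown in this excerpt:
//   SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Exp);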
7625 switch (
Op->getOpcode()) {
7655 DAGCombinerInfo &DCI)
const {
7656 const unsigned Opc =
Op.getOpcode();
7664 :
Op->getOperand(0).getValueType();
7667 if (DCI.isBeforeLegalizeOps() ||
7671 auto &DAG = DCI.DAG;
7677 LHS =
Op->getOperand(1);
7678 RHS =
Op->getOperand(2);
7680 LHS =
Op->getOperand(0);
7681 RHS =
Op->getOperand(1);
7720 if (MagVT == SignVT)
7727 SDValue SignAsInt32 = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7730 SDValue SignAsHalf16 = DAG.
getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7737 EVT VT =
Op.getValueType();
7743 assert(VT == MVT::i64 &&
"The following code is a special for s_mul_u64");
7770 if (
Op->isDivergent())
7783 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7785 DAG.
getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7788 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7790 DAG.
getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7796 EVT VT =
Op.getValueType();
7803 const APInt &
C = RHSC->getAPIntValue();
7805 if (
C.isPowerOf2()) {
7807 bool UseArithShift =
isSigned && !
C.isMinSignedValue();
7834 if (
Op->isDivergent()) {
7838 if (Subtarget->hasSMulHi()) {
7849 if (!Subtarget->isTrapHandlerEnabled() ||
7851 return lowerTrapEndpgm(
Op, DAG);
7853 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(
Op, DAG)
7854 : lowerTrapHsaQueuePtr(
Op, DAG);
7864SITargetLowering::loadImplicitKernelArgument(
SelectionDAG &DAG,
MVT VT,
7866 ImplicitParameter Param)
const {
7886 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
7889 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7892 if (UserSGPR == AMDGPU::NoRegister) {
7918 if (Subtarget->hasPrivEnabledTrap2NopBug())
7931 if (!Subtarget->isTrapHandlerEnabled() ||
7935 "debugtrap handler not supported",
7946 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
7948 if (Subtarget->hasApertureRegs()) {
7950 ? AMDGPU::SRC_SHARED_BASE
7951 : AMDGPU::SRC_PRIVATE_BASE;
7952 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
7953 !Subtarget->hasGloballyAddressableScratch()) &&
7954 "Cannot use src_private_base with globally addressable scratch!");
7977 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
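// Note: when aperture registers are available, the aperture value is the high
// 32 bits of the 64-bit SRC_SHARED_BASE/SRC_PRIVATE_BASE register, which is why
// the copy above is shifted right by 32 before being used as the segment
// aperture.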
7986 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
7990 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7992 if (UserSGPR == AMDGPU::NoRegister) {
8026 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8037 const AMDGPUTargetMachine &TM =
8040 unsigned DestAS, SrcAS;
8042 bool IsNonNull =
false;
8044 SrcAS = ASC->getSrcAddressSpace();
8045 Src = ASC->getOperand(0);
8046 DestAS = ASC->getDestAddressSpace();
8049 Op.getConstantOperandVal(0) ==
8050 Intrinsic::amdgcn_addrspacecast_nonnull);
8051 Src = Op->getOperand(1);
8052 SrcAS = Op->getConstantOperandVal(2);
8053 DestAS = Op->getConstantOperandVal(3);
8066 Subtarget->hasGloballyAddressableScratch()) {
8071 AMDGPU::S_MOV_B32, SL, MVT::i32,
8072 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8080 unsigned NullVal = TM.getNullPointerValue(DestAS);
8095 Subtarget->hasGloballyAddressableScratch()) {
8104 if (Subtarget->isWave64())
8110 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8113 CvtPtr = DAG.
getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8118 AMDGPU::S_MOV_B64, SL, MVT::i64,
8119 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8121 CvtPtr = DAG.
getNode(
ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8123 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8125 CvtPtr = DAG.
getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8131 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8143 Op.getValueType() == MVT::i64) {
8144 const SIMachineFunctionInfo *
Info =
8148 return DAG.
getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8152 Src.getValueType() == MVT::i64)
8172 EVT InsVT =
Ins.getValueType();
8180 assert(InsNumElts % 2 == 0 &&
"expect legal vector types");
8185 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8187 MVT::i32, InsNumElts / 2);
8189 Vec = DAG.
getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8190 Ins = DAG.
getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8192 for (
unsigned I = 0;
I != InsNumElts / 2; ++
I) {
8194 if (InsNumElts == 2) {
8204 return DAG.
getNode(ISD::BITCAST, SL, VecVT, Vec);
8207 for (
unsigned I = 0;
I != InsNumElts; ++
I) {
8230 if (NumElts == 4 && EltSize == 16 && KIdx) {
8238 SDValue LoVec = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8239 SDValue HiVec = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8241 unsigned Idx = KIdx->getZExtValue();
8242 bool InsertLo = Idx < 2;
8245 DAG.
getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8246 DAG.
getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8248 InsHalf = DAG.
getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8252 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8265 assert(VecSize <= 64 &&
"Expected target vector size to be <= 64 bits");
8293 return DAG.
getNode(ISD::BITCAST, SL, VecVT, BFI);
8300 EVT ResultVT =
Op.getValueType();
8313 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
8316 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8320 if (VecSize == 128) {
8328 }
else if (VecSize == 256) {
8331 for (
unsigned P = 0;
P < 4; ++
P) {
8337 Parts[0], Parts[1]));
8339 Parts[2], Parts[3]));
8345 for (
unsigned P = 0;
P < 8; ++
P) {
8352 Parts[0], Parts[1], Parts[2], Parts[3]));
8355 Parts[4], Parts[5], Parts[6], Parts[7]));
8375 Src = DAG.
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8390 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8392 return DAG.
getNode(ISD::BITCAST, SL, ResultVT, Result);
8400 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8405 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8406 !(Mask[Elt + 1] & 1);
8412 EVT ResultVT =
Op.getValueType();
8415 const int NewSrcNumElts = 2;
8417 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
8433 const bool ShouldUseConsecutiveExtract = EltVT.
getSizeInBits() == 16;
8455 if (ShouldUseConsecutiveExtract &&
8458 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8459 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8471 if (Idx0 >= SrcNumElts) {
8476 if (Idx1 >= SrcNumElts) {
8481 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8482 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8490 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8491 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8496 if (SubVec0 != SubVec1) {
8497 NewMaskIdx1 += NewSrcNumElts;
8504 {NewMaskIdx0, NewMaskIdx1});
8509 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8510 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8511 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8512 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8531 EVT ResultVT =
Op.getValueType();
8547 EVT VT = Op.getValueType();
8549 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8550 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8559 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8568 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8575 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8584 for (
unsigned P = 0;
P < NumParts; ++
P) {
8586 PartVT, SL, {
Op.getOperand(
P * 2),
Op.getOperand(
P * 2 + 1)});
8592 return DAG.
getNode(ISD::BITCAST, SL, VT, Blend);
8605 if (!Subtarget->isAmdHsaOS())
8665 EVT PtrVT =
Op.getValueType();
8667 const GlobalValue *GV = GSD->
getGlobal();
8681 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
8699 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8700 if (Subtarget->has64BitLiterals()) {
8731 MachinePointerInfo PtrInfo =
8759 SDValue Param = lowerKernargMemParameter(
8770 "non-hsa intrinsic with hsa target",
DL.getDebugLoc()));
8778 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
8786 unsigned NumElts = Elts.
size();
8788 if (NumElts <= 12) {
8797 for (
unsigned i = 0; i < Elts.
size(); ++i) {
8803 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
8813 EVT SrcVT = Src.getValueType();
8834 bool Unpacked,
bool IsD16,
int DMaskPop,
8835 int NumVDataDwords,
bool IsAtomicPacked16Bit,
8839 EVT ReqRetVT = ResultTypes[0];
8841 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
8842 ? (ReqRetNumElts + 1) / 2
8845 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
8856 if (DMaskPop > 0 &&
Data.getValueType() != MaskPopVT) {
8867 if (DataDwordVT.
isVector() && !IsAtomicPacked16Bit)
8869 NumDataDwords - MaskPopDwords);
8874 EVT LegalReqRetVT = ReqRetVT;
8876 if (!
Data.getValueType().isInteger())
8878 Data.getValueType().changeTypeToInteger(),
Data);
8899 if (Result->getNumValues() == 1)
8906 SDValue *LWE,
bool &IsTexFail) {
8926 unsigned DimIdx,
unsigned EndIdx,
8927 unsigned NumGradients) {
8929 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
8937 if (((
I + 1) >= EndIdx) ||
8938 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
8939 I == DimIdx + NumGradients - 1))) {
8958 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
8972 int NumVDataDwords = 0;
8973 bool AdjustRetType =
false;
8974 bool IsAtomicPacked16Bit =
false;
8977 const unsigned ArgOffset = WithChain ? 2 : 1;
8980 unsigned DMaskLanes = 0;
8982 if (BaseOpcode->Atomic) {
8983 VData =
Op.getOperand(2);
8985 IsAtomicPacked16Bit =
8986 (Intr->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8987 Intr->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8990 if (BaseOpcode->AtomicX2) {
8997 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8998 DMask = Is64Bit ? 0xf : 0x3;
8999 NumVDataDwords = Is64Bit ? 4 : 2;
9001 DMask = Is64Bit ? 0x3 : 0x1;
9002 NumVDataDwords = Is64Bit ? 2 : 1;
9005 DMask =
Op->getConstantOperandVal(ArgOffset + Intr->
DMaskIndex);
9008 if (BaseOpcode->Store) {
9009 VData =
Op.getOperand(2);
9013 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9017 VData = handleD16VData(VData, DAG,
true);
9020 NumVDataDwords = (VData.
getValueType().getSizeInBits() + 31) / 32;
9021 }
else if (!BaseOpcode->NoReturn) {
9026 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9034 (!LoadVT.
isVector() && DMaskLanes > 1))
9040 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9041 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9042 NumVDataDwords = (DMaskLanes + 1) / 2;
9044 NumVDataDwords = DMaskLanes;
9046 AdjustRetType =
true;
9050 unsigned VAddrEnd = ArgOffset + Intr->
VAddrEnd;
9057 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9058 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9060 VAddrVT =
Op.getOperand(ArgOffset + Intr->
CoordStart).getSimpleValueType();
9062 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9063 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9067 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
9073 {
Op.getOperand(ArgOffset +
I), DAG.
getPOISON(MVT::f16)});
9077 "Bias needs to be converted to 16 bit in A16 mode");
9082 if (BaseOpcode->Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
9086 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
9087 "require 16 bit args for both gradients and addresses");
9092 if (!
ST->hasA16()) {
9093 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
9094 "support 16 bit addresses\n");
9104 if (BaseOpcode->Gradients && IsG16 &&
ST->hasG16()) {
9106 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9108 IntrOpcode = G16MappingInfo->
G16;
9131 for (
unsigned I = ArgOffset + Intr->
CoordStart;
I < VAddrEnd;
I++)
9149 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9150 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9151 const bool UseNSA = ST->hasNSAEncoding() &&
9152 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9153 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9154 const bool UsePartialNSA =
9155 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
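// Note: the NSA (non-sequential address) decision above comes down to: use NSA
// only if the target has the encoding, the address count reaches the per-target
// threshold, and the addresses either fit the NSA register limit or a partial
// NSA encoding exists. UsePartialNSA covers the "fits only partially" case, in
// which the trailing addresses are packed into one contiguous VGPR tuple (the
// drop_front below).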
9158 if (UsePartialNSA) {
9160 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9161 }
else if (!UseNSA) {
9168 if (!BaseOpcode->Sampler) {
9171 uint64_t UnormConst =
9172 Op.getConstantOperandVal(ArgOffset + Intr->
UnormIndex);
9174 Unorm = UnormConst ? True : False;
9180 bool IsTexFail =
false;
9181 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9192 NumVDataDwords += 1;
9193 AdjustRetType =
true;
9198 if (AdjustRetType) {
9201 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9210 MVT::i32, NumVDataDwords)
9213 ResultTypes[0] = NewVT;
9214 if (ResultTypes.size() == 3) {
9218 ResultTypes.erase(&ResultTypes[1]);
9223 if (BaseOpcode->Atomic)
9230 if (BaseOpcode->Store || BaseOpcode->Atomic)
9231 Ops.push_back(VData);
9232 if (UsePartialNSA) {
9234 Ops.push_back(VAddr);
9238 Ops.push_back(VAddr);
9241 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9243 Ops.push_back(Rsrc);
9244 if (BaseOpcode->Sampler) {
9248 Ops.push_back(Samp);
9253 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9254 Ops.push_back(Unorm);
9256 Ops.push_back(IsA16 &&
9257 ST->hasFeature(AMDGPU::FeatureR128A16)
9261 Ops.push_back(IsA16 ? True : False);
9263 if (!Subtarget->hasGFX90AInsts())
9268 "TFE is not supported on this GPU",
DL.getDebugLoc()));
9271 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9274 Ops.push_back(DimInfo->
DA ? True : False);
9275 if (BaseOpcode->HasD16)
9276 Ops.push_back(IsD16 ? True : False);
9278 Ops.push_back(
Op.getOperand(0));
9280 int NumVAddrDwords =
9286 NumVDataDwords, NumVAddrDwords);
9287 }
else if (IsGFX11Plus) {
9289 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9290 : AMDGPU::MIMGEncGfx11Default,
9291 NumVDataDwords, NumVAddrDwords);
9292 }
else if (IsGFX10Plus) {
9294 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9295 : AMDGPU::MIMGEncGfx10Default,
9296 NumVDataDwords, NumVAddrDwords);
9298 if (Subtarget->hasGFX90AInsts()) {
9300 NumVDataDwords, NumVAddrDwords);
9304 "requested image instruction is not supported on this GPU",
9309 for (EVT VT : OrigResultTypes) {
9310 if (VT == MVT::Other)
9311 RetValues[Idx++] =
Op.getOperand(0);
9322 NumVDataDwords, NumVAddrDwords);
9325 NumVDataDwords, NumVAddrDwords);
9332 MachineMemOperand *MemRef = MemOp->getMemOperand();
9336 if (BaseOpcode->AtomicX2) {
9341 if (BaseOpcode->NoReturn)
9344 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9345 NumVDataDwords, IsAtomicPacked16Bit,
DL);
9358 MachinePointerInfo(),
9363 if (!
Offset->isDivergent()) {
9370 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9379 !Subtarget->hasScalarDwordx3Loads()) {
9406 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9408 return handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
9412 unsigned NumLoads = 1;
9418 if (NumElts == 8 || NumElts == 16) {
9419 NumLoads = NumElts / 4;
9423 SDVTList VTList = DAG.
getVTList({LoadVT, MVT::Other});
9428 NumLoads > 1 ?
Align(16 * NumLoads) :
Align(4));
9430 uint64_t InstOffset =
Ops[5]->getAsZExtVal();
9431 for (
unsigned i = 0; i < NumLoads; ++i) {
9437 if (NumElts == 8 || NumElts == 16)
9445 if (!Subtarget->hasArchitectedSGPRs())
9487 auto *MFI = MF.
getInfo<SIMachineFunctionInfo>();
9489 EVT VT =
Op.getValueType();
9491 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
9495 switch (IntrinsicID) {
9496 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9499 return getPreloadedValue(DAG, *MFI, VT,
9502 case Intrinsic::amdgcn_dispatch_ptr:
9503 case Intrinsic::amdgcn_queue_ptr: {
9504 if (!Subtarget->isAmdHsaOrMesa(MF.
getFunction())) {
9506 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
9511 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9514 return getPreloadedValue(DAG, *MFI, VT, RegID);
9516 case Intrinsic::amdgcn_implicitarg_ptr: {
9518 return getImplicitArgPtr(DAG,
DL);
9519 return getPreloadedValue(DAG, *MFI, VT,
9522 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9528 return getPreloadedValue(DAG, *MFI, VT,
9531 case Intrinsic::amdgcn_dispatch_id: {
9534 case Intrinsic::amdgcn_rcp:
9536 case Intrinsic::amdgcn_rsq:
9538 case Intrinsic::amdgcn_rsq_legacy:
9542 case Intrinsic::amdgcn_rcp_legacy:
9546 case Intrinsic::amdgcn_rsq_clamp: {
9557 return DAG.
getNode(ISD::FMAXNUM,
DL, VT, Tmp,
9560 case Intrinsic::r600_read_ngroups_x:
9561 if (Subtarget->isAmdHsaOS())
9564 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9567 case Intrinsic::r600_read_ngroups_y:
9568 if (Subtarget->isAmdHsaOS())
9571 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9574 case Intrinsic::r600_read_ngroups_z:
9575 if (Subtarget->isAmdHsaOS())
9578 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9581 case Intrinsic::r600_read_local_size_x:
9582 if (Subtarget->isAmdHsaOS())
9585 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9587 case Intrinsic::r600_read_local_size_y:
9588 if (Subtarget->isAmdHsaOS())
9591 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9593 case Intrinsic::r600_read_local_size_z:
9594 if (Subtarget->isAmdHsaOS())
9597 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9599 case Intrinsic::amdgcn_workgroup_id_x:
9600 return getPreloadedValue(DAG, *MFI, VT,
9602 case Intrinsic::amdgcn_workgroup_id_y:
9603 return getPreloadedValue(DAG, *MFI, VT,
9605 case Intrinsic::amdgcn_workgroup_id_z:
9606 return getPreloadedValue(DAG, *MFI, VT,
9608 case Intrinsic::amdgcn_wave_id:
9609 return lowerWaveID(DAG,
Op);
9610 case Intrinsic::amdgcn_lds_kernel_id: {
9612 return getLDSKernelId(DAG,
DL);
9613 return getPreloadedValue(DAG, *MFI, VT,
9616 case Intrinsic::amdgcn_workitem_id_x:
9617 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
9618 case Intrinsic::amdgcn_workitem_id_y:
9619 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
9620 case Intrinsic::amdgcn_workitem_id_z:
9621 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
9622 case Intrinsic::amdgcn_wavefrontsize:
9624 SDLoc(
Op), MVT::i32);
9625 case Intrinsic::amdgcn_s_buffer_load: {
9626 unsigned CPol =
Op.getConstantOperandVal(3);
9633 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
9634 Op.getOperand(3), DAG);
9636 case Intrinsic::amdgcn_fdiv_fast:
9637 return lowerFDIV_FAST(
Op, DAG);
9638 case Intrinsic::amdgcn_sin:
9641 case Intrinsic::amdgcn_cos:
9644 case Intrinsic::amdgcn_mul_u24:
9647 case Intrinsic::amdgcn_mul_i24:
9651 case Intrinsic::amdgcn_log_clamp: {
9657 case Intrinsic::amdgcn_fract:
9660 case Intrinsic::amdgcn_class:
9663 case Intrinsic::amdgcn_div_fmas:
9665 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
9667 case Intrinsic::amdgcn_div_fixup:
9669 Op.getOperand(2),
Op.getOperand(3));
9671 case Intrinsic::amdgcn_div_scale: {
9684 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
9687 Denominator, Numerator);
9689 case Intrinsic::amdgcn_icmp: {
9691 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
9692 Op.getConstantOperandVal(2) == 0 &&
9697 case Intrinsic::amdgcn_fcmp: {
9700 case Intrinsic::amdgcn_ballot:
9702 case Intrinsic::amdgcn_fmed3:
9704 Op.getOperand(2),
Op.getOperand(3));
9705 case Intrinsic::amdgcn_fdot2:
9707 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
9708 case Intrinsic::amdgcn_fmul_legacy:
9711 case Intrinsic::amdgcn_sffbh:
9713 case Intrinsic::amdgcn_sbfe:
9715 Op.getOperand(2),
Op.getOperand(3));
9716 case Intrinsic::amdgcn_ubfe:
9718 Op.getOperand(2),
Op.getOperand(3));
9719 case Intrinsic::amdgcn_cvt_pkrtz:
9720 case Intrinsic::amdgcn_cvt_pknorm_i16:
9721 case Intrinsic::amdgcn_cvt_pknorm_u16:
9722 case Intrinsic::amdgcn_cvt_pk_i16:
9723 case Intrinsic::amdgcn_cvt_pk_u16: {
9725 EVT VT =
Op.getValueType();
9728 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
9730 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
9732 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
9734 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
9740 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
9743 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
9744 return DAG.
getNode(ISD::BITCAST,
DL, VT, Node);
9746 case Intrinsic::amdgcn_fmad_ftz:
9748 Op.getOperand(2),
Op.getOperand(3));
9750 case Intrinsic::amdgcn_if_break:
9752 Op->getOperand(1),
Op->getOperand(2)),
9755 case Intrinsic::amdgcn_groupstaticsize: {
9761 const GlobalValue *GV =
9767 case Intrinsic::amdgcn_is_shared:
9768 case Intrinsic::amdgcn_is_private: {
9771 DAG.
getNode(ISD::BITCAST,
DL, MVT::v2i32,
Op.getOperand(1));
9775 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
9779 Subtarget->hasGloballyAddressableScratch()) {
9782 AMDGPU::S_MOV_B32,
DL, MVT::i32,
9783 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
9792 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
9795 case Intrinsic::amdgcn_perm:
9797 Op.getOperand(2),
Op.getOperand(3));
9798 case Intrinsic::amdgcn_reloc_constant: {
9808 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
9809 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
9810 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
9811 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
9812 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
9813 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
9814 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
9815 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
9816 if (
Op.getOperand(4).getValueType() == MVT::i32)
9822 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
9823 Op.getOperand(3), IndexKeyi32);
9825 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
9826 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
9827 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
9828 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
9829 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
9830 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
9831 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
9832 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
9833 if (
Op.getOperand(4).getValueType() == MVT::i64)
9839 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9840 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
9843 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
9844 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
9845 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
9846 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
9847 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
9848 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
9849 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
9852 if (
Op.getOperand(6).getValueType() == IndexKeyTy)
9858 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9859 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
9860 IndexKey, Op.getOperand(7),
9863 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
9864 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
9865 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
9866 if (
Op.getOperand(6).getValueType() == MVT::i32)
9872 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9873 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
9874 IndexKeyi32, Op.getOperand(7)});
9876 case Intrinsic::amdgcn_addrspacecast_nonnull:
9877 return lowerADDRSPACECAST(
Op, DAG);
9878 case Intrinsic::amdgcn_readlane:
9879 case Intrinsic::amdgcn_readfirstlane:
9880 case Intrinsic::amdgcn_writelane:
9881 case Intrinsic::amdgcn_permlane16:
9882 case Intrinsic::amdgcn_permlanex16:
9883 case Intrinsic::amdgcn_permlane64:
9884 case Intrinsic::amdgcn_set_inactive:
9885 case Intrinsic::amdgcn_set_inactive_chain_arg:
9886 case Intrinsic::amdgcn_mov_dpp8:
9887 case Intrinsic::amdgcn_update_dpp:
9889 case Intrinsic::amdgcn_dead: {
9891 for (
const EVT ValTy :
Op.getNode()->values())
9896 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9898 return lowerImage(
Op, ImageDimIntr, DAG,
false);
9909 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
9915 unsigned NewOpcode)
const {
9919 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9920 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9938 M->getMemOperand());
9943 unsigned NewOpcode)
const {
9947 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9948 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
9966 M->getMemOperand());
9971 unsigned IntrID =
Op.getConstantOperandVal(1);
9975 case Intrinsic::amdgcn_ds_ordered_add:
9976 case Intrinsic::amdgcn_ds_ordered_swap: {
9981 unsigned IndexOperand = M->getConstantOperandVal(7);
9982 unsigned WaveRelease = M->getConstantOperandVal(8);
9983 unsigned WaveDone = M->getConstantOperandVal(9);
9985 unsigned OrderedCountIndex = IndexOperand & 0x3f;
9986 IndexOperand &= ~0x3f;
9987 unsigned CountDw = 0;
9990 CountDw = (IndexOperand >> 24) & 0xf;
9991 IndexOperand &= ~(0xf << 24);
9993 if (CountDw < 1 || CountDw > 4) {
9996 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10002 if (IndexOperand) {
10005 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10008 if (WaveDone && !WaveRelease) {
10012 Fn, "ds_ordered_count: wave_done requires wave_release",
10013 DL.getDebugLoc()));
10016 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10017 unsigned ShaderType =
10019 unsigned Offset0 = OrderedCountIndex << 2;
10020 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10023 Offset1 |= (CountDw - 1) << 6;
10026 Offset1 |= ShaderType << 2;
10028 unsigned Offset = Offset0 | (Offset1 << 8);
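// Note: the ds_ordered_count offset is a packed immediate built from the fields
// computed above: Offset0 (low byte) holds the ordered-count index shifted left
// by 2, and Offset1 (next byte) packs wave_release, wave_done, the instruction
// kind, the shader type, and, on targets that encode it, the dword count minus
// one at bit 6. For example, index 1 with only wave_release set gives
// Offset0 = 4 and Offset1 = 1, so Offset = 0x104.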
10035 M->getVTList(),
Ops,
M->getMemoryVT(),
10036 M->getMemOperand());
10038 case Intrinsic::amdgcn_raw_buffer_load:
10039 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10040 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10041 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10042 case Intrinsic::amdgcn_raw_buffer_load_format:
10043 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10044 const bool IsFormat =
10045 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10046 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10048 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10049 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10063 return lowerIntrinsicLoad(M, IsFormat, DAG,
Ops);
10065 case Intrinsic::amdgcn_struct_buffer_load:
10066 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10067 case Intrinsic::amdgcn_struct_buffer_load_format:
10068 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10069 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10070 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10071 const bool IsFormat =
10072 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10073 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10075 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10076 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10091 case Intrinsic::amdgcn_raw_tbuffer_load:
10092 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10094 EVT LoadVT =
Op.getValueType();
10095 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10096 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10115 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10118 case Intrinsic::amdgcn_struct_tbuffer_load:
10119 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10121 EVT LoadVT =
Op.getValueType();
10122 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10123 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10142 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10145 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10146 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10148 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10149 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10150 return lowerStructBufferAtomicIntrin(
Op, DAG,
10152 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10153 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10155 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10156 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10157 return lowerStructBufferAtomicIntrin(
Op, DAG,
10159 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10160 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10162 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10163 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10164 return lowerStructBufferAtomicIntrin(
Op, DAG,
10166 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10167 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10169 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10170 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10172 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10173 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10175 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10176 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10178 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10179 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10181 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10182 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10184 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10185 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10187 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10188 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10190 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10191 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10193 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10194 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10196 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10197 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10199 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10200 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10202 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10203 return lowerRawBufferAtomicIntrin(
Op, DAG,
10205 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10206 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10207 return lowerStructBufferAtomicIntrin(
Op, DAG,
10209 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10210 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10212 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10213 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10215 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10216 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10217 return lowerStructBufferAtomicIntrin(
Op, DAG,
10219 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10220 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10221 return lowerStructBufferAtomicIntrin(
Op, DAG,
10223 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10224 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10225 return lowerStructBufferAtomicIntrin(
Op, DAG,
10227 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10228 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10229 return lowerStructBufferAtomicIntrin(
Op, DAG,
10231 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10232 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10234 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10235 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10237 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10238 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10240 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10241 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10243 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10244 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10246 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10247 return lowerStructBufferAtomicIntrin(
Op, DAG,
10250 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10251 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10252 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
10253 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10267 EVT VT =
Op.getValueType();
10271 Op->getVTList(),
Ops, VT,
10272 M->getMemOperand());
10274 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10275 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10276 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
10277 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(6), DAG);
10291 EVT VT =
Op.getValueType();
10295 Op->getVTList(),
Ops, VT,
10296 M->getMemOperand());
10298 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10299 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10301 SDValue NodePtr =
M->getOperand(2);
10302 SDValue RayExtent =
M->getOperand(3);
10303 SDValue InstanceMask =
M->getOperand(4);
10304 SDValue RayOrigin =
M->getOperand(5);
10305 SDValue RayDir =
M->getOperand(6);
10307 SDValue TDescr =
M->getOperand(8);
10312 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10317 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10318 const unsigned NumVDataDwords = 10;
10319 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10321 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10322 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10323 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10327 Ops.push_back(NodePtr);
10330 {DAG.getBitcast(MVT::i32, RayExtent),
10331 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10332 Ops.push_back(RayOrigin);
10333 Ops.push_back(RayDir);
10334 Ops.push_back(Offsets);
10335 Ops.push_back(TDescr);
10336 Ops.push_back(
M->getChain());
10339 MachineMemOperand *MemRef =
M->getMemOperand();
10343 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10345 SDValue NodePtr =
M->getOperand(2);
10346 SDValue RayExtent =
M->getOperand(3);
10347 SDValue RayOrigin =
M->getOperand(4);
10348 SDValue RayDir =
M->getOperand(5);
10349 SDValue RayInvDir =
M->getOperand(6);
10350 SDValue TDescr =
M->getOperand(7);
10357 if (!Subtarget->hasGFX10_AEncoding()) {
10367 const unsigned NumVDataDwords = 4;
10368 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10369 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10370 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10373 const unsigned BaseOpcodes[2][2] = {
10374 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10375 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10376 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10380 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10381 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10382 : AMDGPU::MIMGEncGfx10NSA,
10383 NumVDataDwords, NumVAddrDwords);
10387 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10388 : AMDGPU::MIMGEncGfx10Default,
10389 NumVDataDwords, NumVAddrDwords);
10395 auto packLanes = [&DAG, &
Ops, &
DL](
SDValue Op,
bool IsAligned) {
10398 if (Lanes[0].getValueSizeInBits() == 32) {
10399 for (
unsigned I = 0;
I < 3; ++
I)
10406 Ops.push_back(Lanes[2]);
10418 if (UseNSA && IsGFX11Plus) {
10419 Ops.push_back(NodePtr);
10421 Ops.push_back(RayOrigin);
10426 for (
unsigned I = 0;
I < 3; ++
I) {
10429 {DirLanes[I], InvDirLanes[I]})));
10433 Ops.push_back(RayDir);
10434 Ops.push_back(RayInvDir);
10441 Ops.push_back(NodePtr);
10444 packLanes(RayOrigin,
true);
10445 packLanes(RayDir,
true);
10446 packLanes(RayInvDir,
false);
10451 if (NumVAddrDwords > 12) {
10453 Ops.append(16 -
Ops.size(), Undef);
10459 Ops.push_back(MergedOps);
10462 Ops.push_back(TDescr);
10464 Ops.push_back(
M->getChain());
10467 MachineMemOperand *MemRef =
M->getMemOperand();
10471 case Intrinsic::amdgcn_global_atomic_fmin_num:
10472 case Intrinsic::amdgcn_global_atomic_fmax_num:
10473 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10474 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10481 unsigned Opcode = 0;
10483 case Intrinsic::amdgcn_global_atomic_fmin_num:
10484 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10485 Opcode = ISD::ATOMIC_LOAD_FMIN;
10488 case Intrinsic::amdgcn_global_atomic_fmax_num:
10489 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10490 Opcode = ISD::ATOMIC_LOAD_FMAX;
10496 return DAG.
getAtomic(Opcode, SDLoc(
Op),
M->getMemoryVT(),
M->getVTList(),
10497 Ops,
M->getMemOperand());
10499 case Intrinsic::amdgcn_s_get_barrier_state:
10500 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10507 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10508 BarID = (BarID >> 4) & 0x3F;
10509 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10512 Ops.push_back(Chain);
10514 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10515 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10523 Ops.push_back(
copyToM0(DAG, Chain,
DL, M0Val).getValue(0));
10531 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10532 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10533 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10537 EVT VT =
Op->getValueType(0);
10543 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10545 return lowerImage(
Op, ImageDimIntr, DAG,
true);
10553SDValue SITargetLowering::getMemIntrinsicNode(
unsigned Opcode,
const SDLoc &
DL,
10560 EVT VT = VTList.
VTs[0];
10563 bool IsTFE = VTList.
NumVTs == 3;
10566 unsigned NumOpDWords = NumValueDWords + 1;
10568 SDVTList OpDWordsVTList = DAG.
getVTList(OpDWordsVT, VTList.
VTs[2]);
10569 MachineMemOperand *OpDWordsMMO =
10571 SDValue Op = getMemIntrinsicNode(Opcode,
DL, OpDWordsVTList,
Ops,
10572 OpDWordsVT, OpDWordsMMO, DAG);
10577 NumValueDWords == 1
10586 if (!Subtarget->hasDwordx3LoadStores() &&
10587 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10591 SDVTList WidenedVTList = DAG.
getVTList(WidenedVT, VTList.
VTs[1]);
10593 WidenedMemVT, WidenedMMO);
10603 bool ImageStore)
const {
10613 if (Subtarget->hasUnpackedD16VMem()) {
10627 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10638 for (
unsigned I = 0;
I < Elts.
size() / 2;
I += 1) {
10644 if ((NumElements % 2) == 1) {
10646 unsigned I = Elts.
size() / 2;
10662 if (NumElements == 3) {
10672 return DAG.
getNode(ISD::BITCAST,
DL, WidenedStoreVT, ZExt);
10683 unsigned IntrinsicID =
Op.getConstantOperandVal(1);
10686 switch (IntrinsicID) {
10687 case Intrinsic::amdgcn_exp_compr: {
10688 if (!Subtarget->hasCompressedExport()) {
10691 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
10703 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32, Src0),
10704 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32, Src1),
10713 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10717 case Intrinsic::amdgcn_struct_tbuffer_store:
10718 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10720 bool IsD16 = (VData.
getValueType().getScalarType() == MVT::f16);
10722 VData = handleD16VData(VData, DAG);
10723 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10724 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10742 M->getMemoryVT(),
M->getMemOperand());
10745 case Intrinsic::amdgcn_raw_tbuffer_store:
10746 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
10748 bool IsD16 = (VData.
getValueType().getScalarType() == MVT::f16);
10750 VData = handleD16VData(VData, DAG);
10751 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10752 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10770 M->getMemoryVT(),
M->getMemOperand());
10773 case Intrinsic::amdgcn_raw_buffer_store:
10774 case Intrinsic::amdgcn_raw_ptr_buffer_store:
10775 case Intrinsic::amdgcn_raw_buffer_store_format:
10776 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
10777 const bool IsFormat =
10778 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
10779 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
10786 VData = handleD16VData(VData, DAG);
10796 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10797 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10817 return handleByteShortBufferStores(DAG, VDataVT,
DL,
Ops, M);
10820 M->getMemoryVT(),
M->getMemOperand());
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
      VData = handleD16VData(VData, DAG);
    auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
    EVT VDataType = VData.getValueType().getScalarType();
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
                                   M->getMemoryVT(), M->getMemOperand());
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    if (!Subtarget->hasVMemToLDSLoad())
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
    unsigned OpOffset = HasVIndex ? 1 : 0;
    SDValue VOffset = Op.getOperand(5 + OpOffset);
    unsigned Size = Op->getConstantOperandVal(4);
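    // The LDS-load opcode below is picked from the transfer size (1, 2, 4, 12
    // or 16 bytes) and from whether a VGPR index and/or VGPR offset operand is
    // present (BOTHEN / IDXEN / OFFEN / OFFSET variants).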
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
    if (HasVIndex && HasVOffset)
    else if (HasVIndex)
      Ops.push_back(Op.getOperand(5));
    else if (HasVOffset)
      Ops.push_back(VOffset);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    Ops.push_back(Rsrc);
    Ops.push_back(Op.getOperand(6 + OpOffset));
    Ops.push_back(Op.getOperand(7 + OpOffset));
    unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
    MachineMemOperand *LoadMMO = M->getMemOperand();
    MachinePointerInfo StorePtrI = LoadPtrI;
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds: {
    if (!Subtarget->hasVMemToLDSLoad())
    unsigned Size = Op->getConstantOperandVal(4);
      Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
      Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
    if (LHS->isDivergent())
          RHS.getOperand(0).getValueType() == MVT::i32) {
        VOffset = RHS.getOperand(0);
      Ops.push_back(Addr);
      Ops.push_back(VOffset);
    Ops.push_back(Op.getOperand(5));
    Ops.push_back(Op.getOperand(6));
    MachineMemOperand *LoadMMO = M->getMemOperand();
    LoadPtrI.Offset = Op->getConstantOperandVal(5);
    MachinePointerInfo StorePtrI = LoadPtrI;
  case Intrinsic::amdgcn_end_cf:
                                 Op->getOperand(2), Chain),
  case Intrinsic::amdgcn_s_barrier_init:
  case Intrinsic::amdgcn_s_barrier_signal_var: {
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
                       ? AMDGPU::S_BARRIER_INIT_M0
                       : AMDGPU::S_BARRIER_SIGNAL_M0;
    constexpr unsigned ShAmt = 16;
    Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
  case Intrinsic::amdgcn_s_barrier_join: {
      Opc = AMDGPU::S_BARRIER_JOIN_IMM;
      unsigned BarID = (BarVal >> 4) & 0x3F;
      Ops.push_back(Chain);
      Opc = AMDGPU::S_BARRIER_JOIN_M0;
      Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
  case Intrinsic::amdgcn_s_prefetch_data: {
      return Op.getOperand(0);
  case Intrinsic::amdgcn_s_buffer_prefetch_data: {
        Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
                                   Op->getVTList(), Ops, M->getMemoryVT(),
                                   M->getMemOperand());
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
      return lowerImage(Op, ImageDimIntr, DAG, true);
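// splitBufferOffsets: split a combined buffer offset into a VGPR offset and an
// immediate; immediate bits that do not fit the MUBUF encoding are folded back
// into the VGPR part.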
std::pair<SDValue, SDValue>
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
  auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        Align Alignment) const {
  SDLoc DL(CombinedOffset);
    uint32_t Imm = C->getZExtValue();
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
    uint32_t SOffset, ImmOffset;
        TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
  SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
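// Buffer resources may arrive either as a v4i32 descriptor or as a 128-bit
// buffer resource pointer; bufferRsrcPtrToVector normalizes the latter to a
// vector before the memory node is built.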
SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
    return MaybePointer;
  SDValue NumRecords = Op->getOperand(3);
  auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
  std::optional<uint32_t> ConstStride = std::nullopt;
    ConstStride = ConstNode->getZExtValue();
  if (!ConstStride || *ConstStride != 0) {
      ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
                             NewHighHalf, NumRecords, Flags);
  SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
                                                   bool IsTFE) const {
    SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
  LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
  if (VDataType == MVT::f16 || VDataType == MVT::bf16)
  Ops[1] = BufferStoreExt;
                                 M->getMemOperand());
                                         DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
         "unexpected vector extload");
         "unexpected fp extload");
  DCI.AddToWorklist(Cvt.getNode());
    DCI.AddToWorklist(Cvt.getNode());
    Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
  if (Info.isEntryFunction())
    return Info.getUserSGPRInfo().hasFlatScratchInit();
  EVT MemVT = Load->getMemoryVT();
  MachineMemOperand *MMO = Load->getMemOperand();
    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");
  unsigned AS = Load->getAddressSpace();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
      !Subtarget->hasMultiDwordFlatScratchAddressing())
        Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
        Alignment >= Align(4) && NumElements < 32) {
          (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
    if (NumElements > 4)
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
    switch (Subtarget->getMaxPrivateElementSize()) {
      if (NumElements > 2)
      if (NumElements > 4)
      if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
    auto Flags = Load->getMemOperand()->getFlags();
                                        Load->getAlign(), Flags, &Fast) &&
                                      MemVT, *Load->getMemOperand())) {
  EVT VT = Op.getValueType();
  return DAG.getNode(ISD::BITCAST, DL, VT, Res);
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();
  bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
    if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
    if (CLHS->isExactlyValue(1.0)) {
    if (CLHS->isExactlyValue(-1.0)) {
  if (!AllowInaccurateRcp &&
      ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();
  bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
  if (!AllowInaccurateDiv)
  return DAG.getNode(Opcode, SL, VT, A, B, Flags);
  return DAG.getNode(Opcode, SL, VTList,
  return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
  return DAG.getNode(Opcode, SL, VTList,
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;
  EVT VT = Op.getValueType();
  if (VT == MVT::bf16) {
  unsigned FMADOpCode =
  SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
  SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
  Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
  Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
  Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
  SDNodeFlags Flags = Op->getFlags();
  const APFloat K0Val(0x1p+96f);
  const APFloat K1Val(0x1p-32f);
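// The f32 division lowering below refines V_RCP_F32 with an FMA-based
// Newton-Raphson sequence; FP32 denormals are temporarily enabled around the
// refinement (S_DENORM_MODE, or S_SETREG on older subtargets) so intermediate
// products are not flushed.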
  assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
  uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
  uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;
  SDNodeFlags Flags = Op->getFlags();
  Flags.setNoFPExcept(true);
  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
      DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
  using namespace AMDGPU::Hwreg;
  const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const DenormalMode DenormMode = Info->getMode().FP32Denormals;
  const bool HasDynamicDenormals =
  if (!PreservesDenormals) {
    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    if (HasDynamicDenormals) {
      SavedDenormMode = SDValue(GetReg, 0);
    SDNode *EnableDenorm;
    if (Subtarget->hasDenormModeInst()) {
      const SDValue EnableDenormValue =
      const SDValue EnableDenormValue =
      EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
                                        {EnableDenormValue, BitField, Glue});
                             ApproxRcp, One, NegDivScale0, Flags);
                             ApproxRcp, Fma0, Flags);
                             NumeratorScaled, Mul, Flags);
                             NumeratorScaled, Fma3, Flags);
  if (!PreservesDenormals) {
    SDNode *DisableDenorm;
    if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
      SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      assert(HasDynamicDenormals == (bool)SavedDenormMode);
      const SDValue DisableDenormValue =
          HasDynamicDenormals
          AMDGPU::S_SETREG_B32, SL, MVT::Other,
                             {Fma4, Fma1, Fma3, Scale}, Flags);
  if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
    return FastLowered;
  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
  if (!Subtarget->hasUsableDivScaleConditionOutput()) {
    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
  EVT VT = Op.getValueType();
  if (VT == MVT::f32)
    return LowerFDIV32(Op, DAG);
  if (VT == MVT::f64)
    return LowerFDIV64(Op, DAG);
  if (VT == MVT::f16 || VT == MVT::bf16)
    return LowerFDIV16(Op, DAG);
  EVT ResultExpVT = Op->getValueType(1);
  EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
  if (Subtarget->hasFractBug()) {
  EVT VT = Store->getMemoryVT();
  if (VT == MVT::i1) {
                        Store->getBasePtr(), MVT::i1, Store->getMemOperand());
         Store->getValue().getValueType().getScalarType() == MVT::i32);
  unsigned AS = Store->getAddressSpace();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    if (NumElements > 4)
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
                                     VT, *Store->getMemOperand()))
    switch (Subtarget->getMaxPrivateElementSize()) {
      if (NumElements > 2)
      if (NumElements > 4 ||
          (NumElements == 3 && !Subtarget->enableFlatScratch()))
    auto Flags = Store->getMemOperand()->getFlags();
  assert(!Subtarget->has16BitInsts());
  SDNodeFlags Flags = Op->getFlags();
      DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
  SDNodeFlags Flags = Op->getFlags();
  MVT VT = Op.getValueType().getSimpleVT();
  SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
      DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
  SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
  SDNodeFlags Flags = Op->getFlags();
  SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
  EVT VT = Op.getValueType();
  if (Subtarget->hasTrigReducedRange()) {
  switch (Op.getOpcode()) {
  EVT VT = Op.getValueType();
                                 Op->getVTList(), Ops, VT,
SITargetLowering::performUCharToFloatCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
  SelectionDAG &DAG = DCI.DAG;
  EVT SrcVT = Src.getValueType();
  if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
    DCI.AddToWorklist(Cvt.getNode());
  if (ScalarVT != MVT::f32) {
                                                 DAGCombinerInfo &DCI) const {
  if (SignOp.getOpcode() == ISD::FP_EXTEND ||
  SelectionDAG &DAG = DCI.DAG;
  for (unsigned I = 0; I != NumElts; ++I) {
  if (NewElts.size() == 1)
  for (unsigned I = 0; I != NumElts; ++I) {
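// performSHLPtrCombine: (shl (add x, c1), c2) feeding a memory address can be
// rewritten as (add (shl x, c2), c1 << c2) when the scaled constant is a legal
// immediate offset for the address space.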
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  AM.BaseOffs = Offset.getSExtValue();
  EVT VT = N->getValueType(0);
  Flags.setNoUnsignedWrap(
      N->getFlags().hasNoUnsignedWrap() &&
  switch (N->getOpcode()) {
                                          DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
                                        N->getMemoryVT(), DCI);
  NewOps[PtrIdx] = NewPtr;
  return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
         (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
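// splitBinaryBitConstantOp: a 64-bit bitwise op with a constant RHS is split
// into two 32-bit operations on the low and high halves so each half can use
// a 32-bit inline constant.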
SDValue SITargetLowering::splitBinaryBitConstantOp(
  uint32_t ValLo = Lo_32(Val);
  uint32_t ValHi = Hi_32(Val);
  if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
  if (V.getValueType() != MVT::i1)
  switch (V.getOpcode()) {
    return V.getResNo() == 1;
    unsigned IntrinsicID = V.getConstantOperandVal(0);
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_is_shared:
    case Intrinsic::amdgcn_is_private:
  if (!(C & 0x000000ff))
    ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00))
    ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000))
    ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000))
    ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask;
  if ((NonZeroByteMask & C) != NonZeroByteMask)
  assert(V.getValueSizeInBits() == 32);
  if (V.getNumOperands() != 2)
  switch (V.getOpcode()) {
    return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    return (0x03020100 & ~ConstMask) | ConstMask;
    return uint32_t((0x030201000c0c0c0cull << C) >> 32);
    return uint32_t(0x0c0c0c0c03020100ull >> C);
                                            DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (VT == MVT::i64 && CRHS) {
            splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
  if (CRHS && VT == MVT::i32) {
      unsigned Shift = CShift->getZExtValue();
      unsigned Offset = NB + Shift;
      if ((Offset & (Bits - 1)) == 0) {
      Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
    if (X != LHS.getOperand(1))
    const ConstantFPSDNode *C1 =
         (RHS.getOperand(0) == LHS.getOperand(0) &&
          LHS.getOperand(0) == LHS.getOperand(1))) {
      unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
                                          : Mask->getZExtValue() & OrdMask;
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
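    // When both AND operands are byte-select patterns over 32-bit sources, the
    // two constant select masks can be merged into a single V_PERM_B32,
    // provided the byte lanes they read do not overlap.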
    if (LHSMask != ~0u && RHSMask != ~0u) {
      if (LHSMask > RHSMask) {
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        uint32_t Mask = LHSMask & RHSMask;
        for (unsigned I = 0; I < 32; I += 8) {
          uint32_t ByteSel = 0xff << I;
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;
        uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
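// The ByteProvider helpers below trace which source byte ultimately supplies a
// given byte of a 32-bit value; std::nullopt means the byte cannot be
// attributed to a single source.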
static const std::optional<ByteProvider<SDValue>>
                                       unsigned Depth = 0) {
    return std::nullopt;
  if (Op.getValueSizeInBits() < 8)
    return std::nullopt;
  if (Op.getValueType().isVector())
  switch (Op->getOpcode()) {
      NarrowVT = VTSign->getVT();
      return std::nullopt;
    if (SrcIndex >= NarrowByteWidth)
      return std::nullopt;
      return std::nullopt;
    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8 != 0)
      return std::nullopt;
    SrcIndex += BitShift / 8;
static const std::optional<ByteProvider<SDValue>>
                                           unsigned StartingIndex = 0) {
    return std::nullopt;
  unsigned BitWidth = Op.getScalarValueSizeInBits();
    return std::nullopt;
    return std::nullopt;
  bool IsVec = Op.getValueType().isVector();
  switch (Op.getOpcode()) {
13172 return std::nullopt;
13177 return std::nullopt;
13181 return std::nullopt;
13184 if (!
LHS->isConstantZero() && !
RHS->isConstantZero())
13185 return std::nullopt;
13186 if (!
LHS ||
LHS->isConstantZero())
13188 if (!
RHS ||
RHS->isConstantZero())
13190 return std::nullopt;
13195 return std::nullopt;
13199 return std::nullopt;
13201 uint32_t BitMask = BitMaskOp->getZExtValue();
13203 uint32_t IndexMask = 0xFF << (Index * 8);
13205 if ((IndexMask & BitMask) != IndexMask) {
13208 if (IndexMask & BitMask)
13209 return std::nullopt;
13218 return std::nullopt;
13222 if (!ShiftOp ||
Op.getValueType().isVector())
13223 return std::nullopt;
13225 uint64_t BitsProvided =
Op.getValueSizeInBits();
13226 if (BitsProvided % 8 != 0)
13227 return std::nullopt;
13229 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13231 return std::nullopt;
13233 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13234 uint64_t ByteShift = BitShift / 8;
13236 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13237 uint64_t BytesProvided = BitsProvided / 8;
13238 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13239 NewIndex %= BytesProvided;
13246 return std::nullopt;
13250 return std::nullopt;
13252 uint64_t BitShift = ShiftOp->getZExtValue();
13254 return std::nullopt;
13256 auto BitsProvided =
Op.getScalarValueSizeInBits();
13257 if (BitsProvided % 8 != 0)
13258 return std::nullopt;
13260 uint64_t BytesProvided = BitsProvided / 8;
13261 uint64_t ByteShift = BitShift / 8;
13266 return BytesProvided - ByteShift > Index
13274 return std::nullopt;
13278 return std::nullopt;
13280 uint64_t BitShift = ShiftOp->getZExtValue();
13281 if (BitShift % 8 != 0)
13282 return std::nullopt;
13283 uint64_t ByteShift = BitShift / 8;
13289 return Index < ByteShift
13292 Depth + 1, StartingIndex);
13301 return std::nullopt;
13309 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13311 if (NarrowBitWidth % 8 != 0)
13312 return std::nullopt;
13313 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13315 if (Index >= NarrowByteWidth)
13317 ? std::optional<ByteProvider<SDValue>>(
13325 return std::nullopt;
13329 if (NarrowByteWidth >= Index) {
13334 return std::nullopt;
13341 return std::nullopt;
13347 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13348 if (NarrowBitWidth % 8 != 0)
13349 return std::nullopt;
13350 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13355 if (Index >= NarrowByteWidth) {
13357 ? std::optional<ByteProvider<SDValue>>(
13362 if (NarrowByteWidth > Index) {
13366 return std::nullopt;
13371 return std::nullopt;
13374 Depth + 1, StartingIndex);
13380 return std::nullopt;
13381 auto VecIdx = IdxOp->getZExtValue();
13382 auto ScalarSize =
Op.getScalarValueSizeInBits();
13383 if (ScalarSize < 32)
13384 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13386 StartingIndex, Index);
13391 return std::nullopt;
13395 return std::nullopt;
13398 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13399 if (IdxMask > 0x07 && IdxMask != 0x0c)
13400 return std::nullopt;
13402 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13403 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13405 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
13411 return std::nullopt;
13426 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
13433 auto MemVT = L->getMemoryVT();
13436 return L->getMemoryVT().getSizeInBits() == 16;
13446 int Low8 = Mask & 0xff;
13447 int Hi8 = (Mask & 0xff00) >> 8;
13449 assert(Low8 < 8 && Hi8 < 8);
13451 bool IsConsecutive = (Hi8 - Low8 == 1);
13456 bool Is16Aligned = !(Low8 % 2);
13458 return IsConsecutive && Is16Aligned;
13466 int Low16 = PermMask & 0xffff;
13467 int Hi16 = (PermMask & 0xffff0000) >> 16;
13477 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13479 if (!OtherOpIs16Bit)
                               unsigned DWordOffset) {
  assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
  if (Src.getValueType().isVector()) {
    auto ScalarTySize = Src.getScalarValueSizeInBits();
    auto ScalarTy = Src.getValueType().getScalarType();
    if (ScalarTySize == 32) {
    if (ScalarTySize > 32) {
          DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
      auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
    assert(ScalarTySize < 32);
    auto NumElements = TypeSize / ScalarTySize;
    auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
    auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
    auto NumElementsIn32 = 32 / ScalarTySize;
    auto NumAvailElements = DWordOffset < Trunc32Elements
                                : NumElements - NormalizedTrunc;
  auto ShiftVal = 32 * DWordOffset;
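// matchPERM collects a ByteProvider for each of the four result bytes and,
// when they all come from at most two 32-bit sources, rewrites the expression
// as a single V_PERM_B32 with the computed select mask.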
  [[maybe_unused]] EVT VT = N->getValueType(0);
  for (int i = 0; i < 4; i++) {
    std::optional<ByteProvider<SDValue>> P =
    if (!P || P->isConstantZero())
  if (PermNodes.size() != 4)
  std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
  std::optional<std::pair<unsigned, unsigned>> SecondSrc;
  for (size_t i = 0; i < PermNodes.size(); i++) {
    auto PermOp = PermNodes[i];
    int SrcByteAdjust = 4;
    if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
        ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
        if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
            ((PermOp.SrcOffset / 4) != SecondSrc->second))
      SecondSrc = {i, PermNodes[i].SrcOffset / 4};
      assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
    assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
    PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
  SDValue Op = *PermNodes[FirstSrc.first].Src;
  assert(Op.getValueSizeInBits() == 32);
  int Low16 = PermMask & 0xffff;
  int Hi16 = (PermMask & 0xffff0000) >> 16;
  bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
  bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
  if (WellFormedLow && WellFormedHi)
  SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
  assert(Op.getValueType().isByteSized() &&
13633 DAGCombinerInfo &DCI)
const {
13634 SelectionDAG &DAG = DCI.DAG;
13638 EVT VT =
N->getValueType(0);
13639 if (VT == MVT::i1) {
13644 if (Src !=
RHS.getOperand(0))
13649 if (!CLHS || !CRHS)
13653 static const uint32_t MaxMask = 0x3ff;
13673 Sel |=
LHS.getConstantOperandVal(2);
13682 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13686 auto usesCombinedOperand = [](SDNode *OrUse) {
13688 if (OrUse->getOpcode() != ISD::BITCAST ||
13689 !OrUse->getValueType(0).isVector())
13693 for (
auto *VUser : OrUse->users()) {
13694 if (!VUser->getValueType(0).isVector())
13701 if (VUser->getOpcode() == VectorwiseOp)
13707 if (!
any_of(
N->users(), usesCombinedOperand))
13713 if (LHSMask != ~0u && RHSMask != ~0u) {
13716 if (LHSMask > RHSMask) {
13723 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13724 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13727 if (!(LHSUsedLanes & RHSUsedLanes) &&
13730 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13732 LHSMask &= ~RHSUsedLanes;
13733 RHSMask &= ~LHSUsedLanes;
13735 LHSMask |= LHSUsedLanes & 0x04040404;
13737 uint32_t Sel = LHSMask | RHSMask;
13745 if (LHSMask == ~0u || RHSMask == ~0u) {
13751 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
13766 if (SrcVT == MVT::i32) {
13771 DCI.AddToWorklist(LowOr.
getNode());
13772 DCI.AddToWorklist(HiBits.getNode());
13776 return DAG.
getNode(ISD::BITCAST, SL, MVT::i64, Vec);
13783 N->getOperand(0), CRHS))
13791 DAGCombinerInfo &DCI)
const {
13792 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
13799 SelectionDAG &DAG = DCI.DAG;
13801 EVT VT =
N->getValueType(0);
13802 if (CRHS && VT == MVT::i64) {
13804 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::XOR,
LHS, CRHS))
13818 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32,
LHS->getOperand(1));
13820 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32,
LHS->getOperand(2));
13824 LHS->getOperand(0), FNegLHS, FNegRHS);
13825 return DAG.
getNode(ISD::BITCAST,
DL, VT, NewSelect);
13833 DAGCombinerInfo &DCI)
const {
13834 if (!Subtarget->has16BitInsts() ||
13838 EVT VT =
N->getValueType(0);
13839 if (VT != MVT::i32)
13843 if (Src.getValueType() != MVT::i16)
13850SITargetLowering::performSignExtendInRegCombine(
SDNode *
N,
13851 DAGCombinerInfo &DCI)
const {
13858 VTSign->getVT() == MVT::i8) ||
13860 VTSign->getVT() == MVT::i16))) {
13861 assert(Subtarget->hasScalarSubwordLoads() &&
13862 "s_buffer_load_{u8, i8} are supported "
13863 "in GFX12 (or newer) architectures.");
13864 EVT VT = Src.getValueType();
13869 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
13876 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
13877 Opc,
DL, ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
13882 VTSign->getVT() == MVT::i8) ||
13884 VTSign->getVT() == MVT::i16)) &&
13893 Src.getOperand(6), Src.getOperand(7)};
13896 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
13900 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
13901 Opc, SDLoc(
N), ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
13902 return DCI.DAG.getMergeValues(
13903 {BufferLoadSignExt, BufferLoadSignExt.
getValue(1)}, SDLoc(
N));
13909 DAGCombinerInfo &DCI)
const {
13910 SelectionDAG &DAG = DCI.DAG;
13917 if (
N->getOperand(0).isUndef())
13924 DAGCombinerInfo &DCI)
const {
13925 EVT VT =
N->getValueType(0);
13940 if ((VT == MVT::f16 && N0.
getOpcode() == ISD::FSQRT) &&
13950 unsigned MaxDepth)
const {
13951 unsigned Opcode =
Op.getOpcode();
13956 const auto &
F = CFP->getValueAPF();
13957 if (
F.isNaN() &&
F.isSignaling())
13959 if (!
F.isDenormal())
13985 case ISD::FP_EXTEND:
13986 case ISD::FP16_TO_FP:
13987 case ISD::FP_TO_FP16:
13988 case ISD::BF16_TO_FP:
13989 case ISD::FP_TO_BF16:
14022 if (
Op.getValueType() == MVT::i32) {
14028 if (RHS->getZExtValue() == 0xffff0000) {
14038 return Op.getValueType().getScalarType() != MVT::f16;
14042 case ISD::FMINNUM_IEEE:
14043 case ISD::FMAXNUM_IEEE:
14044 case ISD::FMINIMUM:
14045 case ISD::FMAXIMUM:
14046 case ISD::FMINIMUMNUM:
14047 case ISD::FMAXIMUMNUM:
14059 if (Subtarget->supportsMinMaxDenormModes() ||
14069 for (
unsigned I = 0, E =
Op.getNumOperands();
I != E; ++
I) {
14081 for (
unsigned i = 0, e =
Op.getNumOperands(); i != e; ++i) {
14108 if (
Op.getValueType() == MVT::i16) {
14111 TruncSrc.
getOpcode() == ISD::BITCAST &&
14119 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
14121 switch (IntrinsicID) {
14122 case Intrinsic::amdgcn_cvt_pkrtz:
14123 case Intrinsic::amdgcn_cubeid:
14124 case Intrinsic::amdgcn_frexp_mant:
14125 case Intrinsic::amdgcn_fdot2:
14126 case Intrinsic::amdgcn_rcp:
14127 case Intrinsic::amdgcn_rsq:
14128 case Intrinsic::amdgcn_rsq_clamp:
14129 case Intrinsic::amdgcn_rcp_legacy:
14130 case Intrinsic::amdgcn_rsq_legacy:
14131 case Intrinsic::amdgcn_trig_preop:
14132 case Intrinsic::amdgcn_tanh:
14133 case Intrinsic::amdgcn_log:
14134 case Intrinsic::amdgcn_exp2:
14135 case Intrinsic::amdgcn_sqrt:
14153 unsigned MaxDepth)
const {
14156 unsigned Opcode =
MI->getOpcode();
14158 if (Opcode == AMDGPU::G_FCANONICALIZE)
14161 std::optional<FPValueAndVReg> FCR;
14164 if (FCR->Value.isSignaling())
14166 if (!FCR->Value.isDenormal())
14177 case AMDGPU::G_FADD:
14178 case AMDGPU::G_FSUB:
14179 case AMDGPU::G_FMUL:
14180 case AMDGPU::G_FCEIL:
14181 case AMDGPU::G_FFLOOR:
14182 case AMDGPU::G_FRINT:
14183 case AMDGPU::G_FNEARBYINT:
14184 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14185 case AMDGPU::G_INTRINSIC_TRUNC:
14186 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14187 case AMDGPU::G_FMA:
14188 case AMDGPU::G_FMAD:
14189 case AMDGPU::G_FSQRT:
14190 case AMDGPU::G_FDIV:
14191 case AMDGPU::G_FREM:
14192 case AMDGPU::G_FPOW:
14193 case AMDGPU::G_FPEXT:
14194 case AMDGPU::G_FLOG:
14195 case AMDGPU::G_FLOG2:
14196 case AMDGPU::G_FLOG10:
14197 case AMDGPU::G_FPTRUNC:
14198 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14199 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14200 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14201 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14202 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14204 case AMDGPU::G_FNEG:
14205 case AMDGPU::G_FABS:
14206 case AMDGPU::G_FCOPYSIGN:
14208 case AMDGPU::G_FMINNUM:
14209 case AMDGPU::G_FMAXNUM:
14210 case AMDGPU::G_FMINNUM_IEEE:
14211 case AMDGPU::G_FMAXNUM_IEEE:
14212 case AMDGPU::G_FMINIMUM:
14213 case AMDGPU::G_FMAXIMUM:
14214 case AMDGPU::G_FMINIMUMNUM:
14215 case AMDGPU::G_FMAXIMUMNUM: {
14216 if (Subtarget->supportsMinMaxDenormModes() ||
14223 case AMDGPU::G_BUILD_VECTOR:
14228 case AMDGPU::G_INTRINSIC:
14229 case AMDGPU::G_INTRINSIC_CONVERGENT:
14231 case Intrinsic::amdgcn_fmul_legacy:
14232 case Intrinsic::amdgcn_fmad_ftz:
14233 case Intrinsic::amdgcn_sqrt:
14234 case Intrinsic::amdgcn_fmed3:
14235 case Intrinsic::amdgcn_sin:
14236 case Intrinsic::amdgcn_cos:
14237 case Intrinsic::amdgcn_log:
14238 case Intrinsic::amdgcn_exp2:
14239 case Intrinsic::amdgcn_log_clamp:
14240 case Intrinsic::amdgcn_rcp:
14241 case Intrinsic::amdgcn_rcp_legacy:
14242 case Intrinsic::amdgcn_rsq:
14243 case Intrinsic::amdgcn_rsq_clamp:
14244 case Intrinsic::amdgcn_rsq_legacy:
14245 case Intrinsic::amdgcn_div_scale:
14246 case Intrinsic::amdgcn_div_fmas:
14247 case Intrinsic::amdgcn_div_fixup:
14248 case Intrinsic::amdgcn_fract:
14249 case Intrinsic::amdgcn_cvt_pkrtz:
14250 case Intrinsic::amdgcn_cubeid:
14251 case Intrinsic::amdgcn_cubema:
14252 case Intrinsic::amdgcn_cubesc:
14253 case Intrinsic::amdgcn_cubetc:
14254 case Intrinsic::amdgcn_frexp_mant:
14255 case Intrinsic::amdgcn_fdot2:
14256 case Intrinsic::amdgcn_trig_preop:
14257 case Intrinsic::amdgcn_tanh:
14276 if (
C.isDenormal()) {
14290 if (
C.isSignaling()) {
14313SITargetLowering::performFCanonicalizeCombine(
SDNode *
N,
14314 DAGCombinerInfo &DCI)
const {
14315 SelectionDAG &DAG = DCI.DAG;
14317 EVT VT =
N->getValueType(0);
14326 EVT VT =
N->getValueType(0);
14327 return getCanonicalConstantFP(DAG, SDLoc(
N), VT, CFP->getValueAPF());
14343 EVT EltVT =
Lo.getValueType();
14346 for (
unsigned I = 0;
I != 2; ++
I) {
14350 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14351 }
else if (
Op.isUndef()) {
14385 case ISD::FMAXNUM_IEEE:
14386 case ISD::FMAXIMUMNUM:
14388 case ISD::FMAXIMUM:
14395 case ISD::FMINNUM_IEEE:
14396 case ISD::FMINIMUMNUM:
14398 case ISD::FMINIMUM:
14424 if (!MinK || !MaxK)
14437 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14438 return DAG.
getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  if (Info->getMode().DX10Clamp) {
  if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
    return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
    return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
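// performMinMaxCombine: a min of a max (or vice versa) against two constants
// becomes a single med3 when the value type has a med3 instruction.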
14566 DAGCombinerInfo &DCI)
const {
14567 SelectionDAG &DAG = DCI.DAG;
14599 if (
SDValue Med3 = performIntMed3ImmCombine(
14604 if (
SDValue Med3 = performIntMed3ImmCombine(
14610 if (
SDValue Med3 = performIntMed3ImmCombine(
14615 if (
SDValue Med3 = performIntMed3ImmCombine(
14625 if (((
Opc == ISD::FMINNUM && Op0.
getOpcode() == ISD::FMAXNUM) ||
14626 (
Opc == ISD::FMINNUM_IEEE && Op0.
getOpcode() == ISD::FMAXNUM_IEEE) ||
14627 (
Opc == ISD::FMINIMUMNUM && Op0.
getOpcode() == ISD::FMAXIMUMNUM) ||
14630 (VT == MVT::f32 || VT == MVT::f64 ||
14631 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14632 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14633 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14634 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14636 if (
SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(
N), Op0, Op1))
14643 const SDNodeFlags
Flags =
N->getFlags();
14644 if ((
Opc == ISD::FMINIMUM ||
Opc == ISD::FMAXIMUM) &&
14645 !Subtarget->hasIEEEMinimumMaximumInsts() &&
Flags.hasNoNaNs()) {
14647 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
14648 return DAG.
getNode(NewOpc, SDLoc(
N), VT, Op0, Op1, Flags);
14658 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
14659 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
14668 DAGCombinerInfo &DCI)
const {
14669 EVT VT =
N->getValueType(0);
14673 SelectionDAG &DAG = DCI.DAG;
14688 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
14692 if (
Info->getMode().DX10Clamp) {
14712 DAGCombinerInfo &DCI)
const {
14716 return DCI.DAG.getUNDEF(
N->getValueType(0));
14724 bool IsDivergentIdx,
14729 unsigned VecSize = EltSize * NumElem;
14732 if (VecSize <= 64 && EltSize < 32)
14741 if (IsDivergentIdx)
14745 unsigned NumInsts = NumElem +
14746 ((EltSize + 31) / 32) * NumElem ;
14750 if (Subtarget->useVGPRIndexMode())
14751 return NumInsts <= 16;
14755 if (Subtarget->hasMovrel())
14756 return NumInsts <= 15;
14762 SDValue Idx =
N->getOperand(
N->getNumOperands() - 1);
14777SITargetLowering::performExtractVectorEltCombine(
SDNode *
N,
14778 DAGCombinerInfo &DCI)
const {
14784 EVT ResVT =
N->getValueType(0);
14803 if (Vec.
hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
14822 case ISD::FMAXNUM_IEEE:
14823 case ISD::FMINNUM_IEEE:
14824 case ISD::FMAXIMUM:
14825 case ISD::FMINIMUM: {
14831 DCI.AddToWorklist(Elt0.
getNode());
14832 DCI.AddToWorklist(Elt1.
getNode());
14854 if (!DCI.isBeforeLegalize())
14862 VecSize > 32 && VecSize % 32 == 0 && Idx) {
14865 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
14866 unsigned EltIdx = BitIndex / 32;
14867 unsigned LeftoverBitIdx = BitIndex % 32;
14871 DCI.AddToWorklist(Cast.
getNode());
14875 DCI.AddToWorklist(Elt.
getNode());
14878 DCI.AddToWorklist(Srl.
getNode());
14882 DCI.AddToWorklist(Trunc.
getNode());
14884 if (VecEltVT == ResVT) {
14885 return DAG.
getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
14896SITargetLowering::performInsertVectorEltCombine(
SDNode *
N,
14897 DAGCombinerInfo &DCI)
const {
14908 SelectionDAG &DAG = DCI.DAG;
14927 if (Src.getOpcode() == ISD::FP_EXTEND &&
14928 Src.getOperand(0).getValueType() == MVT::f16) {
14929 return Src.getOperand(0);
14933 APFloat Val = CFP->getValueAPF();
14934 bool LosesInfo =
true;
14944 DAGCombinerInfo &DCI)
const {
14945 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
14946 "combine only useful on gfx8");
14948 SDValue TruncSrc =
N->getOperand(0);
14949 EVT VT =
N->getValueType(0);
14950 if (VT != MVT::f16)
14957 SelectionDAG &DAG = DCI.DAG;
14985 return DAG.
getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                          const SDNode *N1) const {
  if (((VT == MVT::f32 &&
       (VT == MVT::f16 && Subtarget->hasMadF16() &&
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
  unsigned Opc = N->getOpcode();
  if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
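// The combine below folds a 64-bit multiply (optionally feeding an add) whose
// operands are known to fit in 32 bits into MAD_64_32, splitting the
// accumulate into low/high halves when needed.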
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (!N->isDivergent() && Subtarget->hasSMulHi())
  if (NumBits <= 32 || NumBits > 64)
  if (!Subtarget->hasFullRate64Ops()) {
    unsigned NumUsers = 0;
    for (SDNode *User : LHS->users()) {
      if (!User->isAnyAdd())
  bool MulSignedLo = false;
  if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
  if (VT != MVT::i64) {
      getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
  if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
    auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
    if (!MulLHSUnsigned32) {
    if (!MulRHSUnsigned32) {
  if (VT != MVT::i64)
SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  unsigned Opcode = N->getOpcode();
  if (Opcode == ISD::PTRADD)
      DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15259static std::optional<ByteProvider<SDValue>>
15262 if (!Byte0 || Byte0->isConstantZero()) {
15263 return std::nullopt;
15266 if (Byte1 && !Byte1->isConstantZero()) {
15267 return std::nullopt;
15273 unsigned FirstCs =
First & 0x0c0c0c0c;
15274 unsigned SecondCs = Second & 0x0c0c0c0c;
15275 unsigned FirstNoCs =
First & ~0x0c0c0c0c;
15276 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15278 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15279 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15280 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15281 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15283 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15307 for (
int BPI = 0; BPI < 2; BPI++) {
15310 BPP = {Src1, Src0};
15312 unsigned ZeroMask = 0x0c0c0c0c;
15313 unsigned FMask = 0xFF << (8 * (3 - Step));
15315 unsigned FirstMask =
15316 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15317 unsigned SecondMask =
15318 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15322 int FirstGroup = -1;
15323 for (
int I = 0;
I < 2;
I++) {
15325 auto MatchesFirst = [&BPP](
DotSrc &IterElt) {
15326 return IterElt.SrcOp == *BPP.first.Src &&
15327 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15331 if (Match != Srcs.
end()) {
15332 Match->PermMask =
addPermMasks(FirstMask, Match->PermMask);
15337 if (FirstGroup != -1) {
15339 auto MatchesSecond = [&BPP](
DotSrc &IterElt) {
15340 return IterElt.SrcOp == *BPP.second.Src &&
15341 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15344 if (Match != Srcs.
end()) {
15345 Match->PermMask =
addPermMasks(SecondMask, Match->PermMask);
15347 Srcs.
push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15355 unsigned ZeroMask = 0x0c0c0c0c;
15356 unsigned FMask = 0xFF << (8 * (3 - Step));
15360 ((Src0.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15364 ((Src1.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15373 if (Srcs.
size() == 1) {
15374 auto *Elt = Srcs.
begin();
15378 if (Elt->PermMask == 0x3020100)
15385 auto *FirstElt = Srcs.
begin();
15386 auto *SecondElt = std::next(FirstElt);
15393 auto FirstMask = FirstElt->PermMask;
15394 auto SecondMask = SecondElt->PermMask;
15396 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15397 unsigned FirstPlusFour = FirstMask | 0x04040404;
15400 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15412 FirstElt = std::next(SecondElt);
15413 if (FirstElt == Srcs.
end())
15416 SecondElt = std::next(FirstElt);
15419 if (SecondElt == Srcs.
end()) {
15425 DAG.
getConstant(FirstElt->PermMask, SL, MVT::i32)));
15431 return Perms.
size() == 2
15437 for (
auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15438 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15439 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15440 EntryMask += ZeroMask;
15445 auto Opcode =
Op.getOpcode();
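// For the dot4 combine, every multiply in the chain must agree on signedness:
// known sign/zero bits decide between sdot4 and udot4, and a mixed or unknown
// chain gives up.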
15451static std::optional<bool>
15462 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15465 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15467 assert(!(S0IsUnsigned && S0IsSigned));
15468 assert(!(S1IsUnsigned && S1IsSigned));
15476 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15482 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15483 return std::nullopt;
15495 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15496 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15501 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15507 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15508 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15509 return std::nullopt;
                                           DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (Subtarget->hasMad64_32()) {
    if (SDValue Folded = tryFoldToMad64_32(N, DCI))
  if (SDValue V = reassociateScalarOps(N, DAG)) {
  if (VT == MVT::i64) {
    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
      (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
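    // Walk an add chain of up to four byte-wise multiplies; if every term
    // reads single bytes of at most two 32-bit sources, the whole chain is
    // rewritten as one amdgcn_sdot4 / amdgcn_udot4 call.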
15541 std::optional<bool> IsSigned;
15547 int ChainLength = 0;
15548 for (
int I = 0;
I < 4;
I++) {
15552 auto Src0 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15555 auto Src1 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15560 TempNode->getOperand(MulIdx), *Src0, *Src1,
15561 TempNode->getOperand(MulIdx)->getOperand(0),
15562 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15566 IsSigned = *IterIsSigned;
15567 if (*IterIsSigned != *IsSigned)
15570 auto AddIdx = 1 - MulIdx;
15573 if (
I == 2 &&
isMul(TempNode->getOperand(AddIdx))) {
15574 Src2s.
push_back(TempNode->getOperand(AddIdx));
15584 TempNode->getOperand(AddIdx), *Src0, *Src1,
15585 TempNode->getOperand(AddIdx)->getOperand(0),
15586 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15590 if (*IterIsSigned != *IsSigned)
15594 ChainLength =
I + 2;
15598 TempNode = TempNode->getOperand(AddIdx);
15600 ChainLength =
I + 1;
15601 if (TempNode->getNumOperands() < 2)
15603 LHS = TempNode->getOperand(0);
15604 RHS = TempNode->getOperand(1);
15607 if (ChainLength < 2)
15613 if (ChainLength < 4) {
15623 bool UseOriginalSrc =
false;
15624 if (ChainLength == 4 && Src0s.
size() == 1 && Src1s.
size() == 1 &&
15625 Src0s.
begin()->PermMask == Src1s.
begin()->PermMask &&
15626 Src0s.
begin()->SrcOp.getValueSizeInBits() >= 32 &&
15627 Src1s.
begin()->SrcOp.getValueSizeInBits() >= 32) {
15628 SmallVector<unsigned, 4> SrcBytes;
15629 auto Src0Mask = Src0s.
begin()->PermMask;
15630 SrcBytes.
push_back(Src0Mask & 0xFF000000);
15631 bool UniqueEntries =
true;
15632 for (
auto I = 1;
I < 4;
I++) {
15633 auto NextByte = Src0Mask & (0xFF << ((3 -
I) * 8));
15636 UniqueEntries =
false;
15642 if (UniqueEntries) {
15643 UseOriginalSrc =
true;
15645 auto *FirstElt = Src0s.
begin();
15649 auto *SecondElt = Src1s.
begin();
15651 SecondElt->DWordOffset);
15660 if (!UseOriginalSrc) {
15667 DAG.
getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
15670 : Intrinsic::amdgcn_udot4,
15680 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
15685 unsigned Opc =
LHS.getOpcode();
15697 auto Cond =
RHS.getOperand(0);
15702 SDVTList VTList = DAG.
getVTList(MVT::i32, MVT::i1);
15719 DAGCombinerInfo &DCI)
const {
15720 SelectionDAG &DAG = DCI.DAG;
15722 EVT VT =
N->getValueType(0);
15735 SDNodeFlags ShlFlags = N1->
getFlags();
15739 SDNodeFlags NewShlFlags =
15744 DCI.AddToWorklist(Inner.
getNode());
15751 if (Subtarget->hasMad64_32()) {
15752 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
15761 if (VT == MVT::i64) {
15762 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
15770 if (
const GlobalAddressSDNode *GA =
15775 SDNodeFlags
Flags =
15778 DCI.AddToWorklist(Inner.
getNode());
15806 SDNodeFlags ReassocFlags =
15809 if (ZIsConstant != YIsConstant) {
15813 DCI.AddToWorklist(Inner.
getNode());
15821 assert(!YIsConstant && !ZIsConstant);
15823 if (!
X->isDivergent() &&
Y->isDivergent() !=
Z->isDivergent()) {
15832 if (
Y->isDivergent())
15835 DCI.AddToWorklist(UniformInner.
getNode());
15843 DAGCombinerInfo &DCI)
const {
15844 SelectionDAG &DAG = DCI.DAG;
15845 EVT VT =
N->getValueType(0);
15847 if (VT == MVT::i64) {
15848 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
15852 if (VT != MVT::i32)
15861 unsigned Opc =
RHS.getOpcode();
15868 auto Cond =
RHS.getOperand(0);
15873 SDVTList VTList = DAG.
getVTList(MVT::i32, MVT::i1);
15891SITargetLowering::performAddCarrySubCarryCombine(
SDNode *
N,
15892 DAGCombinerInfo &DCI)
const {
15894 if (
N->getValueType(0) != MVT::i32)
15900 SelectionDAG &DAG = DCI.DAG;
15905 unsigned LHSOpc =
LHS.getOpcode();
15906 unsigned Opc =
N->getOpcode();
15910 return DAG.
getNode(
Opc, SDLoc(
N),
N->getVTList(), Args);
15916 DAGCombinerInfo &DCI)
const {
15920 SelectionDAG &DAG = DCI.DAG;
15921 EVT VT =
N->getValueType(0);
15933 if (
A ==
LHS.getOperand(1)) {
15934 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
15935 if (FusedOp != 0) {
15937 return DAG.
getNode(FusedOp, SL, VT,
A, Two,
RHS);
15945 if (
A ==
RHS.getOperand(1)) {
15946 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
15947 if (FusedOp != 0) {
15949 return DAG.
getNode(FusedOp, SL, VT,
A, Two,
LHS);
15958 DAGCombinerInfo &DCI)
const {
15962 SelectionDAG &DAG = DCI.DAG;
15964 EVT VT =
N->getValueType(0);
15977 if (
A ==
LHS.getOperand(1)) {
15978 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
15979 if (FusedOp != 0) {
15983 return DAG.
getNode(FusedOp, SL, VT,
A, Two, NegRHS);
15992 if (
A ==
RHS.getOperand(1)) {
15993 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
15994 if (FusedOp != 0) {
15996 return DAG.
getNode(FusedOp, SL, VT,
A, NegTwo,
LHS);
16005 DAGCombinerInfo &DCI)
const {
16006 SelectionDAG &DAG = DCI.DAG;
16008 EVT VT =
N->getValueType(0);
16009 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
16015 SDNodeFlags
Flags =
N->getFlags();
16016 SDNodeFlags RHSFlags =
RHS->getFlags();
16022 bool IsNegative =
false;
16023 if (CLHS->isExactlyValue(1.0) ||
16024 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16027 if (
RHS.getOpcode() == ISD::FSQRT) {
16031 return IsNegative ? DAG.
getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16040 DAGCombinerInfo &DCI)
const {
16041 SelectionDAG &DAG = DCI.DAG;
16042 EVT VT =
N->getValueType(0);
16046 if (!
N->isDivergent() &&
getSubtarget()->hasSALUFloatInsts() &&
16047 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16062 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16067 const ConstantFPSDNode *FalseNode =
16077 if (ScalarVT == MVT::f32 &&
16083 if (TrueNodeExpVal == INT_MIN)
16086 if (FalseNodeExpVal == INT_MIN)
16099 return DAG.
getNode(ISD::FLDEXP, SL, VT,
LHS, SelectNode,
N->getFlags());
16106 DAGCombinerInfo &DCI)
const {
16107 SelectionDAG &DAG = DCI.DAG;
16108 EVT VT =
N->getValueType(0);
16111 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16129 (
N->getFlags().hasAllowContract() &&
16130 FMA->getFlags().hasAllowContract())) {
16145 if (FMAOp1.
getOpcode() != ISD::FP_EXTEND ||
16164 if (Vec1 == Vec2 || Vec3 == Vec4)
16170 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16179 DAGCombinerInfo &DCI)
const {
16180 SelectionDAG &DAG = DCI.DAG;
16185 EVT VT =
LHS.getValueType();
16214 return LHS.getOperand(0);
16222 LHS.getConstantOperandVal(1) !=
LHS.getConstantOperandVal(2) &&
16229 const APInt &CT =
LHS.getConstantOperandAPInt(1);
16230 const APInt &CF =
LHS.getConstantOperandAPInt(2);
16238 return LHS.getOperand(0);
16242 if (VT != MVT::f32 && VT != MVT::f64 &&
16243 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16251 LHS.getOpcode() == ISD::FABS) {
16258 const unsigned IsInfMask =
16260 const unsigned IsFiniteMask =
16274SITargetLowering::performCvtF32UByteNCombine(
SDNode *
N,
16275 DAGCombinerInfo &DCI)
const {
16276 SelectionDAG &DAG = DCI.DAG;
16297 unsigned ShiftOffset = 8 *
Offset;
16299 ShiftOffset -=
C->getZExtValue();
16301 ShiftOffset +=
C->getZExtValue();
16303 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16305 MVT::f32, Shifted);
16316 DCI.AddToWorklist(
N);
16323 return DAG.
getNode(
N->getOpcode(), SL, MVT::f32, DemandedSrc);
16329 DAGCombinerInfo &DCI)
const {
16334 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16338 (
F.isNaN() && MF.
getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16339 return DCI.DAG.getConstantFP(Zero, SDLoc(
N),
N->getValueType(0));
16342 APFloat One(
F.getSemantics(),
"1.0");
16344 return DCI.DAG.getConstantFP(One, SDLoc(
N),
N->getValueType(0));
16350 DAGCombinerInfo &DCI)
const {
16371 bool isFloatingPoint =
LHS.getValueType().isFloatingPoint();
16372 bool isInteger =
LHS.getValueType().isInteger();
16375 if (!isFloatingPoint && !isInteger)
16380 if (!isEquality && !isNonEquality)
16397 if (isFloatingPoint) {
16399 if (!Val.
isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16410 if (!(isEquality && TrueVal == ConstVal) &&
16411 !(isNonEquality && FalseVal == ConstVal))
16418 SelectLHS, SelectRHS);
16423 switch (
N->getOpcode()) {
16439 if (
auto Res = promoteUniformOpToI32(
SDValue(
N, 0), DCI))
16449 switch (
N->getOpcode()) {
16451 return performAddCombine(
N, DCI);
16453 return performPtrAddCombine(
N, DCI);
16455 return performSubCombine(
N, DCI);
16458 return performAddCarrySubCarryCombine(
N, DCI);
16460 return performFAddCombine(
N, DCI);
16462 return performFSubCombine(
N, DCI);
16464 return performFDivCombine(
N, DCI);
16466 return performFMulCombine(
N, DCI);
16468 return performSetCCCombine(
N, DCI);
16470 if (
auto Res = performSelectCombine(
N, DCI))
16475 case ISD::FMAXNUM_IEEE:
16476 case ISD::FMINNUM_IEEE:
16477 case ISD::FMAXIMUM:
16478 case ISD::FMINIMUM:
16479 case ISD::FMAXIMUMNUM:
16480 case ISD::FMINIMUMNUM:
16487 return performMinMaxCombine(
N, DCI);
16489 return performFMACombine(
N, DCI);
16491 return performAndCombine(
N, DCI);
16493 return performOrCombine(
N, DCI);
16496 if (
N->getValueType(0) == MVT::i32 &&
N->isDivergent() &&
16497 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16503 return performXorCombine(
N, DCI);
16505 return performZeroExtendCombine(
N, DCI);
16507 return performSignExtendInRegCombine(
N, DCI);
16509 return performClassCombine(
N, DCI);
16511 return performFCanonicalizeCombine(
N, DCI);
16513 return performRcpCombine(
N, DCI);
16528 return performUCharToFloatCombine(
N, DCI);
16530 return performFCopySignCombine(
N, DCI);
16535 return performCvtF32UByteNCombine(
N, DCI);
16537 return performFMed3Combine(
N, DCI);
16539 return performCvtPkRTZCombine(
N, DCI);
16541 return performClampCombine(
N, DCI);
16544 EVT VT =
N->getValueType(0);
16547 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16550 EVT EltVT = Src.getValueType();
16551 if (EltVT != MVT::i16)
16552 Src = DAG.
getNode(ISD::BITCAST, SL, MVT::i16, Src);
16555 return DAG.
getNode(ISD::BITCAST, SL, VT, Ext);
16561 return performExtractVectorEltCombine(
N, DCI);
16563 return performInsertVectorEltCombine(
N, DCI);
16565 return performFPRoundCombine(
N, DCI);
16574 return performMemSDNodeCombine(MemNode, DCI);
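// adjustWritemask: shrink an image instruction's dmask so it covers only the
// result channels that are actually extracted, then remap the users'
// EXTRACT_SUBREG indices to the compacted layout.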
16605 unsigned Opcode =
Node->getMachineOpcode();
16608 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16609 if (D16Idx >= 0 &&
Node->getConstantOperandVal(D16Idx))
16612 SDNode *
Users[5] = {
nullptr};
16614 unsigned DmaskIdx =
16615 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16616 unsigned OldDmask =
Node->getConstantOperandVal(DmaskIdx);
16617 unsigned NewDmask = 0;
16618 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16619 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16620 bool UsesTFC = (int(TFEIdx) >= 0 &&
Node->getConstantOperandVal(TFEIdx)) ||
16621 (
int(LWEIdx) >= 0 &&
Node->getConstantOperandVal(LWEIdx));
16622 unsigned TFCLane = 0;
16623 bool HasChain =
Node->getNumValues() > 1;
16625 if (OldDmask == 0) {
16633 TFCLane = OldBitsSet;
16637 for (SDUse &Use :
Node->uses()) {
16640 if (
Use.getResNo() != 0)
16643 SDNode *
User =
Use.getUser();
16646 if (!
User->isMachineOpcode() ||
16647 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
16659 if (UsesTFC && Lane == TFCLane) {
16664 for (
unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
16666 Dmask &= ~(1 << Comp);
16674 NewDmask |= 1 << Comp;
16679 bool NoChannels = !NewDmask;
16686 if (OldBitsSet == 1)
16692 if (NewDmask == OldDmask)
16701 unsigned NewChannels = BitsSet + UsesTFC;
16705 assert(NewOpcode != -1 &&
16706 NewOpcode !=
static_cast<int>(
Node->getMachineOpcode()) &&
16707 "failed to find equivalent MIMG op");
16715 MVT SVT =
Node->getValueType(0).getVectorElementType().getSimpleVT();
16717 MVT ResultVT = NewChannels == 1
16720 : NewChannels == 5 ? 8
16722 SDVTList NewVTList =
16725 MachineSDNode *NewNode =
16734 if (NewChannels == 1) {
16744 for (
unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
16749 if (i || !NoChannels)
16754 if (NewUser != User) {
16764 Idx = AMDGPU::sub1;
16767 Idx = AMDGPU::sub2;
16770 Idx = AMDGPU::sub3;
16773 Idx = AMDGPU::sub4;
16784 Op =
Op.getOperand(0);
16805 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
16809 Node->getOperand(0), SL, VReg, SrcVal,
16815 return ToResultReg.
getNode();
16820 for (
unsigned i = 0; i <
Node->getNumOperands(); ++i) {
16822 Ops.push_back(
Node->getOperand(i));
16828 Node->getOperand(i).getValueType(),
16829 Node->getOperand(i)),
16841 unsigned Opcode =
Node->getMachineOpcode();
16843 if (
TII->isImage(Opcode) && !
TII->get(Opcode).mayStore() &&
16844 !
TII->isGather4(Opcode) &&
16846 return adjustWritemask(
Node, DAG);
16849 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
16855 case AMDGPU::V_DIV_SCALE_F32_e64:
16856 case AMDGPU::V_DIV_SCALE_F64_e64: {
16866 (Src0 == Src1 || Src0 == Src2))
16922 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::vdata);
16923 unsigned InitIdx = 0;
16925 if (
TII->isImage(
MI)) {
16933 unsigned TFEVal = TFE ? TFE->
getImm() : 0;
16934 unsigned LWEVal = LWE ? LWE->
getImm() : 0;
16935 unsigned D16Val = D16 ? D16->getImm() : 0;
16937 if (!TFEVal && !LWEVal)
16948 assert(MO_Dmask &&
"Expected dmask operand in instruction");
16950 unsigned dmask = MO_Dmask->
getImm();
16955 bool Packed = !Subtarget->hasUnpackedD16VMem();
16957 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
16963 TRI.getRegSizeInBits(*
TII->getOpRegClass(
MI, DstIdx)) / 32;
16964 if (DstSize < InitIdx)
16967 InitIdx =
TRI.getRegSizeInBits(*
TII->getOpRegClass(
MI, DstIdx)) / 32;
16975 Register PrevDst =
MRI.cloneVirtualRegister(
MI.getOperand(DstIdx).getReg());
16976 unsigned NewDst = 0;
16981 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
16982 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
16985 for (; SizeLeft; SizeLeft--, CurrIdx++) {
16986 NewDst =
MRI.createVirtualRegister(
TII->getOpRegClass(
MI, DstIdx));
17006 MI.tieOperands(DstIdx,
MI.getNumOperands() - 1);
17019 if (TII->isVOP3(MI.getOpcode())) {
17021 TII->legalizeOperandsVOP3(MRI, MI);
17026 if (!MI.getDesc().operands().empty()) {
17027 unsigned Opc = MI.getOpcode();
17028 bool HasAGPRs = Info->mayNeedAGPRs();
17030 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
17032 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
17033 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
17036 if ((I == Src2Idx) && (HasAGPRs))
17039 if (!Op.isReg() || !Op.getReg().isVirtual())
17041 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
17042 if (!TRI->hasAGPRs(RC))
17044 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
17045 if (!Src || !Src->isCopy() ||
17046 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
17048 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
17052 MRI.setRegClass(Op.getReg(), NewRC);
17055 if (TII->isMAI(MI)) {
17060 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17061 AMDGPU::OpName::scale_src0);
17062 if (Src0Idx != -1) {
17063 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17064 AMDGPU::OpName::scale_src1);
17065 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17066 TII->usesConstantBus(MRI, MI, Src1Idx))
17067 TII->legalizeOpWithMove(MI, Src1Idx);
17075 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
17076 if (Src2->isReg() && Src2->getReg().isVirtual()) {
17077 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
17078 if (TRI->isVectorSuperClass(RC)) {
17079 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
17080 MRI.setRegClass(Src2->getReg(), NewRC);
17081 if (Src2->isTied())
17082 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
17091 if (TII->isImage(MI))
17092 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
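// Note: the post-selection fixups above appear to legalize VOP3 operands, rewrite SGPR-fed source
// operands of MAI (MFMA) instructions into VGPR/AGPR register classes, legalize constant-bus usage
// of the scale_src operands, and enforce vaddr register-class alignment on image instructions.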
17166std::pair<unsigned, const TargetRegisterClass *>
17173 if (Constraint.size() == 1) {
17177 if (VT == MVT::Other)
17180 switch (Constraint[0]) {
17187 RC = &AMDGPU::SReg_32RegClass;
17190 RC = &AMDGPU::SGPR_64RegClass;
17195 return std::pair(0U, nullptr);
17202 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17203 : &AMDGPU::VGPR_32_Lo256RegClass;
17206 RC = Subtarget->has1024AddressableVGPRs()
17207 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17210 return std::pair(0U, nullptr);
17215 if (!Subtarget->hasMAIInsts())
17219 RC = &AMDGPU::AGPR_32RegClass;
17224 return std::pair(0U, nullptr);
17229 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17233 RC = &AMDGPU::AV_32RegClass;
17236 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17238 return std::pair(0U, nullptr);
17247 return std::pair(0U, RC);
17250 if (Kind != '\0') {
17252 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17253 } else if (Kind == 's') {
17254 RC = &AMDGPU::SGPR_32RegClass;
17255 } else if (Kind == 'a') {
17256 RC = &AMDGPU::AGPR_32RegClass;
17262 return std::pair(0U, nullptr);
17268 return std::pair(0U, nullptr);
17272 RC = TRI->getVGPRClassForBitWidth(Width);
17274 RC = TRI->getSGPRClassForBitWidth(Width);
17276 RC = TRI->getAGPRClassForBitWidth(Width);
17278 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17283 return std::pair(0U, nullptr);
17285 return std::pair(Reg, RC);
17291 return std::pair(0U, nullptr);
17292 if (Idx < RC->getNumRegs())
17294 return std::pair(0U, nullptr);
17300 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
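// Note: the inline-asm constraint lowering above appears to map the single-letter AMDGPU
// constraints ('s', 'v', 'a'), the "VA" superclass constraint, and explicit register names such as
// "{v1}" to SGPR/VGPR/AGPR register classes of the requested width. For illustration only (not
// taken from this file), clang-style inline asm such as
//   __asm__("v_mov_b32 %0, %1" : "=v"(dst) : "s"(src));
// is expected to resolve "=v" to a 32-bit VGPR class and "s" to a 32-bit SGPR class via this hook.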
17306 if (Constraint.size() == 1) {
17307 switch (Constraint[0]) {
17317 } else if (Constraint == "DA" || Constraint == "DB") {
17325 if (Constraint.size() == 1) {
17326 switch (Constraint[0]) {
17334 } else if (Constraint.size() == 2) {
17335 if (Constraint == "VA")
17353 std::vector<SDValue> &Ops,
17368 unsigned Size = Op.getScalarValueSizeInBits();
17372 if (Size == 16 && !Subtarget->has16BitInsts())
17376 Val = C->getSExtValue();
17380 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17384 if (Size != 16 || Op.getNumOperands() != 2)
17386 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17389 Val = C->getSExtValue();
17393 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17403 if (Constraint.size() == 1) {
17404 switch (Constraint[0]) {
17419 } else if (Constraint.size() == 2) {
17420 if (Constraint == "DA") {
17421 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17422 int64_t LoBits = static_cast<int32_t>(Val);
17426 if (Constraint == "DB") {
17434 unsigned MaxSize) const {
17435 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17436 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17438 MVT VT = Op.getSimpleValueType();
17463 switch (UnalignedClassID) {
17464 case AMDGPU::VReg_64RegClassID:
17465 return AMDGPU::VReg_64_Align2RegClassID;
17466 case AMDGPU::VReg_96RegClassID:
17467 return AMDGPU::VReg_96_Align2RegClassID;
17468 case AMDGPU::VReg_128RegClassID:
17469 return AMDGPU::VReg_128_Align2RegClassID;
17470 case AMDGPU::VReg_160RegClassID:
17471 return AMDGPU::VReg_160_Align2RegClassID;
17472 case AMDGPU::VReg_192RegClassID:
17473 return AMDGPU::VReg_192_Align2RegClassID;
17474 case AMDGPU::VReg_224RegClassID:
17475 return AMDGPU::VReg_224_Align2RegClassID;
17476 case AMDGPU::VReg_256RegClassID:
17477 return AMDGPU::VReg_256_Align2RegClassID;
17478 case AMDGPU::VReg_288RegClassID:
17479 return AMDGPU::VReg_288_Align2RegClassID;
17480 case AMDGPU::VReg_320RegClassID:
17481 return AMDGPU::VReg_320_Align2RegClassID;
17482 case AMDGPU::VReg_352RegClassID:
17483 return AMDGPU::VReg_352_Align2RegClassID;
17484 case AMDGPU::VReg_384RegClassID:
17485 return AMDGPU::VReg_384_Align2RegClassID;
17486 case AMDGPU::VReg_512RegClassID:
17487 return AMDGPU::VReg_512_Align2RegClassID;
17488 case AMDGPU::VReg_1024RegClassID:
17489 return AMDGPU::VReg_1024_Align2RegClassID;
17490 case AMDGPU::AReg_64RegClassID:
17491 return AMDGPU::AReg_64_Align2RegClassID;
17492 case AMDGPU::AReg_96RegClassID:
17493 return AMDGPU::AReg_96_Align2RegClassID;
17494 case AMDGPU::AReg_128RegClassID:
17495 return AMDGPU::AReg_128_Align2RegClassID;
17496 case AMDGPU::AReg_160RegClassID:
17497 return AMDGPU::AReg_160_Align2RegClassID;
17498 case AMDGPU::AReg_192RegClassID:
17499 return AMDGPU::AReg_192_Align2RegClassID;
17500 case AMDGPU::AReg_256RegClassID:
17501 return AMDGPU::AReg_256_Align2RegClassID;
17502 case AMDGPU::AReg_512RegClassID:
17503 return AMDGPU::AReg_512_Align2RegClassID;
17504 case AMDGPU::AReg_1024RegClassID:
17505 return AMDGPU::AReg_1024_Align2RegClassID;
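// Note: this switch appears to map each unaligned VGPR/AGPR tuple register class to its
// even-aligned (_Align2) counterpart; the loop at 17563-17571 below applies that mapping to
// virtual registers when ST.needsAlignedVGPRs() is set.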
17521 if (Info->isEntryFunction()) {
17528 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17530 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17531 : TRI->getAlignedHighSGPRForRC(MF, 2,
17532 &AMDGPU::SGPR_64RegClass);
17533 Info->setSGPRForEXECCopy(SReg);
17535 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17536 Info->getStackPtrOffsetReg()));
17537 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17538 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17542 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17543 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17545 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17546 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17548 Info->limitOccupancy(MF);
17550 if (ST.isWave32() && !MF.empty()) {
17551 for (auto &MBB : MF) {
17552 for (auto &MI : MBB) {
17553 TII->fixImplicitOperands(MI);
17563 if (ST.needsAlignedVGPRs()) {
17564 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17570 if (NewClassID != -1)
17571 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17580 const APInt &DemandedElts,
17582 unsigned Depth) const {
17584 unsigned Opc = Op.getOpcode();
17587 unsigned IID = Op.getConstantOperandVal(0);
17589 case Intrinsic::amdgcn_mbcnt_lo:
17590 case Intrinsic::amdgcn_mbcnt_hi: {
17596 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17606 Op, Known, DemandedElts, DAG, Depth);
17622 unsigned MaxValue =
17629 unsigned BFEWidth, bool SExt, unsigned Depth) {
17633 unsigned Src1Cst = 0;
17634 if (Src1.isImm()) {
17635 Src1Cst = Src1.getImm();
17636 } else if (Src1.isReg()) {
17640 Src1Cst = Cst->Value.getZExtValue();
17651 if (Width >= BFEWidth)
17660 Known = Known.sext(BFEWidth);
17662 Known = Known.zext(BFEWidth);
17668 unsigned Depth) const {
17671 switch (MI->getOpcode()) {
17672 case AMDGPU::S_BFE_I32:
17675 case AMDGPU::S_BFE_U32:
17678 case AMDGPU::S_BFE_I64:
17681 case AMDGPU::S_BFE_U64:
17684 case AMDGPU::G_INTRINSIC:
17685 case AMDGPU::G_INTRINSIC_CONVERGENT: {
17688 case Intrinsic::amdgcn_workitem_id_x:
17691 case Intrinsic::amdgcn_workitem_id_y:
17694 case Intrinsic::amdgcn_workitem_id_z:
17697 case Intrinsic::amdgcn_mbcnt_lo:
17698 case Intrinsic::amdgcn_mbcnt_hi: {
17710 case Intrinsic::amdgcn_groupstaticsize: {
17721 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
17724 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
17727 case AMDGPU::G_AMDGPU_SMED3:
17728 case AMDGPU::G_AMDGPU_UMED3: {
17729 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
17756 unsigned Depth) const {
17763 AttributeList Attrs =
17765 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
17792 if (Header->getAlignment() != PrefAlign)
17793 return Header->getAlignment();
17795 unsigned LoopSize = 0;
17800 LoopSize += MBB->getAlignment().value() / 2;
17803 LoopSize += TII->getInstSizeInBytes(MI);
17804 if (LoopSize > 192)
17809 if (LoopSize <= 64)
17812 if (LoopSize <= 128)
17813 return CacheLineAlign;
17819 auto I = Exit->getFirstNonDebugInstr();
17820 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
17821 return CacheLineAlign;
17830 if (PreTerm == Pre->begin() ||
17831 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
17835 auto ExitHead = Exit->getFirstNonDebugInstr();
17836 if (ExitHead == Exit->end() ||
17837 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
17842 return CacheLineAlign;
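// Note: the preferred-loop-alignment logic above appears to estimate the loop body size from
// instruction sizes, give up on loops larger than 192 bytes, return a larger alignment for small
// loops (<= 64 or <= 128 bytes), and return the cache-line alignment when S_INST_PREFETCH
// instructions already bracket the loop.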
17850 N = N->getOperand(0).getNode();
17851 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
17860 switch (N->getOpcode()) {
17868 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
17869 return !TRI->isSGPRReg(MRI, Reg);
17875 return !TRI->isSGPRReg(MRI, Reg);
17879 unsigned AS = L->getAddressSpace();
17883 case ISD::CALLSEQ_END:
17912 return A->readMem() && A->writeMem();
17933 switch (Ty.getScalarSizeInBits()) {
17945 const APInt &DemandedElts,
17948 unsigned Depth) const {
17953 if (Info->getMode().DX10Clamp)
17965 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
17985 << "Hardware instruction generated for atomic "
17987 << " operation at memory scope " << MemScope;
17992 Type *EltTy = VT->getElementType();
17993 return VT->getNumElements() == 2 &&
18013 unsigned BW = IT->getBitWidth();
18014 return BW == 32 || BW == 64;
18028 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18029 return BW == 32 || BW == 64;
18032 if (Ty->isFloatTy() || Ty->isDoubleTy())
18036 return VT->getNumElements() == 2 &&
18037 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18047 bool HasSystemScope) {
18054 if (HasSystemScope) {
18063 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18076 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18102 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18115 bool HasSystemScope =
18141 if (Subtarget->hasEmulatedSystemScopeAtomics())
18157 if (!HasSystemScope &&
18158 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18170 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18178 ConstVal && ConstVal->isNullValue())
18216 if (Ty->isFloatTy()) {
18221 if (Ty->isDoubleTy()) {
18242 if (Ty->isFloatTy() &&
18243 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18256 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18260 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18264 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18269 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18274 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18278 if (Ty->isFloatTy()) {
18281 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18284 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18289 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18297 if (Subtarget->hasFlatAtomicFaddF32Inst())
18306 if (Subtarget->hasLDSFPAtomicAddF32()) {
18307 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18309 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18337 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18339 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18343 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18345 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
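// Note: the checks above appear to decide, per subtarget feature and element type (f32, f64,
// v2f16, v2bf16), whether an FP atomicrmw (fadd/fmin/fmax) may use a native LDS/global/flat/buffer
// instruction or must be expanded to a CAS loop, additionally honoring the
// "amdgpu.no.fine.grained.memory" metadata and denormal-mode constraints.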
18398 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18399 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18400 : &AMDGPU::SReg_32RegClass;
18401 if (!TRI->isSGPRClass(RC) && !isDivergent)
18402 return TRI->getEquivalentSGPRClass(RC);
18403 if (TRI->isSGPRClass(RC) && isDivergent)
18404 return TRI->getEquivalentVGPRClass(RC);
18416 unsigned WaveSize) {
18421 if (!IT || IT->getBitWidth() != WaveSize)
18426 if (!Visited.insert(V).second)
18428 bool Result = false;
18429 for (const auto *U : V->users()) {
18431 if (V == U->getOperand(1)) {
18436 case Intrinsic::amdgcn_if_break:
18437 case Intrinsic::amdgcn_if:
18438 case Intrinsic::amdgcn_else:
18443 if (V == U->getOperand(0)) {
18448 case Intrinsic::amdgcn_end_cf:
18449 case Intrinsic::amdgcn_loop:
18455 Result = hasCFUser(U, Visited, WaveSize);
18464 const Value *V) const {
18466 if (CI->isInlineAsm()) {
18475 for (auto &TC : TargetConstraints) {
18489 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18517 return MRI.hasOneNonDBGUse(N0);
18524 if (I.getMetadata("amdgpu.noclobber"))
18526 if (I.getMetadata("amdgpu.last.use"))
18536 if (!Def->isMachineOpcode())
18546 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18547 PhysReg = AMDGPU::SCC;
18549 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18615 Alignment = RMW->getAlign();
18628 bool FullFlatEmulation =
18630 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18631 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
18632 RMW->getType()->isDoubleTy()));
18635 bool ReturnValueIsUsed = !AI->use_empty();
18644 if (FullFlatEmulation) {
18655 std::prev(BB->end())->eraseFromParent();
18656 Builder.SetInsertPoint(BB);
18658 Value *LoadedShared = nullptr;
18659 if (FullFlatEmulation) {
18660 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
18661 {Addr}, nullptr, "is.shared");
18662 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
18663 Builder.SetInsertPoint(SharedBB);
18664 Value *CastToLocal = Builder.CreateAddrSpaceCast(
18670 LoadedShared = Clone;
18672 Builder.CreateBr(PhiBB);
18673 Builder.SetInsertPoint(CheckPrivateBB);
18676 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
18677 {Addr}, nullptr, "is.private");
18678 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
18680 Builder.SetInsertPoint(PrivateBB);
18682 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
18685 Value *LoadedPrivate;
18687 LoadedPrivate = Builder.CreateAlignedLoad(
18688 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
18691 LoadedPrivate, RMW->getValOperand());
18693 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
18695 auto [ResultLoad, Equal] =
18701 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
18704 Builder.CreateBr(PhiBB);
18706 Builder.SetInsertPoint(GlobalBB);
18710 if (FullFlatEmulation) {
18711 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
18720 if (!FullFlatEmulation) {
18725 MDNode *RangeNotPrivate =
18728 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
18732 Builder.CreateBr(PhiBB);
18734 Builder.SetInsertPoint(PhiBB);
18736 if (ReturnValueIsUsed) {
18739 if (FullFlatEmulation)
18746 Builder.CreateBr(ExitBB);
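// Note: the expansion above appears to lower a flat FP atomic that might alias private or LDS
// memory into an address-space dispatch: amdgcn.is.shared selects a cloned LDS atomic,
// amdgcn.is.private selects a plain load/modify/store on the scratch address, and the remaining
// path issues the global atomic (tagged with !noalias.addrspace once private is ruled out); a phi
// merges the value when the result is used.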
18750 unsigned PtrOpIdx) {
18751 Value *PtrOp = I->getOperand(PtrOpIdx);
18758 I->setOperand(PtrOpIdx, ASCast);
18770 ConstVal && ConstVal->isNullValue()) {
18800 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
18808 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
18823 LoadInst *LI = Builder.CreateAlignedLoad(
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
iv Induction Variable Users
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static cl::opt< bool > UseSelectionDAGPTRADD("amdgpu-use-sdag-ptradd", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " "SelectionDAG ISel"), cl::init(false))
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
const Function * getParent() const
Return the enclosing method, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
A parsed version of the target data layout string in and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
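Typical queries on a memory node when deciding how to lower it; M is assumed to be a MemSDNode*:
EVT MemVT = M->getMemoryVT();       // in-memory type
unsigned AS = M->getAddressSpace(); // address space of the pointer
const MachinePointerInfo &PtrInfo = M->getPointerInfo();
SDValue Chain = M->getChain();      // incoming chain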
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
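A small sketch of the virtual/physical distinction (the concrete registers are illustrative):
Register V = Register::index2VirtReg(0); // first virtual register
assert(V.isVirtual() && !V.isPhysical());
Register P = AMDGPU::SGPR0;              // a physical register
assert(P.isPhysical());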
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
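A sketch of the usual SDValue/SDNode inspection pattern; Val is assumed to be an SDValue:
if (Val.getOpcode() == ISD::ADD && Val.hasOneUse()) {
  SDNode *N = Val.getNode();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = Val.getValueType(); // result type of the add
}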
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
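A sketch of classifying a virtual register with these static helpers (MRI and VReg assumed in scope):
const TargetRegisterClass *RC = MRI.getRegClass(VReg);
bool IsScalar = SIRegisterInfo::isSGPRClass(RC);
bool IsVector = SIRegisterInfo::isVGPRClass(RC) || SIRegisterInfo::isAGPRClass(RC);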
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined with others to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a zero-terminated array of rounding control registers that can be attached to a strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store expansion in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load expansion in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known never to be any NaN; if SNaN is true, returns whether Op is known never to be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
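A sketch combining getSetCC and getSelect to build an unsigned minimum (DAG, DL, LHS and RHS assumed in scope; a real target would take the compare result type from getSetCCResultType rather than hard-coding MVT::i1):
EVT VT = LHS.getValueType();
SDValue Cond = DAG.getSetCC(DL, MVT::i1, LHS, RHS, ISD::SETULT);
SDValue Min  = DAG.getSelect(DL, VT, Cond, LHS, RHS);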
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
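A sketch of a matching load/store pair built with the helpers above (DAG, DL, Chain, Ptr and Val assumed in scope; MachinePointerInfo() stands in for real pointer information):
SDValue Ld = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo(), Align(4));
SDValue St = DAG.getStore(Ld.getValue(1), DL, Val, Ptr, MachinePointerInfo(), Align(4));
// Ld.getValue(0) is the loaded value, Ld.getValue(1) the output chain.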
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
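A sketch of pulling the low half out of a 64-bit value via a target subregister index (DAG, DL and a 64-bit Vec assumed in scope; AMDGPU::sub0 is the usual low-half index):
SDValue Lo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Vec);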
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
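A sketch of the value-tracking queries used throughout the combines (DAG and Op assumed in scope):
KnownBits Known = DAG.computeKnownBits(Op);
bool LowBitZero = Known.Zero[0];                // bit 0 proven zero
unsigned SignBits = DAG.ComputeNumSignBits(Op); // replicated sign bits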
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
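A sketch of the common pattern of collecting operands into a SmallVector before building a new node (N an SDNode*, Chain an SDValue, both assumed in scope):
SmallVector<SDValue, 8> Ops;
Ops.push_back(Chain);                   // new chain operand first
Ops.append(N->op_begin(), N->op_end()); // then the original operands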
An instruction for storing to memory.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
StringRef - Represent a constant reference to a string, i.e.
constexpr size_t size() const
size - Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
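A sketch of the usual StringSwitch idiom for constraint strings (Constraint is an assumed StringRef; the mapped values are made up for illustration):
unsigned Kind = StringSwitch<unsigned>(Constraint)
                    .Case("s", 0)
                    .Case("v", 1)
                    .Case("a", 2)
                    .Default(~0u);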
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a given type is legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparisons with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isVoidTy() const
Return true if this is 'void'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
constexpr bool isZero() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ TC_RETURN_GFX_WholeWave
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ SSUBO
Same for subtraction.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ SMULO
Same for multiplication.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
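A sketch of IR-level pattern matching with m_Shl and m_Value (I is an assumed llvm::Instruction* or Value*):
using namespace llvm::PatternMatch;
Value *Base = nullptr, *Amt = nullptr;
if (match(I, m_Shl(m_Value(Base), m_Value(Amt)))) {
  // I computes Base << Amt
}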
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
@ System
Synchronized with respect to all concurrently executing threads.
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
NodeAddr< NodeBase * > Node
friend class Instruction
Iterator for Instructions in a BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int popcount(T Value) noexcept
Count the number of set bits in a value.
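A minimal illustration, assuming the declaration in llvm/ADT/bit.h (the wrapper function is made up):
#include "llvm/ADT/bit.h"
#include <cassert>
static void popcountExample() { // illustration only
  assert(llvm::popcount(0b101101u) == 4); // four bits are set
}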
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
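For concreteness, a small sketch assuming llvm/Support/MathExtras.h:
#include "llvm/Support/MathExtras.h"
static_assert(llvm::isInt<16>(32767) && !llvm::isInt<16>(32768),
              "32767 fits in a signed 16-bit integer, 32768 does not");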
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
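A one-line illustration (assuming llvm/Support/MathExtras.h):
#include "llvm/Support/MathExtras.h"
static_assert(llvm::minIntN(8) == -128, "the smallest 8-bit signed value");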
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
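A small sketch of the rounding behaviour, with and without a skew (assuming llvm/Support/MathExtras.h):
#include "llvm/Support/MathExtras.h"
static_assert(llvm::alignDown(20u, 8u) == 16u, "largest multiple of 8 that is <= 20");
static_assert(llvm::alignDown(20u, 8u, 5u) == 13u, "largest value <= 20 that is 5 mod 8");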
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
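A minimal sketch (assuming llvm/Support/MathExtras.h; the wrapper name is made up):
#include "llvm/Support/MathExtras.h"
#include <cassert>
static void powerOf2CeilExample() { // illustration only
  assert(llvm::PowerOf2Ceil(17) == 32); // rounded up to the next power of two
  assert(llvm::PowerOf2Ceil(16) == 16); // a power of two is returned unchanged
}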
int countr_zero(T Val)
Count the number of 0s from the least significant bit upward, stopping at the first 1.
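For example (assuming llvm/ADT/bit.h; the wrapper name is made up):
#include "llvm/ADT/bit.h"
#include <cassert>
static void countrZeroExample() { // illustration only
  assert(llvm::countr_zero(0b101000u) == 3); // three trailing zeros before the first set bit
}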
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
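A short sketch of the floor behaviour (assuming llvm/Support/MathExtras.h; the wrapper name is made up):
#include "llvm/Support/MathExtras.h"
#include <cassert>
static void log2Example() { // illustration only
  assert(llvm::Log2_32(32) == 5); // exact power of two
  assert(llvm::Log2_32(40) == 5); // floor of log2 for values in between
}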
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count the number of 0s from the most significant bit downward, stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
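Together with Hi_32 above, this splits a 64-bit constant into its two halves; a minimal illustration (assuming llvm/Support/MathExtras.h):
#include "llvm/Support/MathExtras.h"
static_assert(llvm::Hi_32(0x1122334455667788ULL) == 0x11223344u, "upper half");
static_assert(llvm::Lo_32(0x1122334455667788ULL) == 0x55667788u, "lower half");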
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
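A one-line sketch of the rounding direction (assuming llvm/Support/MathExtras.h):
#include "llvm/Support/MathExtras.h"
static_assert(llvm::divideCeil(7u, 3u) == 3u && llvm::divideCeil(6u, 3u) == 2u,
              "7/3 rounds up to 3; exact divisions are unchanged");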
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
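For instance (assuming llvm/Support/Alignment.h; the wrapper name is made up):
#include "llvm/Support/Alignment.h"
#include <cassert>
static void alignToExample() { // illustration only
  assert(llvm::alignTo(13, llvm::Align(8)) == 16); // 13 bytes need a 16-byte slot at 8-byte alignment
  assert(llvm::alignTo(16, llvm::Align(8)) == 16); // already-aligned sizes are unchanged
}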
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
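A one-line illustration (assuming llvm/Support/MathExtras.h):
#include "llvm/Support/MathExtras.h"
#include <cstdint>
static_assert(llvm::maskTrailingOnes<uint32_t>(4) == 0xFu, "the four least significant bits set");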
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
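A hedged sketch of these two factory functions, getVectorVT above and getIntegerVT here (assuming llvm/CodeGen/ValueTypes.h; the wrapper name is made up):
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>
static void evtExample(llvm::LLVMContext &Ctx) { // illustration only
  llvm::EVT I48 = llvm::EVT::getIntegerVT(Ctx, 48); // no matching MVT, so this is an extended type
  llvm::EVT V4I32 = llvm::EVT::getVectorVT(Ctx, llvm::MVT::i32, 4); // equivalent to MVT::v4i32
  assert(!I48.isSimple() && V4I32.getSimpleVT() == llvm::MVT::v4i32);
}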
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
void resetAll()
Resets the known state of all bits.
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
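A hedged sketch tying the zext and countMinLeadingZeros queries together (assuming llvm/Support/KnownBits.h; the concrete bit pattern and wrapper name are illustrative):
#include "llvm/Support/KnownBits.h"
#include <cassert>
static void knownBitsExample() { // illustration only
  llvm::KnownBits Known(8);
  Known.Zero.setHighBits(4);             // the top four bits are known to be zero
  assert(Known.countMinLeadingZeros() == 4);
  llvm::KnownBits Wide = Known.zext(16); // zero extension adds eight more known-zero bits
  assert(Wide.countMinLeadingZeros() == 12);
}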
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const