#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"

    cl::desc("Do not align and prefetch loops"),

    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),

    cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
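// Note: this helper scans the 32-bit SGPRs in order and returns the first one
// that the calling-convention state has not allocated yet. The guard that is
// not visible in this excerpt is assumed to look like:
//   if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg))
//     return AMDGPU::SGPR0 + Reg;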
  if (Subtarget->has16BitInsts()) {
    if (Subtarget->useRealTrue16Insts()) {

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},

      {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
       ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
       ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
       ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
       ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,

                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},

                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},

                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},

                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);

       {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
        MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
        MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
        MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
        MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
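// Note: the braced MVT lists above are value-type sets that the
// SITargetLowering constructor hands to setOperationAction() to mark nodes
// Legal / Custom / Expand / Promote per (opcode, type) pair, in the same form
// as the calls visible later in this excerpt. An illustrative call (opcodes
// and types chosen only as an example) looks like:
//   setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
//                      Custom);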
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {

                     {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
                      MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},

  if (Subtarget->hasPkMovB32()) {

                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},

                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

  if (Subtarget->hasSMemRealTime() ||

  if (Subtarget->has16BitInsts()) {

  if (Subtarget->hasMadMacF32Insts())

  if (!Subtarget->hasBFI())

  if (!Subtarget->hasBCNT(32))

  if (!Subtarget->hasBCNT(64))

  if (Subtarget->hasFFBH())

  if (Subtarget->hasFFBL())

  if (Subtarget->hasBFE())

  if (Subtarget->hasIntClamp())

  if (Subtarget->hasAddNoCarry())

      {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
      {MVT::f32, MVT::f64}, Custom);

                     {MVT::f32, MVT::f64}, Legal);

  if (Subtarget->haveRoundOpsF64())

  if (Subtarget->has16BitInsts()) {

                        ISD::FSIN, ISD::FROUND},

    if (Subtarget->hasBF16TransInsts())

         {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
          MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
          MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {

                       {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
                        MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
                        MVT::v32f16, MVT::v32bf16},

        {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},

                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

         {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
          MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

  if (Subtarget->hasVOP3PInsts()) {

                       {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

                       {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
                        MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
                        MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})

    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})

        {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
        {MVT::v2f16, MVT::v4f16}, Custom);

  if (Subtarget->hasPackedFP32Ops()) {

                       {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},

  if (Subtarget->has16BitInsts()) {

                     {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                      MVT::v32f16, MVT::v32bf16},

  if (Subtarget->hasVectorMulU64())
  else if (Subtarget->hasScalarSMulU64())

  if (Subtarget->hasMad64_32())

  if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())

  if (Subtarget->hasIEEEMinimumMaximumInsts()) {
                       {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

  if (Subtarget->hasMinimum3Maximum3F32())

  if (Subtarget->hasMinimum3Maximum3PKF16()) {
    if (!Subtarget->hasMinimum3Maximum3F16())

  if (Subtarget->hasVOP3PInsts()) {
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

  if (Subtarget->hasIntMinMax64())

                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,

                     {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
                      MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
                      MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
                      MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
                      MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

  if (Subtarget->hasBF16ConversionInsts()) {
  if (Subtarget->hasBF16PackedInsts()) {
  if (Subtarget->hasBF16TransInsts()) {

  if (Subtarget->hasCvtPkF16F32Inst()) {
                       {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},

  if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())

                       {ISD::ATOMIC_CMP_SWAP,
                        ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
                        ISD::ATOMIC_LOAD_ADD,
                        ISD::ATOMIC_LOAD_SUB,
                        ISD::ATOMIC_LOAD_AND,
                        ISD::ATOMIC_LOAD_OR,
                        ISD::ATOMIC_LOAD_XOR,
                        ISD::ATOMIC_LOAD_NAND,
                        ISD::ATOMIC_LOAD_MIN,
                        ISD::ATOMIC_LOAD_MAX,
                        ISD::ATOMIC_LOAD_UMIN,
                        ISD::ATOMIC_LOAD_UMAX,
                        ISD::ATOMIC_LOAD_FADD,
                        ISD::ATOMIC_LOAD_FMIN,
                        ISD::ATOMIC_LOAD_FMAX,
                        ISD::ATOMIC_LOAD_UINC_WRAP,
                        ISD::ATOMIC_LOAD_UDEC_WRAP,
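// Note: the ISD::ATOMIC_* opcode list above is presumably the argument list
// of a setTargetDAGCombine(...) call (the call itself is not visible in this
// excerpt), which asks the combiner to revisit nodes of these kinds so
// SITargetLowering::PerformDAGCombine can run on them.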
static const MCPhysReg RCRegs[] = {AMDGPU::MODE};

                                          EVT DestVT, EVT SrcVT) const {
         ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
            (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
           (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&

                                          LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
         SrcTy.getScalarSizeInBits() == 16 &&

  if (Subtarget->has16BitInsts()) {
      return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
    return VT.isInteger() ? MVT::i32 : MVT::f32;

  return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;

    if (Size == 16 && Subtarget->has16BitInsts())
      return (NumElts + 1) / 2;

    return NumElts * ((Size + 31) / 32);

    unsigned &NumIntermediates, MVT &RegisterVT) const {
    if (Size == 16 && Subtarget->has16BitInsts()) {
      if (ScalarVT == MVT::bf16) {
        RegisterVT = MVT::i32;
        IntermediateVT = MVT::v2bf16;
        RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
        IntermediateVT = RegisterVT;
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;

      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
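// Note: for the 16-bit element case above, two elements are packed into one
// 32-bit register (v2i16 / v2f16, or i32 for bf16), which is why the
// intermediate count rounds up: NumIntermediates = (NumElts + 1) / 2. For
// example, a v3f16 argument is assumed to be broken into two v2f16 parts.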
                                 unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

                                          unsigned MaxNumLanes) {
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

    return MVT::amdgpuBufferFatPointer;
      DL.getPointerSizeInBits(AS) == 192)
    return MVT::amdgpuBufferStridedPointer;
          DL.getPointerSizeInBits(AS) == 160) ||
          DL.getPointerSizeInBits(AS) == 192))

  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:

                                            unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

    if (RsrcIntr->IsImage) {
    Info.ptrVal = RsrcArg;

  bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
    if (RsrcIntr->IsImage) {
      unsigned MaxNumLanes = 4;
                            std::numeric_limits<unsigned>::max());
    if (RsrcIntr->IsImage) {
    if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
      Info.memVT = MVT::i32;

  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
                          std::numeric_limits<unsigned>::max());
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
    Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
    Info.memVT = MVT::i64;
  case Intrinsic::amdgcn_global_atomic_csub: {
  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
        MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
                       ->getElementType(0));
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_atomic_cond_sub_u32: {
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64: {
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.memVT = MVT::i32;
    Info.align = Align(4);
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds: {
  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
    Info.memVT = MVT::i32;
    Info.align = Align(4);
  case Intrinsic::amdgcn_s_prefetch_data:
  case Intrinsic::amdgcn_flat_prefetch:
  case Intrinsic::amdgcn_global_prefetch: {
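// Note: this hook (getTgtMemIntrinsic) fills in an IntrinsicInfo record
// (Info.opc, Info.memVT, Info.ptrVal, Info.align, plus access flags) for each
// of the target intrinsics above so the rest of SelectionDAG can treat them
// like ordinary memory accesses for aliasing and scheduling. The exact flag
// settings live in the case bodies elided from this excerpt.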
  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();

                                            Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_global_atomic_csub:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
    Ptr = II->getArgOperand(0);
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds:
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
    Ptr = II->getArgOperand(1);
  AccessTy = II->getType();
                                             unsigned AddrSpace) const {
  if (!Subtarget->hasFlatInstOffsets()) {

  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
                                  AM.BaseOffs, AddrSpace, FlatVariant));

  if (Subtarget->hasFlatGlobalInsts())

  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {

  return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
  if (AM.HasBaseReg) {

      return isLegalMUBUFAddressingMode(AM);

    if (!Subtarget->hasScalarSubwordLoads()) {
      if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)

    return Subtarget->enableFlatScratch()
               : isLegalMUBUFAddressingMode(AM);

    unsigned Size, unsigned AddrSpace, Align Alignment,

    if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))

    Align RequiredAlignment(
    if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
        Alignment < RequiredAlignment)

      if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))

      RequiredAlignment = Align(4);
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 64
                    : (Alignment < Align(4))         ? 32

      if (!Subtarget->hasDS96AndDS128())
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 96
                    : (Alignment < Align(4))         ? 32

      if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
      RequiredAlignment = Align(8);
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 128
                    : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||
           Subtarget->hasUnalignedDSAccessEnabled();

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;

    return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();

    return Alignment >= Align(4) ||
           Subtarget->hasUnalignedBufferAccessEnabled();

  if (!Subtarget->hasRelaxedBufferOOBMode() &&

  return Size >= 32 && Alignment >= Align(4);
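// Note: the misaligned-access check above answers two questions per address
// space: whether the access is legal at the given alignment, and a relative
// speed hint through *IsFast -- in the DS branches the hint is the access
// width (64/96/128) when the required alignment is met and a smaller value
// (32) when it is not.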
                                                    unsigned *IsFast) const {
                                            Alignment, Flags, IsFast);

                                          const AttributeList &FuncAttributes) const {
  if (Op.size() >= 16 &&
  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

                                           unsigned DestAS) const {
      Subtarget->hasGloballyAddressableScratch()) {

                                               unsigned Index) const {
  if (Subtarget->has16BitInsts() && VT == MVT::i16) {

  auto [InputPtrReg, RC, ArgTy] =
      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                        const SDLoc &SL) const {
                                    const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(
    int64_t OffsetDiff = Offset - AlignDownOffset;
    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);

      ExtType, SL, VA.getLocVT(), Chain, FIN,

SDValue SITargetLowering::getPreloadedValue(
  const ArgDescriptor *Reg = nullptr;
  const TargetRegisterClass *RC;

  const ArgDescriptor WorkGroupIDX =
  const ArgDescriptor WorkGroupIDZ =
  if (Subtarget->hasArchitectedSGPRs() &&
      Reg = &WorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &WorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &WorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;

  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
           "vector type argument should have been split");

      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
               "unexpected vector split in ps argument type");

      Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);

  if (Info.hasWorkItemIDX()) {
        (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;

  if (Info.hasWorkItemIDY()) {
    assert(Info.hasWorkItemIDX());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDY(
      unsigned Reg = AMDGPU::VGPR1;

  if (Info.hasWorkItemIDZ()) {
    assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDZ(
      unsigned Reg = AMDGPU::VGPR2;

  if (RegIdx == ArgVGPRs.size()) {
  unsigned Reg = ArgVGPRs[RegIdx];

                              unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())
  unsigned Reg = ArgSGPRs[RegIdx];

  const unsigned Mask = 0x3ff;
  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);
  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);
  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;

  auto &ArgInfo = Info.getArgInfo();
  if (Info.hasImplicitArgPtr())
  if (Info.hasWorkGroupIDX())
  if (Info.hasWorkGroupIDY())
  if (Info.hasWorkGroupIDZ())
  if (Info.hasLDSKernelId())

    Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);

    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);

    Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
  unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();

  bool InPreloadSequence = true;
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    unsigned ArgIdx = Arg.getArgNo();
    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
                      Subtarget->getAlignmentForImplicitArgPtr()) -
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        ArgOffset += ImplicitArgOffset;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;

  if (Info.hasLDSKernelId()) {
    Register Reg = Info.addLDSKernelId();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

                                           bool IsShader) const {
  bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
  if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      Register Reg = Info.addReservedUserSGPR();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      Register Reg = Info.addWorkGroupIDX();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDY()) {
      Register Reg = Info.addWorkGroupIDY();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDZ()) {
      Register Reg = Info.addWorkGroupIDZ();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {
    Register Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

  assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
         Info.getNumPreloadedSGPRs() >= 16);
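// Note: system SGPRs (workgroup IDs, workgroup info, and the private segment
// wave byte offset) are appended after the user SGPRs and marked live-in
// here; on subtargets with the UserSGPRInit16Bug the code above first pads
// the user SGPR count up to 16 with reserved registers.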
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);

    if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);

  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);

    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)
  bool IsError = false;

        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));

          !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
          !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());

    if (!Subtarget->enableFlatScratch())

      !Subtarget->hasArchitectedSGPRs())
    assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
           !Info->hasWorkGroupIDZ());

  bool IsWholeWaveFunc = Info->isWholeWaveFunction();

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    if (Subtarget->isAmdPalOS()) {
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
  } else if (IsKernel) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
    Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),

    if (IsKernel && Subtarget->hasKernargPreload())
  } else if (!IsGraphics) {
    if (!Subtarget->enableFlatScratch())

    Info->setNumWaveDispatchSGPRs(
    Info->setNumWaveDispatchVGPRs(
  } else if (Info->getNumKernargPreloadedSGPRs()) {
    Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());

  if (IsWholeWaveFunc) {
        {MVT::i1, MVT::Other}, Chain);

  for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;

    if (IsEntryFunc && VA.isMemLoc()) {
      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
          int64_t OffsetDiff = Offset - AlignDownOffset;
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
          ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;

          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
                                        TRI->getRegSizeInBits(*RC)));
            for (auto Reg : PreloadRegs) {
                                              PreloadRegs.size()),
          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

              "hidden argument in kernel signature was not preloaded",

            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);

    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

      if (AMDGPU::VGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::VGPR_32RegClass;
      else if (AMDGPU::SGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::SGPR_32RegClass;

      Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain
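// Note: in the argument-lowering loop above, kernel arguments are
// materialized in one of two ways: if the argument was preloaded into user
// SGPRs it is copied out of the corresponding live-in register(s) and
// bitcast/converted to the expected type, otherwise lowerKernargMemParameter
// emits an aligned load from the kernarg segment at the argument's offset.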
                                          const Type *RetTy) const {
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);

    unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
    unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
    for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
      if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  Info->setIfReturnsVoid(Outs.empty());
  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {

    SDValue Arg = OutVals[RealRVLocIdx];
                        ReadFirstLane, Arg);

  if (!Info->isEntryFunction()) {
      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))

    auto &ArgUsageInfo =
    CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

    const auto [OutgoingArg, ArgRC, ArgTy] =
    const auto [IncomingArg, IncomingArgRC, Ty] =
    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;

      InputReg = getImplicitArgPtr(DAG, DL);
      std::optional<uint32_t> Id =
      if (Id.has_value()) {

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      unsigned SpecialArgOffset =

  auto [OutgoingArg, ArgRC, Ty] =
    std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

    if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
             NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
             NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
                         : IncomingArgY ? *IncomingArgY

  if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);

  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
    if (!CCVA.isRegLoc())
    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
      dbgs() << "Cannot tail call due to divergent outgoing argument in "
enum ChainCallArgIdx {

  bool UsesDynamicVGPRs = false;
  if (IsChainCallConv) {
    auto RequestedExecIt =
          return Arg.OrigArgIndex == 2;
    assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");

    size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
    CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());

           "Haven't popped all the special args");

        CLI.Args[ChainCallArgIdx::Exec];
    if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))

            ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
        ChainCallSpecialArgs.push_back(Arg.Node);

    PushNodeOrTargetConstant(RequestedExecArg);

      if (FlagsValue.isZero()) {
        if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
                             "no additional args allowed if flags == 0");
        if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
        if (!Subtarget->isWave32()) {
              CLI, InVals, "dynamic VGPR mode is only supported for wave32");
        UsesDynamicVGPRs = true;
        std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
                      CLI.Args.end(), PushNodeOrTargetConstant);

  bool IsSibCall = false;

        "unsupported call to variadic function ");
        "unsupported required tail call to function ");
        Outs, OutVals, Ins, DAG);
           "site marked musttail or on llvm.amdgcn.cs.chain");

    if (!TailCallOpt && IsTailCall)

  auto *TRI = Subtarget->getRegisterInfo();

  if (!IsSibCall || IsChainCallConv) {
    if (!Subtarget->enableFlatScratch()) {
      RegsToPass.emplace_back(IsChainCallConv
                                  ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                  : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  const unsigned NumSpecialInputs = RegsToPass.size();

  MVT PtrVT = MVT::i32;

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));

      int32_t Offset = LocMemOffset;

      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
                             ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
                              Outs[i].Flags.getNonZeroByValAlign(),
                              nullptr, std::nullopt, DstInfo,
            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

    TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops({Chain});
    Ops.push_back(Callee);
    Ops.push_back(Callee);

  if (IsChainCallConv)

  for (auto &[Reg, Val] : RegsToPass)

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

                                 MVT::Glue, GlueOps),

    Ops.push_back(InGlue);

  if (Info->isWholeWaveFunction())

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
  EVT VT = Op.getValueType();

         "Stack grows upwards for AMDGPU");

  Chain = BaseAddr.getValue(1);
  if (Alignment > StackAlign) {
                               << Subtarget->getWavefrontSizeLog2();
    uint64_t StackAlignMask = ScaledAlignment - 1;

  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");

      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));

      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
  if (Op.getValueType() != MVT::i32)

  assert(Op.getValueType() == MVT::i32);

                  Op.getOperand(0), IntrinID, GetRoundBothImm);

  SDValue RoundModeTimesNumBits =
                                   TableEntry, EnumOffset);

          static_cast<uint32_t>(ConstMode->getZExtValue()),

  if (UseReducedTable) {
    SDValue RoundModeTimesNumBits =

    SDValue RoundModeTimesNumBits =
      NewMode = TruncTable;

                          ReadFirstLaneID, NewMode);

                              IntrinID, RoundBothImm, NewMode);

  if (Op->isDivergent() &&
      (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))

  if (Subtarget->hasSafeSmemPrefetch())

  if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();

  EVT DstVT = Op.getValueType();
    return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);

  if (Op.getValueType() != MVT::i64)

                  Op.getOperand(0), IntrinID, ModeHwRegImm);
                  Op.getOperand(0), IntrinID, TrapHwRegImm);

  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);

  if (Op.getOperand(1).getValueType() != MVT::i64)

                        ReadFirstLaneID, NewModeReg);
                        ReadFirstLaneID, NewTrapReg);

  unsigned ModeHwReg =
  unsigned TrapHwReg =

                             IntrinID, ModeHwRegImm, NewModeReg);
                             IntrinID, TrapHwRegImm, NewTrapReg);
          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (!Subtarget->hasFlatScrRegister() &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
                             "\" for subtarget."));

    case AMDGPU::EXEC_LO:
    case AMDGPU::EXEC_HI:
    case AMDGPU::FLAT_SCR_LO:
    case AMDGPU::FLAT_SCR_HI:
    case AMDGPU::FLAT_SCR:

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
static std::pair<MachineBasicBlock *, MachineBasicBlock *>

  auto Next = std::next(I);

  MBB.addSuccessor(LoopBB);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

  Src->setIsKill(false);

  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)

                 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
                 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
      MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                 : AMDGPU::S_AND_SAVEEXEC_B64),

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;

      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
          TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                 : AMDGPU::S_XOR_B64_term),

           unsigned InitResultReg, unsigned PhiReg, int Offset,
           bool UseGPRIdxMode, Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);

  LoopBB->removeSuccessor(RemainderBB);
  LoopBB->addSuccessor(LandingPad);
static std::pair<unsigned, int>

  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

    return std::pair(AMDGPU::sub0, Offset);

    Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();

  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {
    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(VecRC);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)

  MI.eraseFromParent();
  case AMDGPU::S_MIN_U32:
    return std::numeric_limits<uint32_t>::max();
  case AMDGPU::S_MIN_I32:
    return std::numeric_limits<int32_t>::max();
  case AMDGPU::S_MAX_U32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_MAX_I32:
    return std::numeric_limits<int32_t>::min();
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_AND_B32:
    return std::numeric_limits<uint32_t>::max();
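// Note: these are the identity elements used to seed a wave-wide reduction:
// min starts from the largest representable value, max from the smallest,
// add/sub/or/xor from 0 (numeric_limits<uint32_t>::min()), and "and" from
// ~0u (all bits set), so inactive lanes do not perturb the result.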
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));

    case AMDGPU::S_MIN_U32:
    case AMDGPU::S_MIN_I32:
    case AMDGPU::S_MAX_U32:
    case AMDGPU::S_MAX_I32:
    case AMDGPU::S_AND_B32:
    case AMDGPU::S_OR_B32: {

    case AMDGPU::S_XOR_B32:
    case AMDGPU::S_ADD_I32:
    case AMDGPU::S_SUB_I32: {

      Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
      Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);

      bool IsWave32 = ST.isWave32();
      unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
          IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;

      auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
                                .addReg(Exec->getOperand(0).getReg());

      case AMDGPU::S_XOR_B32: {
        Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
            .addReg(NewAccumulator->getOperand(0).getReg())
            .addReg(ParityReg->getOperand(0).getReg());

      case AMDGPU::S_SUB_I32: {
        Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
        auto InvertedValReg =
            .addReg(InvertedValReg->getOperand(0).getReg())
            .addReg(NewAccumulator->getOperand(0).getReg());

      case AMDGPU::S_ADD_I32: {
            .addReg(NewAccumulator->getOperand(0).getReg());

    Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
    Register InitalValReg = MRI.createVirtualRegister(DstRegClass);

    Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
    Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
    Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);

    Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

    bool IsWave32 = ST.isWave32();
    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)

    I = ComputeLoop->end();
        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
            .addReg(TmpSReg->getOperand(0).getReg())

    unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
    auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
                   .addReg(ActiveBits->getOperand(0).getReg());
    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                             TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
                         .addReg(FF1->getOperand(0).getReg());
            .addReg(LaneValue->getOperand(0).getReg());

    unsigned BITSETOpc =
        IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
    auto NewActiveBits =
        BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
            .addReg(FF1->getOperand(0).getReg())
            .addReg(ActiveBits->getOperand(0).getReg());

    Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
        .addMBB(ComputeLoop);
    ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
        .addMBB(ComputeLoop);

    unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
        .addReg(NewActiveBits->getOperand(0).getReg())
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))

  MI.eraseFromParent();
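// Note: for a divergent (VGPR) input the reduction above runs an explicit
// loop: S_FF1 finds the lowest set bit of the active-lane mask,
// V_READLANE_B32 fetches that lane's value, it is folded into the accumulator
// PHI, S_BITSET0 clears the bit, and S_CMP/S_CBRANCH repeat the loop until no
// active lanes remain.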
  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {

    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       : AMDGPU::S_SUB_I32;

    MI.eraseFromParent();

  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {

    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
    if (Subtarget->hasScalarAddSub64()) {
      unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;

      Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

          MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
          MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
          MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
          MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

      unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
      unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

    MI.eraseFromParent();

  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {

    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);

    if (ST.hasAddSubU64Insts()) {
                   TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
                                  : AMDGPU::V_SUB_U64_e64),
      TII->legalizeOperands(*I);
      MI.eraseFromParent();

    if (IsAdd && ST.hasLshlAddU64Inst()) {
      TII->legalizeOperands(*Add);
      MI.eraseFromParent();

    const auto *CarryRC = TRI->getWaveMaskRegClass();

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);

                                           : &AMDGPU::VReg_64RegClass;
                                           : &AMDGPU::VReg_64RegClass;

        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;

    TII->legalizeOperands(*LoHalf);
    TII->legalizeOperands(*HiHalf);
    MI.eraseFromParent();
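// Note: when no native 64-bit VALU add/sub is available, the pseudo above is
// split into two 32-bit halves: V_ADD_CO_U32 / V_SUB_CO_U32 produce the low
// word plus a carry in a wave-mask register, and V_ADDC_U32 / V_SUBB_U32
// consume that carry while producing the high word.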
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {

    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;

      Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)

      Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)

    Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)

    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
    assert(WaveSize == 64 || WaveSize == 32);

    if (WaveSize == 64) {
      if (ST.hasScalarCompareEq64()) {
            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)

        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;

    MI.eraseFromParent();

  case AMDGPU::SI_INIT_M0: {
            TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
    MI.eraseFromParent();
  case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
            TII->get(AMDGPU::S_CMP_EQ_U32))

  case AMDGPU::GET_GROUPSTATICSIZE: {
        .add(MI.getOperand(0))
    MI.eraseFromParent();

  case AMDGPU::GET_SHADERCYCLESHILO: {
    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));

    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .add(MI.getOperand(0))
    MI.eraseFromParent();

  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:

  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    Register SrcCond = MI.getOperand(3).getReg();

    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getWaveMaskRegClass();
    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);

                                         : &AMDGPU::VReg_64RegClass;
                                         : &AMDGPU::VReg_64RegClass;

        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

    MI.eraseFromParent();

  case AMDGPU::SI_BR_UNDEF: {
        .add(MI.getOperand(0));
    MI.eraseFromParent();

  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {

  case AMDGPU::SI_CALL_ISEL: {
    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);

    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);

    MI.eraseFromParent();
5947 case AMDGPU::V_ADD_CO_U32_e32:
5948 case AMDGPU::V_SUB_CO_U32_e32:
5949 case AMDGPU::V_SUBREV_CO_U32_e32: {
5952 unsigned Opc =
MI.getOpcode();
5954 bool NeedClampOperand =
false;
5955 if (
TII->pseudoToMCOpcode(
Opc) == -1) {
5957 NeedClampOperand =
true;
5961 if (
TII->isVOP3(*
I)) {
5966 I.add(
MI.getOperand(1)).add(
MI.getOperand(2));
5967 if (NeedClampOperand)
5970 TII->legalizeOperands(*
I);
5972 MI.eraseFromParent();
5975 case AMDGPU::V_ADDC_U32_e32:
5976 case AMDGPU::V_SUBB_U32_e32:
5977 case AMDGPU::V_SUBBREV_U32_e32:
5980 TII->legalizeOperands(
MI);
5982 case AMDGPU::DS_GWS_INIT:
5983 case AMDGPU::DS_GWS_SEMA_BR:
5984 case AMDGPU::DS_GWS_BARRIER:
5985 TII->enforceOperandRCAlignment(
MI, AMDGPU::OpName::data0);
5987 case AMDGPU::DS_GWS_SEMA_V:
5988 case AMDGPU::DS_GWS_SEMA_P:
5989 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
  case AMDGPU::S_SETREG_B32: {
    const unsigned SetMask = WidthMask << Offset;
    unsigned SetDenormOp = 0;
    unsigned SetRoundOp = 0;
      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;
      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;
    if (SetRoundOp || SetDenormOp) {
      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
        unsigned ImmVal = Def->getOperand(1).getImm();
          MI.eraseFromParent();
    MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
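  // Sketch: the hwreg field arithmetic used in the S_SETREG_B32 case above.
  // The setreg operand encodes an (offset, width) pair and the bits it writes
  // are WidthMask << Offset. Plain C++; the helper name is illustrative.
  #include <cstdint>

  constexpr uint32_t hwregSetMask(unsigned Offset, unsigned Width) {
    const uint32_t WidthMask = (Width >= 32) ? ~0u : ((1u << Width) - 1);
    return WidthMask << Offset;
  }
  static_assert(hwregSetMask(0, 4) == 0xFu, "FP round field");
  static_assert(hwregSetMask(4, 4) == 0xF0u, "FP denorm field");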
  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64:
    MI.setDesc(TII->get(AMDGPU::COPY));
  case AMDGPU::ENDPGM_TRAP: {
    MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
    MI.eraseFromParent();
  case AMDGPU::SIMULATED_TRAP: {
    assert(Subtarget->hasPrivEnabledTrap2NopBug());
    TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
    MI.eraseFromParent();
  case AMDGPU::SI_TCRETURN_GFX_WholeWave:
  case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
    assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
    Register OriginalExec = Setup->getOperand(0).getReg();
    MI.getOperand(0).setReg(OriginalExec);
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
  return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
  if (!Subtarget->hasMadMacF32Insts())
    return Subtarget->hasFastFMAF32();
  return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
  return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
  switch (Ty.getScalarSizeInBits()) {
  if (Ty.getScalarSizeInBits() == 16)
  if (Ty.getScalarSizeInBits() == 32)
    return Subtarget->hasMadMacF32Insts() &&
  EVT VT = N->getValueType(0);
  return Subtarget->hasMadMacF32Insts() &&
  if (VT == MVT::f16) {
    return Subtarget->hasMadF16() &&
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
         VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
         VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
         VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
         VT == MVT::v32bf16);
      DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
      DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
         VT == MVT::v32bf16);
                      : std::pair(Op0, Op0);
      DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
      DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
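// Sketch: the split-and-recombine pattern the splitUnary/Binary/TernaryVectorOp
// helpers above apply to wide vector types -- take the low and high halves,
// emit the same operation on each half, then concatenate the results. Shown
// with std::vector and a callback instead of SelectionDAG nodes; all names are
// illustrative and assume both inputs have the same, even length.
#include <cstddef>
#include <functional>
#include <vector>

static std::vector<float>
splitBinaryVectorOpSketch(const std::vector<float> &A, const std::vector<float> &B,
                          const std::function<float(float, float)> &Op) {
  const size_t Half = A.size() / 2;
  std::vector<float> Lo, Hi;
  for (size_t I = 0; I < Half; ++I)          // operation on the low half
    Lo.push_back(Op(A[I], B[I]));
  for (size_t I = Half; I < A.size(); ++I)   // operation on the high half
    Hi.push_back(Op(A[I], B[I]));
  Lo.insert(Lo.end(), Hi.begin(), Hi.end()); // CONCAT_VECTORS of the two halves
  return Lo;
}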
  switch (Op.getOpcode()) {
    return LowerBRCOND(Op, DAG);
    return LowerRETURNADDR(Op, DAG);
    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    EVT VT = Op.getValueType();
      return lowerFSQRTF32(Op, DAG);
      return lowerFSQRTF64(Op, DAG);
    return LowerTrig(Op, DAG);
    return LowerSELECT(Op, DAG);
    return LowerFDIV(Op, DAG);
    return LowerFFREXP(Op, DAG);
  case ISD::ATOMIC_CMP_SWAP:
    return LowerATOMIC_CMP_SWAP(Op, DAG);
    return LowerSTORE(Op, DAG);
    return LowerGlobalAddress(MFI, Op, DAG);
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
    return LowerINTRINSIC_VOID(Op, DAG);
  case ISD::ADDRSPACECAST:
    return lowerADDRSPACECAST(Op, DAG);
    return lowerINSERT_SUBVECTOR(Op, DAG);
    return lowerINSERT_VECTOR_ELT(Op, DAG);
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    return lowerVECTOR_SHUFFLE(Op, DAG);
    return lowerSCALAR_TO_VECTOR(Op, DAG);
    return lowerBUILD_VECTOR(Op, DAG);
    return lowerFP_ROUND(Op, DAG);
    return lowerTRAP(Op, DAG);
  case ISD::DEBUGTRAP:
    return lowerDEBUGTRAP(Op, DAG);
    return lowerFMINNUM_FMAXNUM(Op, DAG);
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
    return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
    return lowerFMINIMUM_FMAXIMUM(Op, DAG);
    return lowerFLDEXP(Op, DAG);
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
    return lowerFCOPYSIGN(Op, DAG);
    return lowerMUL(Op, DAG);
    return lowerXMULO(Op, DAG);
    return lowerXMUL_LOHI(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
  case ISD::STACKSAVE:
  case ISD::SET_ROUNDING:
  case ISD::FP_EXTEND:
  case ISD::GET_FPENV:
  case ISD::SET_FPENV:
  EVT FittingLoadVT = LoadVT;
    return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
  return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);

SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
                                              bool IsIntrinsic) const {
  bool Unpacked = Subtarget->hasUnpackedD16VMem();
  EVT LoadVT = M->getValueType(0);
  EVT EquivLoadVT = LoadVT;
  SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
      M->getMemoryVT(), M->getMemOperand());

  EVT LoadVT = M->getValueType(0);
  assert(M->getNumValues() == 2 || M->getNumValues() == 3);
  bool IsTFE = M->getNumValues() == 3;
    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
  return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
                             M->getMemOperand(), DAG);
  SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
      M->getMemOperand(), DAG);
  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);
  EVT CmpVT = LHS.getValueType();
  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
    unsigned PromoteOp =

  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);
  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
    Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
    Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);

  EVT VT = N->getValueType(0);
                     Src.getOperand(1), Src.getOperand(2));
    Exec = AMDGPU::EXEC_LO;
    Exec = AMDGPU::EXEC;
  EVT VT = N->getValueType(0);
  unsigned IID = N->getConstantOperandVal(0);
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
      ST->hasDPALU_DPP() &&
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16:
    case Intrinsic::amdgcn_update_dpp:
    case Intrinsic::amdgcn_writelane:
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
    case Intrinsic::amdgcn_mov_dpp8:
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
    if (SDNode *GL = N->getGluedNode()) {
      assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
      GL = GL->getOperand(0).getNode();
      Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_mov_dpp8 ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = N->getOperand(2);
    if (IID == Intrinsic::amdgcn_writelane ||
        IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
      Src2 = N->getOperand(3);
  if (ValSize == SplitSize) {
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    if (IID == Intrinsic::amdgcn_writelane) {
    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
    return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
  if (ValSize % SplitSize != 0)

  EVT VT = N->getValueType(0);
  unsigned NumOperands = N->getNumOperands();
  SDNode *GL = N->getGluedNode();
  for (unsigned i = 0; i != NE; ++i) {
    for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
      SDValue Operand = N->getOperand(j);
      DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,

  if (SplitSize == 32) {
    return unrollLaneOp(LaneOp.getNode());
  unsigned SubVecNumElt =
  SDValue Src0SubVec, Src1SubVec, Src2SubVec;
  for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
    if (IID == Intrinsic::amdgcn_writelane)
        IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
            ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
            : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
    EltIdx += SubVecNumElt;
  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
  if (IID == Intrinsic::amdgcn_writelane)
  SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
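// Sketch: the wide-value handling in the lane-op lowering above -- a value
// whose size is a multiple of 32 bits is cut into 32-bit pieces, each piece
// goes through the lane instruction separately, and the pieces are put back
// together. Plain C++; PieceOp stands in for createLaneOp and all names are
// illustrative.
#include <cassert>
#include <cstdint>
#include <cstring>
#include <functional>
#include <vector>

static std::vector<uint32_t>
perDwordLaneOp(const void *Data, size_t SizeInBytes,
               const std::function<uint32_t(uint32_t)> &PieceOp) {
  assert(SizeInBytes % 4 == 0 && "lowering only splits multiples of 32 bits");
  std::vector<uint32_t> Pieces(SizeInBytes / 4);
  std::memcpy(Pieces.data(), Data, SizeInBytes);
  for (uint32_t &P : Pieces)
    P = PieceOp(P);  // e.g. a readlane/writelane of this dword
  return Pieces;     // reassembled into the original wide type by the caller
}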
6889 switch (
N->getOpcode()) {
6901 unsigned IID =
N->getConstantOperandVal(0);
6903 case Intrinsic::amdgcn_make_buffer_rsrc:
6904 Results.push_back(lowerPointerAsRsrcIntrin(
N, DAG));
6906 case Intrinsic::amdgcn_cvt_pkrtz: {
6912 Results.push_back(DAG.
getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6915 case Intrinsic::amdgcn_cvt_pknorm_i16:
6916 case Intrinsic::amdgcn_cvt_pknorm_u16:
6917 case Intrinsic::amdgcn_cvt_pk_i16:
6918 case Intrinsic::amdgcn_cvt_pk_u16: {
6924 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6926 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6928 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6933 EVT VT =
N->getValueType(0);
6938 Results.push_back(DAG.
getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6942 case Intrinsic::amdgcn_s_buffer_load: {
6948 if (!Subtarget->hasScalarSubwordLoads())
6954 EVT VT =
Op.getValueType();
6955 assert(VT == MVT::i8 &&
"Expected 8-bit s_buffer_load intrinsics.\n");
6967 if (!
Offset->isDivergent()) {
6986 LoadVal = handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
6991 case Intrinsic::amdgcn_dead: {
6992 for (
unsigned I = 0, E =
N->getNumValues();
I < E; ++
I)
7003 for (
unsigned I = 0;
I < Res.getNumOperands();
I++) {
7004 Results.push_back(Res.getOperand(
I));
7008 Results.push_back(Res.getValue(1));
7017 EVT VT =
N->getValueType(0);
7022 EVT SelectVT = NewVT;
7023 if (NewVT.
bitsLT(MVT::i32)) {
7026 SelectVT = MVT::i32;
7032 if (NewVT != SelectVT)
7038 if (
N->getValueType(0) != MVT::v2f16)
7042 SDValue BC = DAG.
getNode(ISD::BITCAST, SL, MVT::i32,
N->getOperand(0));
7050 if (
N->getValueType(0) != MVT::v2f16)
7054 SDValue BC = DAG.
getNode(ISD::BITCAST, SL, MVT::i32,
N->getOperand(0));
7062 if (
N->getValueType(0) != MVT::f16)
7077 if (U.get() !=
Value)
7080 if (U.getUser()->getOpcode() == Opcode)
7086unsigned SITargetLowering::isCFIntrinsic(
const SDNode *Intr)
const {
7089 case Intrinsic::amdgcn_if:
7091 case Intrinsic::amdgcn_else:
7093 case Intrinsic::amdgcn_loop:
7095 case Intrinsic::amdgcn_end_cf:
7115 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7142 SDNode *Intr = BRCOND.getOperand(1).getNode();
7155 assert(BR &&
"brcond missing unconditional branch user");
7159 unsigned CFNode = isCFIntrinsic(Intr);
7179 Ops.push_back(Target);
7202 for (
unsigned i = 1, e = Intr->
getNumValues() - 1; i != e; ++i) {
7221 MVT VT =
Op.getSimpleValueType();
7224 if (
Op.getConstantOperandVal(0) != 0)
7228 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7230 if (
Info->isEntryFunction())
7247 return Op.getValueType().bitsLE(VT)
7255 EVT DstVT =
Op.getValueType();
7262 unsigned Opc =
Op.getOpcode();
7274 EVT SrcVT = Src.getValueType();
7275 EVT DstVT =
Op.getValueType();
7278 assert(Subtarget->hasCvtPkF16F32Inst() &&
"support v_cvt_pk_f16_f32");
7281 return SrcVT == MVT::v2f32 ?
Op : splitFP_ROUNDVectorOp(
Op, DAG);
7288 if (DstVT == MVT::f16) {
7293 if (!Subtarget->has16BitInsts()) {
7296 return DAG.
getNode(ISD::BITCAST,
DL, MVT::f16, Trunc);
7298 if (
Op->getFlags().hasApproximateFuncs()) {
7305 return DAG.
getNode(ISD::BITCAST,
DL, MVT::f16, Trunc);
7309 "custom lower FP_ROUND for f16 or bf16");
7310 assert(Subtarget->hasBF16ConversionInsts() &&
"f32 -> bf16 is legal");
  EVT VT = Op.getValueType();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  bool IsIEEEMode = Info->getMode().IEEE;
  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||

SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
  EVT VT = Op.getValueType();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  bool IsIEEEMode = Info->getMode().IEEE;
  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||

  EVT VT = Op.getValueType();
  assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
         !Subtarget->hasMinimum3Maximum3F16() &&
         Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
         "should not need to widen f16 minimum/maximum to v2f16");
      DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);

  EVT VT = Op.getValueType();
  EVT ExpVT = Exp.getValueType();
  if (ExpVT == MVT::i16)
                     {Op.getOperand(0), Op.getOperand(1), TruncExp});
  return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
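  // Sketch: the reasoning behind truncating the wide FLDEXP exponent above.
  // Any exponent far outside the result type's range already saturates the
  // result to zero or infinity, so clamping into a narrow range first is
  // lossless. Plain C++ on float; the +/-512 bound is an illustrative choice
  // that comfortably covers float's exponent span, not a value from the source.
  #include <algorithm>
  #include <cmath>
  #include <cstdint>

  static float ldexpClamped(float X, int64_t WideExp) {
    const int64_t Clamped = std::clamp<int64_t>(WideExp, -512, 512);
    return std::ldexp(X, static_cast<int>(Clamped));
  }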
7422 switch (
Op->getOpcode()) {
7452 DAGCombinerInfo &DCI)
const {
7453 const unsigned Opc =
Op.getOpcode();
7461 :
Op->getOperand(0).getValueType();
7464 if (DCI.isBeforeLegalizeOps() ||
7468 auto &DAG = DCI.DAG;
7474 LHS =
Op->getOperand(1);
7475 RHS =
Op->getOperand(2);
7477 LHS =
Op->getOperand(0);
7478 RHS =
Op->getOperand(1);
7517 if (MagVT == SignVT)
7524 SDValue SignAsInt32 = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7527 SDValue SignAsHalf16 = DAG.
getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
  EVT VT = Op.getValueType();
  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
  if (Op->isDivergent())
  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
  if (Op0SignBits >= 33 && Op1SignBits >= 33)
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
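  // Sketch: the operand-width test behind the S_MUL_U64_U32 / S_MUL_I64_I32
  // pseudo selection above. A 64-bit product can use a 32-bit multiply when
  // both operands are known to fit in 32 bits: >= 32 leading zeros for the
  // unsigned form, >= 33 sign bits for the signed form. Names are illustrative.
  #include <cstdint>

  static bool fitsUnsigned32(uint64_t V) { return (V >> 32) == 0; }
  static bool fitsSigned32(int64_t V) {
    return V >= INT32_MIN && V <= INT32_MAX; // at least 33 sign bits
  }

  static uint64_t mul64Sketch(uint64_t A, uint64_t B) {
    if (fitsUnsigned32(A) && fitsUnsigned32(B))
      return static_cast<uint64_t>(static_cast<uint32_t>(A)) *
             static_cast<uint32_t>(B);                      // u32 x u32 -> u64
    if (fitsSigned32(static_cast<int64_t>(A)) &&
        fitsSigned32(static_cast<int64_t>(B)))
      return static_cast<uint64_t>(
          static_cast<int64_t>(static_cast<int32_t>(A)) *
          static_cast<int64_t>(static_cast<int32_t>(B)));   // i32 x i32 -> i64
    return A * B;                                           // full 64-bit multiply
  }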
7593 EVT VT =
Op.getValueType();
7600 const APInt &
C = RHSC->getAPIntValue();
7602 if (
C.isPowerOf2()) {
7604 bool UseArithShift =
isSigned && !
C.isMinSignedValue();
7631 if (
Op->isDivergent()) {
7635 if (Subtarget->hasSMulHi()) {
7646 if (!Subtarget->isTrapHandlerEnabled() ||
7648 return lowerTrapEndpgm(
Op, DAG);
7650 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(
Op, DAG)
7651 : lowerTrapHsaQueuePtr(
Op, DAG);
7661SITargetLowering::loadImplicitKernelArgument(
SelectionDAG &DAG,
MVT VT,
7663 ImplicitParameter Param)
const {
7683 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
7686 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7689 if (UserSGPR == AMDGPU::NoRegister) {
7715 if (Subtarget->hasPrivEnabledTrap2NopBug())
7728 if (!Subtarget->isTrapHandlerEnabled() ||
7732 "debugtrap handler not supported",
7743SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
7745 if (Subtarget->hasApertureRegs()) {
7747 ? AMDGPU::SRC_SHARED_BASE
7748 : AMDGPU::SRC_PRIVATE_BASE;
7749 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
7750 !Subtarget->hasGloballyAddressableScratch()) &&
7751 "Cannot use src_private_base with globally addressable scratch!");
7774 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7783 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
7787 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7789 if (UserSGPR == AMDGPU::NoRegister) {
  return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);

  const AMDGPUTargetMachine &TM =
  unsigned DestAS, SrcAS;
  bool IsNonNull = false;
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(0);
    DestAS = ASC->getDestAddressSpace();
           Op.getConstantOperandVal(0) == Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(1);
    SrcAS = Op->getConstantOperandVal(2);
    DestAS = Op->getConstantOperandVal(3);
        Subtarget->hasGloballyAddressableScratch()) {
              AMDGPU::S_MOV_B32, SL, MVT::i32,
              DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
    unsigned NullVal = TM.getNullPointerValue(DestAS);
        Subtarget->hasGloballyAddressableScratch()) {
      if (Subtarget->isWave64())
          57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
      CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
              AMDGPU::S_MOV_B64, SL, MVT::i64,
              DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
      CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
      SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
      CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
    unsigned NullVal = TM.getNullPointerValue(SrcAS);
      Op.getValueType() == MVT::i64) {
    const SIMachineFunctionInfo *Info =
    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
      Src.getValueType() == MVT::i64)
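// Sketch: the core of the segment-to-flat cast above, with plain integers. A
// 32-bit LDS/private offset becomes a 64-bit flat pointer by pairing it with
// the segment's aperture base in the high half; the reverse cast truncates to
// the low 32 bits. Names are illustrative; the globally-addressable-scratch
// path that adds a full 64-bit base is not shown.
#include <cstdint>

static uint64_t segmentToFlat(uint32_t SegmentOffset, uint32_t ApertureHi) {
  return (static_cast<uint64_t>(ApertureHi) << 32) | SegmentOffset; // build_pair
}

static uint32_t flatToSegment(uint64_t FlatPtr) {
  return static_cast<uint32_t>(FlatPtr); // truncate to the 32-bit offset
}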
7969 EVT InsVT =
Ins.getValueType();
7977 assert(InsNumElts % 2 == 0 &&
"expect legal vector types");
7982 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7984 MVT::i32, InsNumElts / 2);
7986 Vec = DAG.
getNode(ISD::BITCAST, SL, NewVecVT, Vec);
7987 Ins = DAG.
getNode(ISD::BITCAST, SL, NewInsVT, Ins);
7989 for (
unsigned I = 0;
I != InsNumElts / 2; ++
I) {
7991 if (InsNumElts == 2) {
8001 return DAG.
getNode(ISD::BITCAST, SL, VecVT, Vec);
8004 for (
unsigned I = 0;
I != InsNumElts; ++
I) {
8027 if (NumElts == 4 && EltSize == 16 && KIdx) {
8035 SDValue LoVec = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8036 SDValue HiVec = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8038 unsigned Idx = KIdx->getZExtValue();
8039 bool InsertLo = Idx < 2;
8042 DAG.
getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8043 DAG.
getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8045 InsHalf = DAG.
getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8049 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8062 assert(VecSize <= 64 &&
"Expected target vector size to be <= 64 bits");
8090 return DAG.
getNode(ISD::BITCAST, SL, VecVT, BFI);
8097 EVT ResultVT =
Op.getValueType();
8110 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
8113 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8117 if (VecSize == 128) {
8125 }
else if (VecSize == 256) {
8128 for (
unsigned P = 0;
P < 4; ++
P) {
8134 Parts[0], Parts[1]));
8136 Parts[2], Parts[3]));
8142 for (
unsigned P = 0;
P < 8; ++
P) {
8149 Parts[0], Parts[1], Parts[2], Parts[3]));
8152 Parts[4], Parts[5], Parts[6], Parts[7]));
8172 Src = DAG.
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8187 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8189 return DAG.
getNode(ISD::BITCAST, SL, ResultVT, Result);
8197 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8202 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8203 !(Mask[Elt + 1] & 1);
8209 EVT ResultVT =
Op.getValueType();
8212 const int NewSrcNumElts = 2;
8214 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
8230 const bool ShouldUseConsecutiveExtract = EltVT.
getSizeInBits() == 16;
8252 if (ShouldUseConsecutiveExtract &&
8255 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8256 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8268 if (Idx0 >= SrcNumElts) {
8273 if (Idx1 >= SrcNumElts) {
8278 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8279 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8287 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8288 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8293 if (SubVec0 != SubVec1) {
8294 NewMaskIdx1 += NewSrcNumElts;
8301 {NewMaskIdx0, NewMaskIdx1});
8306 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8307 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8308 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8309 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
  EVT ResultVT = Op.getValueType();

  EVT VT = Op.getValueType();
  if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
    assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
      return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
      return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
    return DAG.getNode(ISD::BITCAST, SL, VT, Or);
  for (unsigned P = 0; P < NumParts; ++P) {
        PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
  return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
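// Sketch: what the v2i16/v2f16 BUILD_VECTOR path above computes -- the low
// element is zero-extended, the high element is shifted left by 16, and the
// two are OR'ed into one 32-bit word that is then bitcast to the vector type.
#include <cstdint>

static uint32_t packHalves(uint16_t Lo, uint16_t Hi) {
  return static_cast<uint32_t>(Lo) | (static_cast<uint32_t>(Hi) << 16);
}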
8402 if (!Subtarget->isAmdHsaOS())
8462 EVT PtrVT =
Op.getValueType();
8464 const GlobalValue *GV = GSD->
getGlobal();
8478 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
8496 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8497 if (Subtarget->has64BitLiterals()) {
8528 MachinePointerInfo PtrInfo =
8556 SDValue Param = lowerKernargMemParameter(
8567 "non-hsa intrinsic with hsa target",
DL.getDebugLoc()));
8575 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
8583 unsigned NumElts = Elts.
size();
8585 if (NumElts <= 12) {
8594 for (
unsigned i = 0; i < Elts.
size(); ++i) {
8600 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
8610 EVT SrcVT = Src.getValueType();
8631 bool Unpacked,
bool IsD16,
int DMaskPop,
8632 int NumVDataDwords,
bool IsAtomicPacked16Bit,
8636 EVT ReqRetVT = ResultTypes[0];
8638 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
8639 ? (ReqRetNumElts + 1) / 2
8642 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
8653 if (DMaskPop > 0 &&
Data.getValueType() != MaskPopVT) {
8664 if (DataDwordVT.
isVector() && !IsAtomicPacked16Bit)
8666 NumDataDwords - MaskPopDwords);
8671 EVT LegalReqRetVT = ReqRetVT;
8673 if (!
Data.getValueType().isInteger())
8675 Data.getValueType().changeTypeToInteger(),
Data);
8696 if (Result->getNumValues() == 1)
8703 SDValue *LWE,
bool &IsTexFail) {
8723 unsigned DimIdx,
unsigned EndIdx,
8724 unsigned NumGradients) {
8726 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
8734 if (((
I + 1) >= EndIdx) ||
8735 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
8736 I == DimIdx + NumGradients - 1))) {
8755 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
8769 int NumVDataDwords = 0;
8770 bool AdjustRetType =
false;
8771 bool IsAtomicPacked16Bit =
false;
8774 const unsigned ArgOffset = WithChain ? 2 : 1;
8777 unsigned DMaskLanes = 0;
8779 if (BaseOpcode->Atomic) {
8780 VData =
Op.getOperand(2);
8782 IsAtomicPacked16Bit =
8783 (Intr->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8784 Intr->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8787 if (BaseOpcode->AtomicX2) {
8794 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8795 DMask = Is64Bit ? 0xf : 0x3;
8796 NumVDataDwords = Is64Bit ? 4 : 2;
8798 DMask = Is64Bit ? 0x3 : 0x1;
8799 NumVDataDwords = Is64Bit ? 2 : 1;
8802 DMask =
Op->getConstantOperandVal(ArgOffset + Intr->
DMaskIndex);
8805 if (BaseOpcode->Store) {
8806 VData =
Op.getOperand(2);
8810 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8814 VData = handleD16VData(VData, DAG,
true);
8817 NumVDataDwords = (VData.
getValueType().getSizeInBits() + 31) / 32;
8818 }
else if (!BaseOpcode->NoReturn) {
8823 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8831 (!LoadVT.
isVector() && DMaskLanes > 1))
8837 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
8838 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
8839 NumVDataDwords = (DMaskLanes + 1) / 2;
8841 NumVDataDwords = DMaskLanes;
8843 AdjustRetType =
true;
8847 unsigned VAddrEnd = ArgOffset + Intr->
VAddrEnd;
8854 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8855 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8857 VAddrVT =
Op.getOperand(ArgOffset + Intr->
CoordStart).getSimpleValueType();
8859 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8860 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8864 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
8870 {
Op.getOperand(ArgOffset +
I), DAG.
getPOISON(MVT::f16)});
8874 "Bias needs to be converted to 16 bit in A16 mode");
8879 if (BaseOpcode->Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
8883 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
8884 "require 16 bit args for both gradients and addresses");
8889 if (!
ST->hasA16()) {
8890 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
8891 "support 16 bit addresses\n");
8901 if (BaseOpcode->Gradients && IsG16 &&
ST->hasG16()) {
8903 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8905 IntrOpcode = G16MappingInfo->
G16;
8928 for (
unsigned I = ArgOffset + Intr->
CoordStart;
I < VAddrEnd;
I++)
8946 const unsigned NSAMaxSize =
ST->getNSAMaxSize(BaseOpcode->Sampler);
8947 const bool HasPartialNSAEncoding =
ST->hasPartialNSAEncoding();
8948 const bool UseNSA =
ST->hasNSAEncoding() &&
8949 VAddrs.
size() >=
ST->getNSAThreshold(MF) &&
8950 (VAddrs.
size() <= NSAMaxSize || HasPartialNSAEncoding);
8951 const bool UsePartialNSA =
8952 UseNSA && HasPartialNSAEncoding && VAddrs.
size() > NSAMaxSize;
8955 if (UsePartialNSA) {
8957 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8958 }
else if (!UseNSA) {
8965 if (!BaseOpcode->Sampler) {
8968 uint64_t UnormConst =
8969 Op.getConstantOperandVal(ArgOffset + Intr->
UnormIndex);
8971 Unorm = UnormConst ? True : False;
8977 bool IsTexFail =
false;
8978 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8989 NumVDataDwords += 1;
8990 AdjustRetType =
true;
8995 if (AdjustRetType) {
8998 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9007 MVT::i32, NumVDataDwords)
9010 ResultTypes[0] = NewVT;
9011 if (ResultTypes.size() == 3) {
9015 ResultTypes.erase(&ResultTypes[1]);
9020 if (BaseOpcode->Atomic)
9027 if (BaseOpcode->Store || BaseOpcode->Atomic)
9028 Ops.push_back(VData);
9029 if (UsePartialNSA) {
9031 Ops.push_back(VAddr);
9035 Ops.push_back(VAddr);
9038 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9040 Ops.push_back(Rsrc);
9041 if (BaseOpcode->Sampler) {
9045 Ops.push_back(Samp);
9050 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9051 Ops.push_back(Unorm);
9053 Ops.push_back(IsA16 &&
9054 ST->hasFeature(AMDGPU::FeatureR128A16)
9058 Ops.push_back(IsA16 ? True : False);
9060 if (!Subtarget->hasGFX90AInsts())
9065 "TFE is not supported on this GPU",
DL.getDebugLoc()));
9068 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9071 Ops.push_back(DimInfo->
DA ? True : False);
9072 if (BaseOpcode->HasD16)
9073 Ops.push_back(IsD16 ? True : False);
9075 Ops.push_back(
Op.getOperand(0));
9077 int NumVAddrDwords =
9083 NumVDataDwords, NumVAddrDwords);
9084 }
else if (IsGFX11Plus) {
9086 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9087 : AMDGPU::MIMGEncGfx11Default,
9088 NumVDataDwords, NumVAddrDwords);
9089 }
else if (IsGFX10Plus) {
9091 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9092 : AMDGPU::MIMGEncGfx10Default,
9093 NumVDataDwords, NumVAddrDwords);
9095 if (Subtarget->hasGFX90AInsts()) {
9097 NumVDataDwords, NumVAddrDwords);
9101 "requested image instruction is not supported on this GPU",
9106 for (EVT VT : OrigResultTypes) {
9107 if (VT == MVT::Other)
9108 RetValues[Idx++] =
Op.getOperand(0);
9119 NumVDataDwords, NumVAddrDwords);
9122 NumVDataDwords, NumVAddrDwords);
9129 MachineMemOperand *MemRef = MemOp->getMemOperand();
9133 if (BaseOpcode->AtomicX2) {
9138 if (BaseOpcode->NoReturn)
9141 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9142 NumVDataDwords, IsAtomicPacked16Bit,
DL);
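// Sketch: the return-data dword count used by the image lowering above. With
// packed D16 two 16-bit lanes share one dword, and enabling TFE appends one
// extra status dword. Plain arithmetic; the function name is illustrative.
static unsigned imageDataDwords(unsigned DMaskLanes, bool IsD16,
                                bool UnpackedD16, bool HasTFE) {
  unsigned Dwords = (IsD16 && !UnpackedD16) ? (DMaskLanes + 1) / 2 : DMaskLanes;
  if (HasTFE)
    Dwords += 1; // texture-fail-enable status dword
  return Dwords;
}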
9155 MachinePointerInfo(),
9160 if (!
Offset->isDivergent()) {
9167 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9176 !Subtarget->hasScalarDwordx3Loads()) {
9203 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9205 return handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
9209 unsigned NumLoads = 1;
9215 if (NumElts == 8 || NumElts == 16) {
9216 NumLoads = NumElts / 4;
9220 SDVTList VTList = DAG.
getVTList({LoadVT, MVT::Other});
9225 NumLoads > 1 ?
Align(16 * NumLoads) :
Align(4));
9227 uint64_t InstOffset =
Ops[5]->getAsZExtVal();
9228 for (
unsigned i = 0; i < NumLoads; ++i) {
9234 if (NumElts == 8 || NumElts == 16)
9242 if (!Subtarget->hasArchitectedSGPRs())
9284 auto *MFI = MF.
getInfo<SIMachineFunctionInfo>();
9286 EVT VT =
Op.getValueType();
9288 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
9292 switch (IntrinsicID) {
9293 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9296 return getPreloadedValue(DAG, *MFI, VT,
9299 case Intrinsic::amdgcn_dispatch_ptr:
9300 case Intrinsic::amdgcn_queue_ptr: {
9301 if (!Subtarget->isAmdHsaOrMesa(MF.
getFunction())) {
9303 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
9308 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9311 return getPreloadedValue(DAG, *MFI, VT, RegID);
9313 case Intrinsic::amdgcn_implicitarg_ptr: {
9315 return getImplicitArgPtr(DAG,
DL);
9316 return getPreloadedValue(DAG, *MFI, VT,
9319 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9325 return getPreloadedValue(DAG, *MFI, VT,
9328 case Intrinsic::amdgcn_dispatch_id: {
9331 case Intrinsic::amdgcn_rcp:
9333 case Intrinsic::amdgcn_rsq:
9335 case Intrinsic::amdgcn_rsq_legacy:
9339 case Intrinsic::amdgcn_rcp_legacy:
9343 case Intrinsic::amdgcn_rsq_clamp: {
9354 return DAG.
getNode(ISD::FMAXNUM,
DL, VT, Tmp,
9357 case Intrinsic::r600_read_ngroups_x:
9358 if (Subtarget->isAmdHsaOS())
9361 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9364 case Intrinsic::r600_read_ngroups_y:
9365 if (Subtarget->isAmdHsaOS())
9368 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9371 case Intrinsic::r600_read_ngroups_z:
9372 if (Subtarget->isAmdHsaOS())
9375 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9378 case Intrinsic::r600_read_local_size_x:
9379 if (Subtarget->isAmdHsaOS())
9382 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9384 case Intrinsic::r600_read_local_size_y:
9385 if (Subtarget->isAmdHsaOS())
9388 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9390 case Intrinsic::r600_read_local_size_z:
9391 if (Subtarget->isAmdHsaOS())
9394 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9396 case Intrinsic::amdgcn_workgroup_id_x:
9397 return getPreloadedValue(DAG, *MFI, VT,
9399 case Intrinsic::amdgcn_workgroup_id_y:
9400 return getPreloadedValue(DAG, *MFI, VT,
9402 case Intrinsic::amdgcn_workgroup_id_z:
9403 return getPreloadedValue(DAG, *MFI, VT,
9405 case Intrinsic::amdgcn_wave_id:
9406 return lowerWaveID(DAG,
Op);
9407 case Intrinsic::amdgcn_lds_kernel_id: {
9409 return getLDSKernelId(DAG,
DL);
9410 return getPreloadedValue(DAG, *MFI, VT,
9413 case Intrinsic::amdgcn_workitem_id_x:
9414 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
9415 case Intrinsic::amdgcn_workitem_id_y:
9416 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
9417 case Intrinsic::amdgcn_workitem_id_z:
9418 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
9419 case Intrinsic::amdgcn_wavefrontsize:
9421 SDLoc(
Op), MVT::i32);
9422 case Intrinsic::amdgcn_s_buffer_load: {
9423 unsigned CPol =
Op.getConstantOperandVal(3);
9430 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
9431 Op.getOperand(3), DAG);
9433 case Intrinsic::amdgcn_fdiv_fast:
9434 return lowerFDIV_FAST(
Op, DAG);
9435 case Intrinsic::amdgcn_sin:
9438 case Intrinsic::amdgcn_cos:
9441 case Intrinsic::amdgcn_mul_u24:
9444 case Intrinsic::amdgcn_mul_i24:
9448 case Intrinsic::amdgcn_log_clamp: {
9454 case Intrinsic::amdgcn_fract:
9457 case Intrinsic::amdgcn_class:
9460 case Intrinsic::amdgcn_div_fmas:
9462 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
9464 case Intrinsic::amdgcn_div_fixup:
9466 Op.getOperand(2),
Op.getOperand(3));
9468 case Intrinsic::amdgcn_div_scale: {
9481 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
9484 Denominator, Numerator);
9486 case Intrinsic::amdgcn_icmp: {
9488 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
9489 Op.getConstantOperandVal(2) == 0 &&
9494 case Intrinsic::amdgcn_fcmp: {
9497 case Intrinsic::amdgcn_ballot:
9499 case Intrinsic::amdgcn_fmed3:
9501 Op.getOperand(2),
Op.getOperand(3));
9502 case Intrinsic::amdgcn_fdot2:
9504 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
9505 case Intrinsic::amdgcn_fmul_legacy:
9508 case Intrinsic::amdgcn_sffbh:
9510 case Intrinsic::amdgcn_sbfe:
9512 Op.getOperand(2),
Op.getOperand(3));
9513 case Intrinsic::amdgcn_ubfe:
9515 Op.getOperand(2),
Op.getOperand(3));
9516 case Intrinsic::amdgcn_cvt_pkrtz:
9517 case Intrinsic::amdgcn_cvt_pknorm_i16:
9518 case Intrinsic::amdgcn_cvt_pknorm_u16:
9519 case Intrinsic::amdgcn_cvt_pk_i16:
9520 case Intrinsic::amdgcn_cvt_pk_u16: {
9522 EVT VT =
Op.getValueType();
9525 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
9527 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
9529 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
9531 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
9537 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
9540 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
9541 return DAG.
getNode(ISD::BITCAST,
DL, VT, Node);
9543 case Intrinsic::amdgcn_fmad_ftz:
9545 Op.getOperand(2),
Op.getOperand(3));
9547 case Intrinsic::amdgcn_if_break:
9549 Op->getOperand(1),
Op->getOperand(2)),
9552 case Intrinsic::amdgcn_groupstaticsize: {
9558 const GlobalValue *GV =
9564 case Intrinsic::amdgcn_is_shared:
9565 case Intrinsic::amdgcn_is_private: {
9568 DAG.
getNode(ISD::BITCAST,
DL, MVT::v2i32,
Op.getOperand(1));
9572 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
9576 Subtarget->hasGloballyAddressableScratch()) {
9579 AMDGPU::S_MOV_B32,
DL, MVT::i32,
9580 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
9589 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
9592 case Intrinsic::amdgcn_perm:
9594 Op.getOperand(2),
Op.getOperand(3));
9595 case Intrinsic::amdgcn_reloc_constant: {
9605 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
9606 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
9607 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
9608 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
9609 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
9610 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
9611 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
9612 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
9613 if (
Op.getOperand(4).getValueType() == MVT::i32)
9619 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
9620 Op.getOperand(3), IndexKeyi32);
9622 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
9623 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
9624 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
9625 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
9626 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
9627 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
9628 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
9629 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
9630 if (
Op.getOperand(4).getValueType() == MVT::i64)
9636 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9637 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
9640 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
9641 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
9642 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
9643 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
9644 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
9645 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
9646 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
9649 if (
Op.getOperand(6).getValueType() == IndexKeyTy)
9655 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9656 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
9657 IndexKey, Op.getOperand(7),
9660 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
9661 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
9662 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
9663 if (
Op.getOperand(6).getValueType() == MVT::i32)
9669 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9670 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
9671 IndexKeyi32, Op.getOperand(7)});
9673 case Intrinsic::amdgcn_addrspacecast_nonnull:
9674 return lowerADDRSPACECAST(
Op, DAG);
9675 case Intrinsic::amdgcn_readlane:
9676 case Intrinsic::amdgcn_readfirstlane:
9677 case Intrinsic::amdgcn_writelane:
9678 case Intrinsic::amdgcn_permlane16:
9679 case Intrinsic::amdgcn_permlanex16:
9680 case Intrinsic::amdgcn_permlane64:
9681 case Intrinsic::amdgcn_set_inactive:
9682 case Intrinsic::amdgcn_set_inactive_chain_arg:
9683 case Intrinsic::amdgcn_mov_dpp8:
9684 case Intrinsic::amdgcn_update_dpp:
9686 case Intrinsic::amdgcn_dead: {
9688 for (
const EVT ValTy :
Op.getNode()->values())
9693 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9695 return lowerImage(
Op, ImageDimIntr, DAG,
false);
9706 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
9712 unsigned NewOpcode)
const {
9716 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9717 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9735 M->getMemOperand());
9740 unsigned NewOpcode)
const {
9744 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9745 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
9763 M->getMemOperand());
9768 unsigned IntrID =
Op.getConstantOperandVal(1);
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    unsigned IndexOperand = M->getConstantOperandVal(7);
    unsigned WaveRelease = M->getConstantOperandVal(8);
    unsigned WaveDone = M->getConstantOperandVal(9);

    unsigned OrderedCountIndex = IndexOperand & 0x3f;
    IndexOperand &= ~0x3f;
    unsigned CountDw = 0;
      CountDw = (IndexOperand >> 24) & 0xf;
      IndexOperand &= ~(0xf << 24);
      if (CountDw < 1 || CountDw > 4) {
            Fn, "ds_ordered_count: dword count must be between 1 and 4",
          Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
    if (WaveDone && !WaveRelease) {
          Fn, "ds_ordered_count: wave_done requires wave_release",
    unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
    unsigned ShaderType =
    unsigned Offset0 = OrderedCountIndex << 2;
    unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
      Offset1 |= (CountDw - 1) << 6;
      Offset1 |= ShaderType << 2;
    unsigned Offset = Offset0 | (Offset1 << 8);
                                   M->getVTList(), Ops, M->getMemoryVT(),
                                   M->getMemOperand());
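  // Sketch: the ds_ordered_count offset immediate assembled just above, as one
  // helper. Field positions mirror the shifts in the code; the gating of the
  // ShaderType and CountDw fields by target generation is dropped here, so
  // treat the unconditional packing as an illustrative assumption.
  #include <cstdint>

  static uint32_t encodeDSOrderedOffset(unsigned CountIndex, bool WaveRelease,
                                        bool WaveDone, unsigned Instruction,
                                        unsigned ShaderType, unsigned CountDw) {
    unsigned Offset0 = CountIndex << 2;                 // counter byte offset
    unsigned Offset1 = (WaveRelease ? 1u : 0u) |        // bit 0
                       ((WaveDone ? 1u : 0u) << 1) |    // bit 1
                       (ShaderType << 2) |              // bits 3:2
                       (Instruction << 4) |             // bit 4: add=0, swap=1
                       ((CountDw - 1) << 6);            // bits 7:6, CountDw in 1..4
    return Offset0 | (Offset1 << 8);
  }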
9835 case Intrinsic::amdgcn_raw_buffer_load:
9836 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9837 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9838 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9839 case Intrinsic::amdgcn_raw_buffer_load_format:
9840 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9841 const bool IsFormat =
9842 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9843 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9845 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9846 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
9860 return lowerIntrinsicLoad(M, IsFormat, DAG,
Ops);
9862 case Intrinsic::amdgcn_struct_buffer_load:
9863 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9864 case Intrinsic::amdgcn_struct_buffer_load_format:
9865 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9866 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9867 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9868 const bool IsFormat =
9869 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9870 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9872 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9873 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9888 case Intrinsic::amdgcn_raw_tbuffer_load:
9889 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9891 EVT LoadVT =
Op.getValueType();
9892 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9893 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
9912 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
9915 case Intrinsic::amdgcn_struct_tbuffer_load:
9916 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9918 EVT LoadVT =
Op.getValueType();
9919 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9920 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9939 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
9942 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9943 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9945 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9946 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9947 return lowerStructBufferAtomicIntrin(
Op, DAG,
9949 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9950 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9952 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9953 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9954 return lowerStructBufferAtomicIntrin(
Op, DAG,
9956 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9957 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9959 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9960 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9961 return lowerStructBufferAtomicIntrin(
Op, DAG,
9963 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9964 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9966 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9967 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9969 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9970 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9972 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9973 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9975 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9976 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9978 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9979 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9981 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9982 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9984 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9985 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9987 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9988 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9990 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9991 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9993 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9994 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9996 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9997 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9999 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10000 return lowerRawBufferAtomicIntrin(
Op, DAG,
10002 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10003 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10004 return lowerStructBufferAtomicIntrin(
Op, DAG,
10006 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10007 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10009 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10010 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10012 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10013 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10014 return lowerStructBufferAtomicIntrin(
Op, DAG,
10016 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10017 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10018 return lowerStructBufferAtomicIntrin(
Op, DAG,
10020 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10021 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10022 return lowerStructBufferAtomicIntrin(
Op, DAG,
10024 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10025 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10026 return lowerStructBufferAtomicIntrin(
Op, DAG,
10028 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10029 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10031 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10032 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10034 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10035 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10037 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10038 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10040 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10041 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10043 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10044 return lowerStructBufferAtomicIntrin(
Op, DAG,
10047 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10048 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10049 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
10050 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10064 EVT VT =
Op.getValueType();
10068 Op->getVTList(),
Ops, VT,
10069 M->getMemOperand());
10071 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10072 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10073 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
10074 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(6), DAG);
10088 EVT VT =
Op.getValueType();
10092 Op->getVTList(),
Ops, VT,
10093 M->getMemOperand());
10095 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10096 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10098 SDValue NodePtr =
M->getOperand(2);
10099 SDValue RayExtent =
M->getOperand(3);
10100 SDValue InstanceMask =
M->getOperand(4);
10101 SDValue RayOrigin =
M->getOperand(5);
10102 SDValue RayDir =
M->getOperand(6);
10104 SDValue TDescr =
M->getOperand(8);
10109 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10114 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10115 const unsigned NumVDataDwords = 10;
10116 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10118 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10119 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10120 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10124 Ops.push_back(NodePtr);
10127 {DAG.getBitcast(MVT::i32, RayExtent),
10128 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10129 Ops.push_back(RayOrigin);
10130 Ops.push_back(RayDir);
10131 Ops.push_back(Offsets);
10132 Ops.push_back(TDescr);
10133 Ops.push_back(
M->getChain());
10136 MachineMemOperand *MemRef =
M->getMemOperand();
10140 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10142 SDValue NodePtr =
M->getOperand(2);
10143 SDValue RayExtent =
M->getOperand(3);
10144 SDValue RayOrigin =
M->getOperand(4);
10145 SDValue RayDir =
M->getOperand(5);
10146 SDValue RayInvDir =
M->getOperand(6);
10147 SDValue TDescr =
M->getOperand(7);
10154 if (!Subtarget->hasGFX10_AEncoding()) {
10164 const unsigned NumVDataDwords = 4;
10165 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10166 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10167 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10170 const unsigned BaseOpcodes[2][2] = {
10171 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10172 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10173 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10177 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10178 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10179 : AMDGPU::MIMGEncGfx10NSA,
10180 NumVDataDwords, NumVAddrDwords);
10184 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10185 : AMDGPU::MIMGEncGfx10Default,
10186 NumVDataDwords, NumVAddrDwords);
10192 auto packLanes = [&DAG, &
Ops, &
DL](
SDValue Op,
bool IsAligned) {
10195 if (Lanes[0].getValueSizeInBits() == 32) {
10196 for (
unsigned I = 0;
I < 3; ++
I)
10203 Ops.push_back(Lanes[2]);
10215 if (UseNSA && IsGFX11Plus) {
10216 Ops.push_back(NodePtr);
10218 Ops.push_back(RayOrigin);
10223 for (
unsigned I = 0;
I < 3; ++
I) {
10226 {DirLanes[I], InvDirLanes[I]})));
10230 Ops.push_back(RayDir);
10231 Ops.push_back(RayInvDir);
10238 Ops.push_back(NodePtr);
10241 packLanes(RayOrigin,
true);
10242 packLanes(RayDir,
true);
10243 packLanes(RayInvDir,
false);
10248 if (NumVAddrDwords > 12) {
10250 Ops.append(16 -
Ops.size(), Undef);
10256 Ops.push_back(MergedOps);
10259 Ops.push_back(TDescr);
10261 Ops.push_back(
M->getChain());
10264 MachineMemOperand *MemRef =
M->getMemOperand();
10268 case Intrinsic::amdgcn_global_atomic_fmin_num:
10269 case Intrinsic::amdgcn_global_atomic_fmax_num:
10270 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10271 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10278 unsigned Opcode = 0;
10280 case Intrinsic::amdgcn_global_atomic_fmin_num:
10281 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10282 Opcode = ISD::ATOMIC_LOAD_FMIN;
10285 case Intrinsic::amdgcn_global_atomic_fmax_num:
10286 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10287 Opcode = ISD::ATOMIC_LOAD_FMAX;
10293 return DAG.
getAtomic(Opcode, SDLoc(
Op),
M->getMemoryVT(),
M->getVTList(),
10294 Ops,
M->getMemOperand());
10296 case Intrinsic::amdgcn_s_get_barrier_state:
10297 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10304 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10305 BarID = (BarID >> 4) & 0x3F;
10306 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10309 Ops.push_back(Chain);
10311 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10312 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10320 Ops.push_back(
copyToM0(DAG, Chain,
DL, M0Val).getValue(0));
10328 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10329 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10330 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10334 EVT VT =
Op->getValueType(0);
10340 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10342 return lowerImage(
Op, ImageDimIntr, DAG,
true);
10350SDValue SITargetLowering::getMemIntrinsicNode(
unsigned Opcode,
const SDLoc &
DL,
10357 EVT VT = VTList.
VTs[0];
10360 bool IsTFE = VTList.
NumVTs == 3;
10363 unsigned NumOpDWords = NumValueDWords + 1;
10365 SDVTList OpDWordsVTList = DAG.
getVTList(OpDWordsVT, VTList.
VTs[2]);
10366 MachineMemOperand *OpDWordsMMO =
10368 SDValue Op = getMemIntrinsicNode(Opcode,
DL, OpDWordsVTList,
Ops,
10369 OpDWordsVT, OpDWordsMMO, DAG);
10374 NumValueDWords == 1
10383 if (!Subtarget->hasDwordx3LoadStores() &&
10384 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10388 SDVTList WidenedVTList = DAG.
getVTList(WidenedVT, VTList.
VTs[1]);
10390 WidenedMemVT, WidenedMMO);
10400 bool ImageStore)
const {
10410 if (Subtarget->hasUnpackedD16VMem()) {
10424 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10435 for (
unsigned I = 0;
I < Elts.
size() / 2;
I += 1) {
10441 if ((NumElements % 2) == 1) {
10443 unsigned I = Elts.
size() / 2;
10459 if (NumElements == 3) {
10469 return DAG.
getNode(ISD::BITCAST,
DL, WidenedStoreVT, ZExt);
10480 unsigned IntrinsicID =
Op.getConstantOperandVal(1);
10483 switch (IntrinsicID) {
10484 case Intrinsic::amdgcn_exp_compr: {
10485 if (!Subtarget->hasCompressedExport()) {
10488 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
10500 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32, Src0),
10501 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32, Src1),
10510 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10514 case Intrinsic::amdgcn_struct_tbuffer_store:
10515 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10517 bool IsD16 = (VData.
getValueType().getScalarType() == MVT::f16);
10519 VData = handleD16VData(VData, DAG);
10520 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10521 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10539 M->getMemoryVT(),
M->getMemOperand());
10542 case Intrinsic::amdgcn_raw_tbuffer_store:
10543 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
10545 bool IsD16 = (VData.
getValueType().getScalarType() == MVT::f16);
10547 VData = handleD16VData(VData, DAG);
10548 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10549 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10567 M->getMemoryVT(),
M->getMemOperand());
10570 case Intrinsic::amdgcn_raw_buffer_store:
10571 case Intrinsic::amdgcn_raw_ptr_buffer_store:
10572 case Intrinsic::amdgcn_raw_buffer_store_format:
10573 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
10574 const bool IsFormat =
10575 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
10576 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
10583 VData = handleD16VData(VData, DAG);
10593 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10594 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10614 return handleByteShortBufferStores(DAG, VDataVT,
DL,
Ops, M);
10617 M->getMemoryVT(),
M->getMemOperand());
10620 case Intrinsic::amdgcn_struct_buffer_store:
10621 case Intrinsic::amdgcn_struct_ptr_buffer_store:
10622 case Intrinsic::amdgcn_struct_buffer_store_format:
10623 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
10624 const bool IsFormat =
10625 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
10626 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
10634 VData = handleD16VData(VData, DAG);
10644 auto Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10645 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10664 EVT VDataType = VData.getValueType().getScalarType();
10666 return handleByteShortBufferStores(DAG, VDataType,
DL,
Ops, M);
10669 M->getMemoryVT(),
M->getMemOperand());
10671 case Intrinsic::amdgcn_raw_buffer_load_lds:
10672 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
10673 case Intrinsic::amdgcn_struct_buffer_load_lds:
10674 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
10675 if (!Subtarget->hasVMemToLDSLoad())
10679 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
10680 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
10681 unsigned OpOffset = HasVIndex ? 1 : 0;
10682 SDValue VOffset =
Op.getOperand(5 + OpOffset);
10684 unsigned Size =
Op->getConstantOperandVal(4);
10690 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
10691 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
10692 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
10693 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
10696 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
10697 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
10698 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
10699 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
10702 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
10703 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
10704 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
10705 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
10708 if (!Subtarget->hasLDSLoadB96_B128())
10710 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
10711 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
10712 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
10713 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
10716 if (!Subtarget->hasLDSLoadB96_B128())
10718 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
10719 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
10720 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
10721 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
10729 if (HasVIndex && HasVOffset)
10733 else if (HasVIndex)
10734 Ops.push_back(
Op.getOperand(5));
10735 else if (HasVOffset)
10736 Ops.push_back(VOffset);
10738 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10739 Ops.push_back(Rsrc);
10740 Ops.push_back(
Op.getOperand(6 + OpOffset));
10741 Ops.push_back(
Op.getOperand(7 + OpOffset));
10743 unsigned Aux =
Op.getConstantOperandVal(8 + OpOffset);
10756 MachineMemOperand *LoadMMO =
M->getMemOperand();
10761 MachinePointerInfo StorePtrI = LoadPtrI;
10785 case Intrinsic::amdgcn_load_to_lds:
10786 case Intrinsic::amdgcn_global_load_lds: {
10787 if (!Subtarget->hasVMemToLDSLoad())
10791 unsigned Size =
Op->getConstantOperandVal(4);
10796 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
10799 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
10802 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
10805 if (!Subtarget->hasLDSLoadB96_B128())
10807 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
10810 if (!Subtarget->hasLDSLoadB96_B128())
10812 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10828 if (
LHS->isDivergent())
10832 RHS.getOperand(0).getValueType() == MVT::i32) {
10835 VOffset =
RHS.getOperand(0);
10839 Ops.push_back(Addr);
10847 Ops.push_back(VOffset);
10850 Ops.push_back(
Op.getOperand(5));
10851 Ops.push_back(
Op.getOperand(6));
10856 MachineMemOperand *LoadMMO =
M->getMemOperand();
10858 LoadPtrI.
Offset =
Op->getConstantOperandVal(5);
10859 MachinePointerInfo StorePtrI = LoadPtrI;
10878 case Intrinsic::amdgcn_end_cf:
10880 Op->getOperand(2), Chain),
10882 case Intrinsic::amdgcn_s_barrier_init:
10883 case Intrinsic::amdgcn_s_barrier_signal_var: {
10890 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10891 ? AMDGPU::S_BARRIER_INIT_M0
10892 : AMDGPU::S_BARRIER_SIGNAL_M0;
10907 constexpr unsigned ShAmt = 16;
10914 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10919 case Intrinsic::amdgcn_s_barrier_join: {
10928 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10931 unsigned BarID = (BarVal >> 4) & 0x3F;
10934 Ops.push_back(Chain);
10936 Opc = AMDGPU::S_BARRIER_JOIN_M0;
10946 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10952 case Intrinsic::amdgcn_s_prefetch_data: {
10955 return Op.getOperand(0);
10958 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10960 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
10967 Op->getVTList(), Ops, M->getMemoryVT(),
10968 M->getMemOperand());
10970 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
10971 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
10972 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
10981 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10983 return lowerImage(Op, ImageDimIntr, DAG, true);
11008std::pair<SDValue, SDValue>
11038 unsigned Overflow = ImmOffset & ~MaxImm;
11039 ImmOffset -= Overflow;
11040 if ((int32_t)Overflow < 0) {
11041 Overflow += ImmOffset;
11046 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
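// Illustrative sketch (assumption-laden, not the in-tree helper): the lines
// above keep only the part of ImmOffset that fits the hardware immediate
// field and move the remainder ("Overflow") into a register operand.  A
// standalone model of that split; MaxImm is a placeholder for whatever the
// subtarget's immediate field allows, assumed here to be a contiguous
// low-bit mask such as 0xFFF.
static void splitImmOffsetSketch(uint32_t CombinedOffset, uint32_t MaxImm,
                                 uint32_t &ImmOffset, uint32_t &RegOffset) {
  ImmOffset = CombinedOffset & MaxImm;  // low bits fit the immediate field
  RegOffset = CombinedOffset & ~MaxImm; // remainder goes through a register
  // Invariant: RegOffset + ImmOffset == CombinedOffset.
}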
11065void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11067 Align Alignment) const {
11069 SDLoc DL(CombinedOffset);
11071 uint32_t Imm = C->getZExtValue();
11072 uint32_t SOffset, ImmOffset;
11073 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11083 uint32_t SOffset, ImmOffset;
11086 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11094 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11103SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11106 return MaybePointer;
11120 SDValue NumRecords = Op->getOperand(3);
11123 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11126 std::optional<uint32_t> ConstStride = std::nullopt;
11128 ConstStride = ConstNode->getZExtValue();
11131 if (!ConstStride || *ConstStride != 0) {
11134 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
11145 NewHighHalf, NumRecords, Flags);
11146 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11155 bool IsTFE) const {
11164 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11179 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11183 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11193 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11197 Ops[1] = BufferStoreExt;
11202 M->getMemOperand());
11227 DAGCombinerInfo &DCI) const {
11228 SelectionDAG &DAG = DCI.DAG;
11243 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11250 "unexpected vector extload");
11263 "unexpected fp extload");
11281 DCI.AddToWorklist(Cvt.getNode());
11286 DCI.AddToWorklist(Cvt.getNode());
11289 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11297 if (Info.isEntryFunction())
11298 return Info.getUserSGPRInfo().hasFlatScratchInit();
11306 EVT MemVT = Load->getMemoryVT();
11307 MachineMemOperand *MMO = Load->getMemOperand();
11319 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11347 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
11348 "Custom lowering for non-i32 vectors hasn't been implemented.");
11351 unsigned AS = Load->getAddressSpace();
11358 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11362 !Subtarget->hasMultiDwordFlatScratchAddressing())
11372 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
11375 Alignment >= Align(4) && NumElements < 32) {
11377 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11389 if (NumElements > 4)
11392 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11402 switch (Subtarget->getMaxPrivateElementSize()) {
11408 if (NumElements > 2)
11413 if (NumElements > 4)
11416 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11425 auto Flags = Load->getMemOperand()->getFlags();
11427 Load->getAlign(), Flags, &Fast) &&
11436 MemVT, *Load->getMemOperand())) {
11445 EVT VT = Op.getValueType();
11472 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
11482 EVT VT = Op.getValueType();
11483 const SDNodeFlags Flags = Op->getFlags();
11485 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
11491 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11494 if (CLHS->isExactlyValue(1.0)) {
11511 if (CLHS->isExactlyValue(-1.0)) {
11520 if (!AllowInaccurateRcp &&
11521 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
11535 EVT VT = Op.getValueType();
11536 const SDNodeFlags Flags = Op->getFlags();
11538 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
11539 if (!AllowInaccurateDiv)
11560 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
11574 return DAG.getNode(Opcode, SL, VTList,
11583 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
11597 return DAG.getNode(Opcode, SL, VTList,
11603 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11604 return FastLowered;
11607 EVT VT = Op.getValueType();
11614 if (VT == MVT::bf16) {
11637 unsigned FMADOpCode =
11639 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
11644 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
11646 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
11647 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
11653 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
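// Illustrative sketch (not the DAG lowering itself): the sequence above
// divides two half-precision values by promoting to f32, taking a hardware
// reciprocal estimate, and running FMA-based residual corrections:
//   q0 = a * rcp(b)
//   e0 = fma(-b, q0, a)      // residual a - b*q0
//   q1 = fma(e0, rcp(b), q0) // refined quotient
//   e1 = fma(-b, q1, a)      // residual used for the final rounding fixup
// A plain-float model of those refinement steps; assumes <cmath> for
// std::fma, and the last correction stands in for the final fixup the real
// lowering performs on the bit pattern.
static float refineQuotientSketch(float A, float B, float RcpB) {
  float Quot = A * RcpB;
  float Err = std::fma(-B, Quot, A);
  Quot = std::fma(Err, RcpB, Quot);
  Err = std::fma(-B, Quot, A);
  return std::fma(Err, RcpB, Quot);
}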
11663 SDNodeFlags Flags = Op->getFlags();
11670 const APFloat K0Val(0x1p+96f);
11673 const APFloat K1Val(0x1p-32f);
11700 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
11701 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
11702 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
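// Illustrative sketch (layout inferred from the two lines above, not a spec
// quote): the mode value being assembled packs the single-precision denormal
// setting in the low two bits and the double/half setting in the next two,
// so the combined field is SP | (DP << 2).
static uint32_t packDenormModeSketch(uint32_t SPDenormBits,
                                     uint32_t DPDenormBits) {
  return (SPDenormBits & 0x3) | ((DPDenormBits & 0x3) << 2);
}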
11707 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11708 return FastLowered;
11714 SDNodeFlags Flags = Op->getFlags();
11715 Flags.setNoFPExcept(true);
11723 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
11734 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
11736 using namespace AMDGPU::Hwreg;
11737 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
11741 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
11742 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
11745 const bool HasDynamicDenormals =
11751 if (!PreservesDenormals) {
11756 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
11759 if (HasDynamicDenormals) {
11763 SavedDenormMode = SDValue(GetReg, 0);
11769 SDNode *EnableDenorm;
11770 if (Subtarget->hasDenormModeInst()) {
11771 const SDValue EnableDenormValue =
11778 const SDValue EnableDenormValue =
11780 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
11781 {EnableDenormValue, BitField, Glue});
11791 ApproxRcp, One, NegDivScale0, Flags);
11794 ApproxRcp, Fma0, Flags);
11800 NumeratorScaled, Mul, Flags);
11806 NumeratorScaled, Fma3, Flags);
11808 if (!PreservesDenormals) {
11809 SDNode *DisableDenorm;
11810 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
11814 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
11820 assert(HasDynamicDenormals == (bool)SavedDenormMode);
11821 const SDValue DisableDenormValue =
11822 HasDynamicDenormals
11827 AMDGPU::S_SETREG_B32, SL, MVT::Other,
11838 {Fma4, Fma1, Fma3, Scale}, Flags);
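// Illustrative sketch (plain-float restatement, not the DAG nodes above): the
// FMA chain feeding V_DIV_FMAS first refines a reciprocal estimate of the
// scaled denominator and then refines the scaled quotient:
//   r0 = rcp(d);  e  = fma(-d, r0, 1.0);  r1 = fma(e, r0, r0)   // refine 1/d
//   q0 = n * r1;  e2 = fma(-d, q0, n);    q1 = fma(e2, r1, q0)  // refine n/d
//   e3 = fma(-d, q1, n)                                         // final residual
// Assumes <cmath> for std::fma; the real lowering also handles the
// denormal-mode switching and the div_scale/div_fixup scaling this omits.
static float scaledDivSketch(float N, float D, float RcpEstimate) {
  float E = std::fma(-D, RcpEstimate, 1.0f);
  float R1 = std::fma(E, RcpEstimate, RcpEstimate);
  float Q0 = N * R1;
  float E2 = std::fma(-D, Q0, N);
  float Q1 = std::fma(E2, R1, Q0);
  float E3 = std::fma(-D, Q1, N);
  return std::fma(E3, R1, Q1); // last correction stands in for V_DIV_FMAS
}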
11844 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
11845 return FastLowered;
11853 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
11857 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
11877 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
11886 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
11887 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
11913 EVT VT = Op.getValueType();
11915 if (VT == MVT::f32)
11916 return LowerFDIV32(Op, DAG);
11918 if (VT == MVT::f64)
11919 return LowerFDIV64(Op, DAG);
11921 if (VT == MVT::f16 || VT == MVT::bf16)
11922 return LowerFDIV16(Op, DAG);
11931 EVT ResultExpVT = Op->getValueType(1);
11932 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11942 if (Subtarget->hasFractBug()) {
11960 EVT VT = Store->getMemoryVT();
11962 if (VT == MVT::i1) {
11966 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
11970 Store->getValue().getValueType().getScalarType() == MVT::i32);
11972 unsigned AS = Store->getAddressSpace();
11980 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11984 !Subtarget->hasMultiDwordFlatScratchAddressing())
11991 if (NumElements > 4)
11994 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11998 VT, *Store->getMemOperand()))
12004 switch (Subtarget->getMaxPrivateElementSize()) {
12008 if (NumElements > 2)
12012 if (NumElements > 4 ||
12013 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12021 auto Flags = Store->getMemOperand()->getFlags();
12040 assert(!Subtarget->has16BitInsts());
12041 SDNodeFlags Flags = Op->getFlags();
12043 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12055 SDNodeFlags Flags = Op->getFlags();
12056 MVT VT = Op.getValueType().getSimpleVT();
12086 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12089 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12098 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12164 SDNodeFlags Flags = Op->getFlags();
12210 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12227 EVT VT = Op.getValueType();
12237 if (Subtarget->hasTrigReducedRange()) {
12244 switch (Op.getOpcode()) {
12271 EVT VT = Op.getValueType();
12279 Op->getVTList(), Ops, VT,
12288SITargetLowering::performUCharToFloatCombine(SDNode *N,
12289 DAGCombinerInfo &DCI) const {
12290 EVT VT = N->getValueType(0);
12292 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12295 SelectionDAG &DAG = DCI.DAG;
12299 EVT SrcVT = Src.getValueType();
12305 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12308 DCI.AddToWorklist(Cvt.getNode());
12311 if (ScalarVT != MVT::f32) {
12323 DAGCombinerInfo &DCI)
const {
12330 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
12334 SelectionDAG &DAG = DCI.DAG;
12353 for (unsigned I = 0; I != NumElts; ++I) {
12377 if (NewElts.size() == 1)
12399 for (unsigned I = 0; I != NumElts; ++I) {
12434SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12436 DAGCombinerInfo &DCI) const {
12454 SelectionDAG &DAG = DCI.DAG;
12467 AM.BaseOffs = Offset.getSExtValue();
12472 EVT VT = N->getValueType(0);
12478 Flags.setNoUnsignedWrap(
12479 N->getFlags().hasNoUnsignedWrap() &&
12489 switch (N->getOpcode()) {
12500 DAGCombinerInfo &DCI) const {
12501 SelectionDAG &DAG = DCI.DAG;
12508 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
12509 N->getMemoryVT(), DCI);
12513 NewOps[PtrIdx] = NewPtr;
12522 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
12523 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
12532SDValue SITargetLowering::splitBinaryBitConstantOp(
12536 uint32_t ValLo = Lo_32(Val);
12537 uint32_t ValHi = Hi_32(Val);
12544 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
12558 if (V.getValueType() != MVT::i1)
12560 switch (V.getOpcode()) {
12577 return V.getResNo() == 1;
12579 unsigned IntrinsicID = V.getConstantOperandVal(0);
12580 switch (IntrinsicID) {
12581 case Intrinsic::amdgcn_is_shared:
12582 case Intrinsic::amdgcn_is_private:
12599 if (!(C & 0x000000ff))
12600 ZeroByteMask |= 0x000000ff;
12601 if (!(C & 0x0000ff00))
12602 ZeroByteMask |= 0x0000ff00;
12603 if (!(C & 0x00ff0000))
12604 ZeroByteMask |= 0x00ff0000;
12605 if (!(C & 0xff000000))
12606 ZeroByteMask |= 0xff000000;
12607 uint32_t NonZeroByteMask = ~ZeroByteMask;
12608 if ((NonZeroByteMask & C) != NonZeroByteMask)
12621 assert(V.getValueSizeInBits() == 32);
12623 if (V.getNumOperands() != 2)
12632 switch (V.getOpcode()) {
12637 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
12642 return (0x03020100 & ~ConstMask) | ConstMask;
12649 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
12655 return uint32_t(0x0c0c0c0c03020100ull >> C);
12662 DAGCombinerInfo &DCI) const {
12663 if (DCI.isBeforeLegalize())
12666 SelectionDAG &DAG = DCI.DAG;
12667 EVT VT = N->getValueType(0);
12672 if (VT == MVT::i64 && CRHS) {
12674 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
12678 if (CRHS && VT == MVT::i32) {
12688 unsigned Shift = CShift->getZExtValue();
12690 unsigned Offset = NB + Shift;
12691 if ((Offset & (Bits - 1)) == 0) {
12715 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
12730 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
12735 if (X != LHS.getOperand(1))
12739 const ConstantFPSDNode *C1 =
12773 (RHS.getOperand(0) == LHS.getOperand(0) &&
12774 LHS.getOperand(0) == LHS.getOperand(1))) {
12776 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
12777 : Mask->getZExtValue() & OrdMask;
12798 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12801 if (LHSMask != ~0u && RHSMask != ~0u) {
12804 if (LHSMask > RHSMask) {
12811 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12812 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12815 if (!(LHSUsedLanes & RHSUsedLanes) &&
12818 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12824 uint32_t Mask = LHSMask & RHSMask;
12825 for (unsigned I = 0; I < 32; I += 8) {
12826 uint32_t ByteSel = 0xff << I;
12827 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
12828 Mask &= (0x0c << I) & 0xffffffff;
12833 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
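// Illustrative sketch (not the compiler code above): V_PERM_B32 builds its
// result one byte at a time from the eight bytes of {src0, src1}; each byte
// of the select operand picks byte 0-3 of src1 or byte 4-7 of src0, and this
// combine uses selector 0x0c for bytes it wants forced to zero.  Selectors
// above 7 other than that one produce special constants on real hardware;
// the model below only covers what the combine relies on.
static uint32_t applyPermSelectSketch(uint32_t Src0, uint32_t Src1,
                                      uint32_t Select) {
  uint64_t Bytes = (uint64_t(Src0) << 32) | Src1; // bytes 0..3 = src1, 4..7 = src0
  uint32_t Result = 0;
  for (unsigned I = 0; I < 4; ++I) {
    uint32_t Sel = (Select >> (8 * I)) & 0xff;
    uint32_t Byte = Sel <= 7 ? uint32_t((Bytes >> (8 * Sel)) & 0xff) : 0;
    Result |= Byte << (8 * I);
  }
  return Result;
}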
12886static const std::optional<ByteProvider<SDValue>>
12888 unsigned Depth = 0) {
12891 return std::nullopt;
12893 if (Op.getValueSizeInBits() < 8)
12894 return std::nullopt;
12896 if (Op.getValueType().isVector())
12899 switch (Op->getOpcode()) {
12911 NarrowVT = VTSign->getVT();
12914 return std::nullopt;
12917 if (SrcIndex >= NarrowByteWidth)
12918 return std::nullopt;
12926 return std::nullopt;
12928 uint64_t BitShift = ShiftOp->getZExtValue();
12930 if (BitShift % 8 != 0)
12931 return std::nullopt;
12933 SrcIndex += BitShift / 8;
12951static const std::optional<ByteProvider<SDValue>>
12953 unsigned StartingIndex = 0) {
12957 return std::nullopt;
12959 unsigned BitWidth = Op.getScalarValueSizeInBits();
12961 return std::nullopt;
12963 return std::nullopt;
12965 bool IsVec = Op.getValueType().isVector();
12966 switch (Op.getOpcode()) {
12969 return std::nullopt;
12974 return std::nullopt;
12978 return std::nullopt;
12981 if (!LHS->isConstantZero() && !RHS->isConstantZero())
12982 return std::nullopt;
12983 if (!LHS || LHS->isConstantZero())
12985 if (!RHS || RHS->isConstantZero())
12987 return std::nullopt;
12992 return std::nullopt;
12996 return std::nullopt;
12998 uint32_t BitMask = BitMaskOp->getZExtValue();
13000 uint32_t IndexMask = 0xFF << (Index * 8);
13002 if ((IndexMask & BitMask) != IndexMask) {
13005 if (IndexMask & BitMask)
13006 return std::nullopt;
13015 return std::nullopt;
13019 if (!ShiftOp || Op.getValueType().isVector())
13020 return std::nullopt;
13022 uint64_t BitsProvided = Op.getValueSizeInBits();
13023 if (BitsProvided % 8 != 0)
13024 return std::nullopt;
13026 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13028 return std::nullopt;
13030 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13031 uint64_t ByteShift = BitShift / 8;
13033 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13034 uint64_t BytesProvided = BitsProvided / 8;
13035 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13036 NewIndex %= BytesProvided;
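// Illustrative sketch (mirrors the arithmetic above, not a DAG helper): for a
// funnel shift the bytes of {op0, op1} form one 2N-bit value; byte `Index` of
// the shifted result comes from byte (Index + shift/8) of that concatenation,
// where the low N/8 bytes belong to op1 and the rest to op0.
struct ByteSourceSketch {
  bool FromOp0;           // true -> high half (operand 0), false -> operand 1
  uint64_t ByteInOperand; // byte index within that operand
};

static ByteSourceSketch remapFshrByteSketch(uint64_t Index, uint64_t BitShift,
                                            uint64_t OperandBits) {
  uint64_t BytesPerOperand = OperandBits / 8;
  uint64_t ConcatBytes = 2 * BytesPerOperand;
  uint64_t NewIndex = (Index + BitShift / 8) % ConcatBytes;
  bool FromOp0 = NewIndex >= BytesPerOperand;
  return {FromOp0, NewIndex % BytesPerOperand};
}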
13043 return std::nullopt;
13047 return std::nullopt;
13049 uint64_t BitShift = ShiftOp->getZExtValue();
13051 return std::nullopt;
13053 auto BitsProvided =
Op.getScalarValueSizeInBits();
13054 if (BitsProvided % 8 != 0)
13055 return std::nullopt;
13057 uint64_t BytesProvided = BitsProvided / 8;
13058 uint64_t ByteShift = BitShift / 8;
13063 return BytesProvided - ByteShift > Index
13071 return std::nullopt;
13075 return std::nullopt;
13077 uint64_t BitShift = ShiftOp->getZExtValue();
13078 if (BitShift % 8 != 0)
13079 return std::nullopt;
13080 uint64_t ByteShift = BitShift / 8;
13086 return Index < ByteShift
13089 Depth + 1, StartingIndex);
13098 return std::nullopt;
13106 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13108 if (NarrowBitWidth % 8 != 0)
13109 return std::nullopt;
13110 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13112 if (Index >= NarrowByteWidth)
13114 ? std::optional<ByteProvider<SDValue>>(
13122 return std::nullopt;
13126 if (NarrowByteWidth >= Index) {
13131 return std::nullopt;
13138 return std::nullopt;
13144 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13145 if (NarrowBitWidth % 8 != 0)
13146 return std::nullopt;
13147 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13152 if (Index >= NarrowByteWidth) {
13154 ? std::optional<ByteProvider<SDValue>>(
13159 if (NarrowByteWidth > Index) {
13163 return std::nullopt;
13168 return std::nullopt;
13171 Depth + 1, StartingIndex);
13177 return std::nullopt;
13178 auto VecIdx = IdxOp->getZExtValue();
13179 auto ScalarSize = Op.getScalarValueSizeInBits();
13180 if (ScalarSize < 32)
13181 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13183 StartingIndex, Index);
13188 return std::nullopt;
13192 return std::nullopt;
13195 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13196 if (IdxMask > 0x07 && IdxMask != 0x0c)
13197 return std::nullopt;
13199 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13200 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13202 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13208 return std::nullopt;
13223 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13230 auto MemVT = L->getMemoryVT();
13233 return L->getMemoryVT().getSizeInBits() == 16;
13243 int Low8 = Mask & 0xff;
13244 int Hi8 = (Mask & 0xff00) >> 8;
13246 assert(Low8 < 8 && Hi8 < 8);
13248 bool IsConsecutive = (Hi8 - Low8 == 1);
13253 bool Is16Aligned = !(Low8 % 2);
13255 return IsConsecutive && Is16Aligned;
13263 int Low16 = PermMask & 0xffff;
13264 int Hi16 = (PermMask & 0xffff0000) >> 16;
13274 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13276 if (!OtherOpIs16Bit)
13284 unsigned DWordOffset) {
13289 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13294 if (Src.getValueType().isVector()) {
13295 auto ScalarTySize = Src.getScalarValueSizeInBits();
13296 auto ScalarTy = Src.getValueType().getScalarType();
13297 if (ScalarTySize == 32) {
13301 if (ScalarTySize > 32) {
13304 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13305 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13312 assert(ScalarTySize < 32);
13313 auto NumElements = TypeSize / ScalarTySize;
13314 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13315 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13316 auto NumElementsIn32 = 32 / ScalarTySize;
13317 auto NumAvailElements = DWordOffset < Trunc32Elements
13319 : NumElements - NormalizedTrunc;
13332 auto ShiftVal = 32 * DWordOffset;
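// Illustrative sketch (scalar restatement of the extraction above): selecting
// the DWordOffset-th 32-bit word of a wider integer is a shift by
// 32 * DWordOffset followed by truncation, which is what the SRL + truncate
// nodes built here express.
static uint32_t dwordAtOffsetSketch(uint64_t Wide, unsigned DWordOffset) {
  return uint32_t(Wide >> (32 * DWordOffset)); // DWordOffset is 0 or 1 here
}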
13340 [[maybe_unused]] EVT VT = N->getValueType(0);
13345 for (int i = 0; i < 4; i++) {
13347 std::optional<ByteProvider<SDValue>> P =
13350 if (!P || P->isConstantZero())
13355 if (PermNodes.size() != 4)
13358 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13359 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13361 for (size_t i = 0; i < PermNodes.size(); i++) {
13362 auto PermOp = PermNodes[i];
13365 int SrcByteAdjust = 4;
13369 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13370 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13372 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13373 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13377 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13378 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13381 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13383 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13386 SDValue Op = *PermNodes[FirstSrc.first].Src;
13388 assert(Op.getValueSizeInBits() == 32);
13392 int Low16 = PermMask & 0xffff;
13393 int Hi16 = (PermMask & 0xffff0000) >> 16;
13395 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13396 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13399 if (WellFormedLow && WellFormedHi)
13403 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13412 assert(Op.getValueType().isByteSized() &&
13430 DAGCombinerInfo &DCI) const {
13431 SelectionDAG &DAG = DCI.DAG;
13435 EVT VT = N->getValueType(0);
13436 if (VT == MVT::i1) {
13441 if (Src != RHS.getOperand(0))
13446 if (!CLHS || !CRHS)
13450 static const uint32_t MaxMask = 0x3ff;
13470 Sel |= LHS.getConstantOperandVal(2);
13479 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13483 auto usesCombinedOperand = [](SDNode *OrUse) {
13485 if (OrUse->getOpcode() != ISD::BITCAST ||
13486 !OrUse->getValueType(0).isVector())
13490 for (auto *VUser : OrUse->users()) {
13491 if (!VUser->getValueType(0).isVector())
13498 if (VUser->getOpcode() == VectorwiseOp)
13504 if (!any_of(N->users(), usesCombinedOperand))
13510 if (LHSMask != ~0u && RHSMask != ~0u) {
13513 if (LHSMask > RHSMask) {
13520 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13521 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13524 if (!(LHSUsedLanes & RHSUsedLanes) &&
13527 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13529 LHSMask &= ~RHSUsedLanes;
13530 RHSMask &= ~LHSUsedLanes;
13532 LHSMask |= LHSUsedLanes & 0x04040404;
13534 uint32_t Sel = LHSMask | RHSMask;
13542 if (LHSMask == ~0u || RHSMask == ~0u) {
13548 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
13563 if (SrcVT == MVT::i32) {
13568 DCI.AddToWorklist(LowOr.getNode());
13569 DCI.AddToWorklist(HiBits.getNode());
13573 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
13580 N->getOperand(0), CRHS))
13588 DAGCombinerInfo &DCI) const {
13589 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
13596 SelectionDAG &DAG = DCI.DAG;
13598 EVT VT = N->getValueType(0);
13599 if (CRHS && VT == MVT::i64) {
13601 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
13615 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
13617 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
13621 LHS->getOperand(0), FNegLHS, FNegRHS);
13622 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
13630 DAGCombinerInfo &DCI) const {
13631 if (!Subtarget->has16BitInsts() ||
13635 EVT VT = N->getValueType(0);
13636 if (VT != MVT::i32)
13640 if (Src.getValueType() != MVT::i16)
13647SITargetLowering::performSignExtendInRegCombine(SDNode *N,
13648 DAGCombinerInfo &DCI) const {
13655 VTSign->getVT() == MVT::i8) ||
13657 VTSign->getVT() == MVT::i16))) {
13658 assert(Subtarget->hasScalarSubwordLoads() &&
13659 "s_buffer_load_{u8, i8} are supported "
13660 "in GFX12 (or newer) architectures.");
13661 EVT VT = Src.getValueType();
13666 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
13673 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
13674 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
13679 VTSign->getVT() == MVT::i8) ||
13681 VTSign->getVT() == MVT::i16)) &&
13690 Src.getOperand(6), Src.getOperand(7)};
13693 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
13697 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
13698 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
13699 return DCI.DAG.getMergeValues(
13700 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
13706 DAGCombinerInfo &DCI) const {
13707 SelectionDAG &DAG = DCI.DAG;
13714 if (N->getOperand(0).isUndef())
13721 DAGCombinerInfo &DCI) const {
13722 EVT VT = N->getValueType(0);
13737 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
13747 unsigned MaxDepth) const {
13748 unsigned Opcode = Op.getOpcode();
13753 const auto &F = CFP->getValueAPF();
13754 if (F.isNaN() && F.isSignaling())
13756 if (!F.isDenormal())
13782 case ISD::FP_EXTEND:
13783 case ISD::FP16_TO_FP:
13784 case ISD::FP_TO_FP16:
13785 case ISD::BF16_TO_FP:
13786 case ISD::FP_TO_BF16:
13819 if (Op.getValueType() == MVT::i32) {
13825 if (RHS->getZExtValue() == 0xffff0000) {
13835 return Op.getValueType().getScalarType() != MVT::f16;
13839 case ISD::FMINNUM_IEEE:
13840 case ISD::FMAXNUM_IEEE:
13841 case ISD::FMINIMUM:
13842 case ISD::FMAXIMUM:
13843 case ISD::FMINIMUMNUM:
13844 case ISD::FMAXIMUMNUM:
13856 if (Subtarget->supportsMinMaxDenormModes() ||
13866 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
13878 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
13905 if (Op.getValueType() == MVT::i16) {
13908 TruncSrc.getOpcode() == ISD::BITCAST &&
13916 unsigned IntrinsicID = Op.getConstantOperandVal(0);
13918 switch (IntrinsicID) {
13919 case Intrinsic::amdgcn_cvt_pkrtz:
13920 case Intrinsic::amdgcn_cubeid:
13921 case Intrinsic::amdgcn_frexp_mant:
13922 case Intrinsic::amdgcn_fdot2:
13923 case Intrinsic::amdgcn_rcp:
13924 case Intrinsic::amdgcn_rsq:
13925 case Intrinsic::amdgcn_rsq_clamp:
13926 case Intrinsic::amdgcn_rcp_legacy:
13927 case Intrinsic::amdgcn_rsq_legacy:
13928 case Intrinsic::amdgcn_trig_preop:
13929 case Intrinsic::amdgcn_tanh:
13930 case Intrinsic::amdgcn_log:
13931 case Intrinsic::amdgcn_exp2:
13932 case Intrinsic::amdgcn_sqrt:
13950 unsigned MaxDepth) const {
13953 unsigned Opcode = MI->getOpcode();
13955 if (Opcode == AMDGPU::G_FCANONICALIZE)
13958 std::optional<FPValueAndVReg> FCR;
13961 if (FCR->Value.isSignaling())
13963 if (!FCR->Value.isDenormal())
13974 case AMDGPU::G_FADD:
13975 case AMDGPU::G_FSUB:
13976 case AMDGPU::G_FMUL:
13977 case AMDGPU::G_FCEIL:
13978 case AMDGPU::G_FFLOOR:
13979 case AMDGPU::G_FRINT:
13980 case AMDGPU::G_FNEARBYINT:
13981 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13982 case AMDGPU::G_INTRINSIC_TRUNC:
13983 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13984 case AMDGPU::G_FMA:
13985 case AMDGPU::G_FMAD:
13986 case AMDGPU::G_FSQRT:
13987 case AMDGPU::G_FDIV:
13988 case AMDGPU::G_FREM:
13989 case AMDGPU::G_FPOW:
13990 case AMDGPU::G_FPEXT:
13991 case AMDGPU::G_FLOG:
13992 case AMDGPU::G_FLOG2:
13993 case AMDGPU::G_FLOG10:
13994 case AMDGPU::G_FPTRUNC:
13995 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13996 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13997 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13998 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13999 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14001 case AMDGPU::G_FNEG:
14002 case AMDGPU::G_FABS:
14003 case AMDGPU::G_FCOPYSIGN:
14005 case AMDGPU::G_FMINNUM:
14006 case AMDGPU::G_FMAXNUM:
14007 case AMDGPU::G_FMINNUM_IEEE:
14008 case AMDGPU::G_FMAXNUM_IEEE:
14009 case AMDGPU::G_FMINIMUM:
14010 case AMDGPU::G_FMAXIMUM:
14011 case AMDGPU::G_FMINIMUMNUM:
14012 case AMDGPU::G_FMAXIMUMNUM: {
14013 if (Subtarget->supportsMinMaxDenormModes() ||
14020 case AMDGPU::G_BUILD_VECTOR:
14025 case AMDGPU::G_INTRINSIC:
14026 case AMDGPU::G_INTRINSIC_CONVERGENT:
14028 case Intrinsic::amdgcn_fmul_legacy:
14029 case Intrinsic::amdgcn_fmad_ftz:
14030 case Intrinsic::amdgcn_sqrt:
14031 case Intrinsic::amdgcn_fmed3:
14032 case Intrinsic::amdgcn_sin:
14033 case Intrinsic::amdgcn_cos:
14034 case Intrinsic::amdgcn_log:
14035 case Intrinsic::amdgcn_exp2:
14036 case Intrinsic::amdgcn_log_clamp:
14037 case Intrinsic::amdgcn_rcp:
14038 case Intrinsic::amdgcn_rcp_legacy:
14039 case Intrinsic::amdgcn_rsq:
14040 case Intrinsic::amdgcn_rsq_clamp:
14041 case Intrinsic::amdgcn_rsq_legacy:
14042 case Intrinsic::amdgcn_div_scale:
14043 case Intrinsic::amdgcn_div_fmas:
14044 case Intrinsic::amdgcn_div_fixup:
14045 case Intrinsic::amdgcn_fract:
14046 case Intrinsic::amdgcn_cvt_pkrtz:
14047 case Intrinsic::amdgcn_cubeid:
14048 case Intrinsic::amdgcn_cubema:
14049 case Intrinsic::amdgcn_cubesc:
14050 case Intrinsic::amdgcn_cubetc:
14051 case Intrinsic::amdgcn_frexp_mant:
14052 case Intrinsic::amdgcn_fdot2:
14053 case Intrinsic::amdgcn_trig_preop:
14054 case Intrinsic::amdgcn_tanh:
14073 if (C.isDenormal()) {
14087 if (C.isSignaling()) {
14110SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14111 DAGCombinerInfo &DCI) const {
14112 SelectionDAG &DAG = DCI.DAG;
14114 EVT VT = N->getValueType(0);
14123 EVT VT = N->getValueType(0);
14124 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14140 EVT EltVT = Lo.getValueType();
14143 for (unsigned I = 0; I != 2; ++I) {
14147 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14148 } else if (Op.isUndef()) {
14182 case ISD::FMAXNUM_IEEE:
14183 case ISD::FMAXIMUMNUM:
14185 case ISD::FMAXIMUM:
14192 case ISD::FMINNUM_IEEE:
14193 case ISD::FMINIMUMNUM:
14195 case ISD::FMINIMUM:
14221 if (!MinK || !MaxK)
14234 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14235 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
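// Illustrative sketch (semantics only, not the combine itself): a min/max
// pair with constant bounds is rewritten to a single med3 because, for
// K0 <= K1, med3(x, K0, K1) == min(max(x, K0), K1), i.e. a clamp of x into
// [K0, K1].  Assumes <algorithm> for std::min/std::max.
static int32_t med3Sketch(int32_t X, int32_t K0, int32_t K1) {
  // Median of three values, written as the clamp it replaces (valid for
  // K0 <= K1).
  return std::min(std::max(X, K0), K1);
}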
14294 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14300 if (Info->getMode().DX10Clamp) {
14309 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14337 case ISD::FMINNUM_IEEE:
14338 case ISD::FMAXNUM_IEEE:
14339 case ISD::FMINIMUMNUM:
14340 case ISD::FMAXIMUMNUM:
14343 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
14345 case ISD::FMINIMUM:
14346 case ISD::FMAXIMUM:
14354 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
14363 DAGCombinerInfo &DCI) const {
14364 SelectionDAG &DAG = DCI.DAG;
14396 if (SDValue Med3 = performIntMed3ImmCombine(
14401 if (SDValue Med3 = performIntMed3ImmCombine(
14407 if (SDValue Med3 = performIntMed3ImmCombine(
14412 if (SDValue Med3 = performIntMed3ImmCombine(
14422 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14423 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14424 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14427 (VT == MVT::f32 || VT == MVT::f64 ||
14428 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14429 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14430 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14431 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14433 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
14440 const SDNodeFlags Flags = N->getFlags();
14441 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
14442 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
14444 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
14445 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
14455 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
14456 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
14465 DAGCombinerInfo &DCI) const {
14466 EVT VT = N->getValueType(0);
14470 SelectionDAG &DAG = DCI.DAG;
14485 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14489 if (Info->getMode().DX10Clamp) {
14509 DAGCombinerInfo &DCI) const {
14513 return DCI.DAG.getUNDEF(N->getValueType(0));
14521 bool IsDivergentIdx,
14526 unsigned VecSize = EltSize * NumElem;
14529 if (VecSize <= 64 && EltSize < 32)
14538 if (IsDivergentIdx)
14542 unsigned NumInsts = NumElem +
14543 ((EltSize + 31) / 32) * NumElem ;
14547 if (Subtarget->useVGPRIndexMode())
14548 return NumInsts <= 16;
14552 if (Subtarget->hasMovrel())
14553 return NumInsts <= 15;
14559 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
14574SITargetLowering::performExtractVectorEltCombine(SDNode *N,
14575 DAGCombinerInfo &DCI) const {
14581 EVT ResVT = N->getValueType(0);
14600 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
14619 case ISD::FMAXNUM_IEEE:
14620 case ISD::FMINNUM_IEEE:
14621 case ISD::FMAXIMUM:
14622 case ISD::FMINIMUM: {
14628 DCI.AddToWorklist(Elt0.getNode());
14629 DCI.AddToWorklist(Elt1.getNode());
14651 if (!DCI.isBeforeLegalize())
14659 VecSize > 32 && VecSize % 32 == 0 && Idx) {
14662 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
14663 unsigned EltIdx = BitIndex / 32;
14664 unsigned LeftoverBitIdx = BitIndex % 32;
14668 DCI.AddToWorklist(Cast.getNode());
14672 DCI.AddToWorklist(Elt.getNode());
14675 DCI.AddToWorklist(Srl.getNode());
14679 DCI.AddToWorklist(Trunc.getNode());
14681 if (VecEltVT == ResVT) {
14682 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
14693SITargetLowering::performInsertVectorEltCombine(SDNode *N,
14694 DAGCombinerInfo &DCI) const {
14705 SelectionDAG &DAG = DCI.DAG;
14724 if (Src.getOpcode() == ISD::FP_EXTEND &&
14725 Src.getOperand(0).getValueType() == MVT::f16) {
14726 return Src.getOperand(0);
14730 APFloat Val = CFP->getValueAPF();
14731 bool LosesInfo = true;
14741 DAGCombinerInfo &DCI) const {
14742 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
14743 "combine only useful on gfx8");
14745 SDValue TruncSrc = N->getOperand(0);
14746 EVT VT = N->getValueType(0);
14747 if (VT != MVT::f16)
14754 SelectionDAG &DAG = DCI.DAG;
14782 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
14785unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
14787 const SDNode *N1) const {
14792 if (((VT == MVT::f32 &&
14794 (VT == MVT::f16 && Subtarget->hasMadF16() &&
14814 EVT VT = N->getValueType(0);
14815 if (VT != MVT::i32 && VT != MVT::i64)
14821 unsigned Opc = N->getOpcode();
14876 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
14895 DAGCombinerInfo &DCI) const {
14898 SelectionDAG &DAG = DCI.DAG;
14899 EVT VT = N->getValueType(0);
14909 if (!N->isDivergent() && Subtarget->hasSMulHi())
14913 if (NumBits <= 32 || NumBits > 64)
14924 if (!Subtarget->hasFullRate64Ops()) {
14925 unsigned NumUsers = 0;
14926 for (SDNode *User : LHS->users()) {
14929 if (!User->isAnyAdd())
14953 bool MulSignedLo = false;
14954 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
14963 if (VT != MVT::i64) {
14986 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
14988 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
14989 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
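// Illustrative sketch (the integer identity behind the rewrite, not the DAG
// code): MAD_U64_U32/MAD_I64_I32 computes lo32(a) * lo32(b) + c as a 64-bit
// value, so a full 64-bit multiply-add is the low-half mad plus the cross
// products added into the accumulator's high word:
//   a*b + c = mad64_32(lo(a), lo(b), c)
//             + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32)   (mod 2^64)
static uint64_t mulAdd64Sketch(uint64_t A, uint64_t B, uint64_t C) {
  uint64_t ALo = A & 0xffffffffu, AHi = A >> 32;
  uint64_t BLo = B & 0xffffffffu, BHi = B >> 32;
  uint64_t Mad = ALo * BLo + C;              // what the mad64_32 node yields
  uint64_t HiCorrection = (ALo * BHi + AHi * BLo) << 32;
  return Mad + HiCorrection;                 // equals A * B + C mod 2^64
}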
14991 if (!MulLHSUnsigned32) {
14998 if (!MulRHSUnsigned32) {
15009 if (VT != MVT::i64)
15015SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15016 DAGCombinerInfo &DCI) const {
15026 SelectionDAG &DAG = DCI.DAG;
15041 unsigned Opcode = N->getOpcode();
15042 if (Opcode == ISD::PTRADD)
15045 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15056static std::optional<ByteProvider<SDValue>>
15059 if (!Byte0 || Byte0->isConstantZero()) {
15060 return std::nullopt;
15063 if (Byte1 && !Byte1->isConstantZero()) {
15064 return std::nullopt;
15070 unsigned FirstCs = First & 0x0c0c0c0c;
15071 unsigned SecondCs = Second & 0x0c0c0c0c;
15072 unsigned FirstNoCs = First & ~0x0c0c0c0c;
15073 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15075 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15076 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15077 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15078 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15080 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
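// Illustrative sketch (restating the return expression above): selector byte
// 0x0c means "unused, produce zero".  Splitting each mask into its 0x0c bits
// and its remaining bits and recombining them as below reproduces every
// selector that either side defines, and yields 0x0c only where both sides
// left the byte unused; the asserts above guarantee no byte is defined twice.
static unsigned addPermMasksSketch(unsigned First, unsigned Second) {
  unsigned FirstCs = First & 0x0c0c0c0c;    // 0x0c-bits of each byte of First
  unsigned SecondCs = Second & 0x0c0c0c0c;  // 0x0c-bits of each byte of Second
  unsigned FirstNoCs = First & ~0x0c0c0c0c;
  unsigned SecondNoCs = Second & ~0x0c0c0c0c;
  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
}
// Example: addPermMasksSketch(0x0c0c0100, 0x07060c0c) == 0x07060100.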
15104 for (int BPI = 0; BPI < 2; BPI++) {
15107 BPP = {Src1, Src0};
15109 unsigned ZeroMask = 0x0c0c0c0c;
15110 unsigned FMask = 0xFF << (8 * (3 - Step));
15112 unsigned FirstMask =
15113 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15114 unsigned SecondMask =
15115 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15119 int FirstGroup = -1;
15120 for (int I = 0; I < 2; I++) {
15122 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15123 return IterElt.SrcOp == *BPP.first.Src &&
15124 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15128 if (Match != Srcs.end()) {
15129 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15134 if (FirstGroup != -1) {
15136 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15137 return IterElt.SrcOp == *BPP.second.Src &&
15138 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15141 if (Match != Srcs.end()) {
15142 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15144 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15152 unsigned ZeroMask = 0x0c0c0c0c;
15153 unsigned FMask = 0xFF << (8 * (3 - Step));
15157 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15161 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15170 if (Srcs.size() == 1) {
15171 auto *Elt = Srcs.begin();
15175 if (Elt->PermMask == 0x3020100)
15182 auto *FirstElt = Srcs.begin();
15183 auto *SecondElt = std::next(FirstElt);
15190 auto FirstMask = FirstElt->PermMask;
15191 auto SecondMask = SecondElt->PermMask;
15193 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15194 unsigned FirstPlusFour = FirstMask | 0x04040404;
15197 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15209 FirstElt = std::next(SecondElt);
15210 if (FirstElt == Srcs.end())
15213 SecondElt = std::next(FirstElt);
15216 if (SecondElt == Srcs.end()) {
15222 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15228 return Perms.size() == 2
15234 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15235 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15236 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15237 EntryMask += ZeroMask;
15242 auto Opcode = Op.getOpcode();
15248static std::optional<bool>
15259 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15262 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15264 assert(!(S0IsUnsigned && S0IsSigned));
15265 assert(!(S1IsUnsigned && S1IsSigned));
15273 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15279 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15280 return std::nullopt;
15292 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15293 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15298 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15304 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15305 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15306 return std::nullopt;
15312 DAGCombinerInfo &DCI) const {
15313 SelectionDAG &DAG = DCI.DAG;
15314 EVT VT = N->getValueType(0);
15320 if (Subtarget->hasMad64_32()) {
15321 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15326 if (SDValue V = reassociateScalarOps(N, DAG)) {
15330 if (VT == MVT::i64) {
15331 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15336 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
15338 std::optional<bool> IsSigned;
15344 int ChainLength = 0;
15345 for (int I = 0; I < 4; I++) {
15349 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15352 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15357 TempNode->getOperand(MulIdx), *Src0, *Src1,
15358 TempNode->getOperand(MulIdx)->getOperand(0),
15359 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15363 IsSigned = *IterIsSigned;
15364 if (*IterIsSigned != *IsSigned)
15367 auto AddIdx = 1 - MulIdx;
15370 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
15371 Src2s.push_back(TempNode->getOperand(AddIdx));
15381 TempNode->getOperand(AddIdx), *Src0, *Src1,
15382 TempNode->getOperand(AddIdx)->getOperand(0),
15383 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15387 if (*IterIsSigned != *IsSigned)
15391 ChainLength = I + 2;
15395 TempNode = TempNode->getOperand(AddIdx);
15397 ChainLength = I + 1;
15398 if (TempNode->getNumOperands() < 2)
15400 LHS = TempNode->getOperand(0);
15401 RHS = TempNode->getOperand(1);
15404 if (ChainLength < 2)
15410 if (ChainLength < 4) {
15420 bool UseOriginalSrc = false;
15421 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
15422 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
15423 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
15424 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
15425 SmallVector<unsigned, 4> SrcBytes;
15426 auto Src0Mask = Src0s.begin()->PermMask;
15427 SrcBytes.push_back(Src0Mask & 0xFF000000);
15428 bool UniqueEntries = true;
15429 for (auto I = 1; I < 4; I++) {
15430 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
15433 UniqueEntries = false;
15439 if (UniqueEntries) {
15440 UseOriginalSrc = true;
15442 auto *FirstElt = Src0s.begin();
15446 auto *SecondElt = Src1s.begin();
15448 SecondElt->DWordOffset);
15457 if (!UseOriginalSrc) {
15464 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
15467 : Intrinsic::amdgcn_udot4,
15477 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
15482 unsigned Opc = LHS.getOpcode();
15494 auto Cond = RHS.getOperand(0);
15499 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
15516 DAGCombinerInfo &DCI) const {
15517 SelectionDAG &DAG = DCI.DAG;
15519 EVT VT = N->getValueType(0);
15532 SDNodeFlags ShlFlags = N1->getFlags();
15536 SDNodeFlags NewShlFlags =
15541 DCI.AddToWorklist(Inner.getNode());
15548 if (Subtarget->hasMad64_32()) {
15549 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15558 if (VT == MVT::i64) {
15559 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15567 if (const GlobalAddressSDNode *GA =
15572 SDNodeFlags Flags =
15575 DCI.AddToWorklist(Inner.getNode());
15603 SDNodeFlags ReassocFlags =
15606 if (ZIsConstant != YIsConstant) {
15610 DCI.AddToWorklist(Inner.getNode());
15618 assert(!YIsConstant && !ZIsConstant);
15620 if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
15629 if (Y->isDivergent())
15632 DCI.AddToWorklist(UniformInner.getNode());
15640 DAGCombinerInfo &DCI) const {
15641 SelectionDAG &DAG = DCI.DAG;
15642 EVT VT = N->getValueType(0);
15644 if (VT == MVT::i64) {
15645 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15649 if (VT != MVT::i32)
15658 unsigned Opc = RHS.getOpcode();
15665 auto Cond = RHS.getOperand(0);
15670 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
15688SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
15689 DAGCombinerInfo &DCI) const {
15691 if (N->getValueType(0) != MVT::i32)
15697 SelectionDAG &DAG = DCI.DAG;
15702 unsigned LHSOpc = LHS.getOpcode();
15703 unsigned Opc = N->getOpcode();
15707 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
15713 DAGCombinerInfo &DCI) const {
15717 SelectionDAG &DAG = DCI.DAG;
15718 EVT VT = N->getValueType(0);
15730 if (A == LHS.getOperand(1)) {
15731 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
15732 if (FusedOp != 0) {
15734 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
15742 if (A == RHS.getOperand(1)) {
15743 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
15744 if (FusedOp != 0) {
15746 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
15755 DAGCombinerInfo &DCI) const {
15759 SelectionDAG &DAG = DCI.DAG;
15761 EVT VT = N->getValueType(0);
15774 if (A == LHS.getOperand(1)) {
15775 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
15776 if (FusedOp != 0) {
15780 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
15789 if (A == RHS.getOperand(1)) {
15790 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
15791 if (FusedOp != 0) {
15793 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
15802 DAGCombinerInfo &DCI) const {
15803 SelectionDAG &DAG = DCI.DAG;
15805 EVT VT = N->getValueType(0);
15806 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
15812 SDNodeFlags Flags = N->getFlags();
15813 SDNodeFlags RHSFlags = RHS->getFlags();
15819 bool IsNegative = false;
15820 if (CLHS->isExactlyValue(1.0) ||
15821 (IsNegative = CLHS->isExactlyValue(-1.0))) {
15824 if (RHS.getOpcode() == ISD::FSQRT) {
15828 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
15837 DAGCombinerInfo &DCI) const {
15838 SelectionDAG &DAG = DCI.DAG;
15839 EVT VT = N->getValueType(0);
15843 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
15844 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
15859 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
15864 const ConstantFPSDNode *FalseNode =
15874 if (ScalarVT == MVT::f32 &&
15880 if (TrueNodeExpVal == INT_MIN)
15883 if (FalseNodeExpVal == INT_MIN)
15896 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
15903 DAGCombinerInfo &DCI) const {
15904 SelectionDAG &DAG = DCI.DAG;
15905 EVT VT = N->getValueType(0);
15908 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
15926 (N->getFlags().hasAllowContract() &&
15927 FMA->getFlags().hasAllowContract())) {
15942 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
15961 if (Vec1 == Vec2 || Vec3 == Vec4)
15967 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
15976 DAGCombinerInfo &DCI) const {
15977 SelectionDAG &DAG = DCI.DAG;
15982 EVT VT = LHS.getValueType();
16011 return LHS.getOperand(0);
16019 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16026 const APInt &CT = LHS.getConstantOperandAPInt(1);
16027 const APInt &CF = LHS.getConstantOperandAPInt(2);
16035 return LHS.getOperand(0);
16039 if (VT != MVT::f32 && VT != MVT::f64 &&
16040 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16048 LHS.getOpcode() == ISD::FABS) {
16055 const unsigned IsInfMask =
16057 const unsigned IsFiniteMask =
16071SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16072 DAGCombinerInfo &DCI) const {
16073 SelectionDAG &DAG = DCI.DAG;
16094 unsigned ShiftOffset = 8 * Offset;
16096 ShiftOffset -= C->getZExtValue();
16098 ShiftOffset += C->getZExtValue();
16100 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16102 MVT::f32, Shifted);
16113 DCI.AddToWorklist(N);
16120 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16126 DAGCombinerInfo &DCI) const {
16131 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16135 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16136 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16139 APFloat One(F.getSemantics(), "1.0");
16141 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16147 DAGCombinerInfo &DCI) const {
16168 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16169 bool isInteger = LHS.getValueType().isInteger();
16172 if (!isFloatingPoint && !isInteger)
16177 if (!isEquality && !isNonEquality)
16194 if (isFloatingPoint) {
16196 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16207 if (!(isEquality && TrueVal == ConstVal) &&
16208 !(isNonEquality && FalseVal == ConstVal))
16215 SelectLHS, SelectRHS);
16220 switch (N->getOpcode()) {
16236 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16246 switch (N->getOpcode()) {
16248 return performAddCombine(N, DCI);
16250 return performPtrAddCombine(N, DCI);
16252 return performSubCombine(N, DCI);
16255 return performAddCarrySubCarryCombine(N, DCI);
16257 return performFAddCombine(N, DCI);
16259 return performFSubCombine(N, DCI);
16261 return performFDivCombine(N, DCI);
16263 return performFMulCombine(N, DCI);
16265 return performSetCCCombine(N, DCI);
16267 if (auto Res = performSelectCombine(N, DCI))
16272 case ISD::FMAXNUM_IEEE:
16273 case ISD::FMINNUM_IEEE:
16274 case ISD::FMAXIMUM:
16275 case ISD::FMINIMUM:
16276 case ISD::FMAXIMUMNUM:
16277 case ISD::FMINIMUMNUM:
16284 return performMinMaxCombine(N, DCI);
16286 return performFMACombine(N, DCI);
16288 return performAndCombine(N, DCI);
16290 return performOrCombine(N, DCI);
16293 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
16294 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16300 return performXorCombine(N, DCI);
16302 return performZeroExtendCombine(N, DCI);
16304 return performSignExtendInRegCombine(N, DCI);
16306 return performClassCombine(N, DCI);
16308 return performFCanonicalizeCombine(N, DCI);
16310 return performRcpCombine(N, DCI);
16325 return performUCharToFloatCombine(N, DCI);
16327 return performFCopySignCombine(N, DCI);
16332 return performCvtF32UByteNCombine(N, DCI);
16334 return performFMed3Combine(N, DCI);
16336 return performCvtPkRTZCombine(N, DCI);
16338 return performClampCombine(N, DCI);
16341 EVT VT = N->getValueType(0);
16344 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16347 EVT EltVT = Src.getValueType();
16348 if (EltVT != MVT::i16)
16349 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
16352 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
16358 return performExtractVectorEltCombine(N, DCI);
16360 return performInsertVectorEltCombine(N, DCI);
16362 return performFPRoundCombine(N, DCI);
16371 return performMemSDNodeCombine(MemNode, DCI);
16402 unsigned Opcode = Node->getMachineOpcode();
16405 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16406 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
16409 SDNode *Users[5] = {nullptr};
16411 unsigned DmaskIdx =
16412 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16413 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
16414 unsigned NewDmask = 0;
16415 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16416 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16417 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
16418 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
16419 unsigned TFCLane = 0;
16420 bool HasChain = Node->getNumValues() > 1;
16422 if (OldDmask == 0) {
16430 TFCLane = OldBitsSet;
16434 for (SDUse &Use : Node->uses()) {
16437 if (Use.getResNo() != 0)
16440 SDNode *User = Use.getUser();
16443 if (!User->isMachineOpcode() ||
16444 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
16456 if (UsesTFC && Lane == TFCLane) {
16461 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
16463 Dmask &= ~(1 << Comp);
16471 NewDmask |= 1 << Comp;
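// Illustrative sketch (not the SelectionDAG walk above): the new dmask keeps
// exactly the image components whose result lanes still have users.  Lane i
// of the old result corresponds to the i-th set bit of the old dmask, so each
// surviving lane maps back to that component and its bit is re-set.  The
// sketch ignores the extra TFE/LWE lane handled above.
static unsigned recomputeDmaskSketch(unsigned OldDmask,
                                     const bool UsedLanes[4]) {
  unsigned NewDmask = 0;
  unsigned Lane = 0;
  for (unsigned Comp = 0; Comp < 4; ++Comp) {
    if (!(OldDmask & (1u << Comp)))
      continue;                 // component was never loaded
    if (UsedLanes[Lane])
      NewDmask |= 1u << Comp;   // lane still read -> keep its component
    ++Lane;
  }
  return NewDmask;
}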
16476 bool NoChannels = !NewDmask;
16483 if (OldBitsSet == 1)
16489 if (NewDmask == OldDmask)
16498 unsigned NewChannels = BitsSet + UsesTFC;
16502 assert(NewOpcode != -1 &&
16503 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
16504 "failed to find equivalent MIMG op");
16512 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
16514 MVT ResultVT = NewChannels == 1
16517 : NewChannels == 5 ? 8
16519 SDVTList NewVTList =
16522 MachineSDNode *NewNode =
16531 if (NewChannels == 1) {
16541 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
16546 if (i || !NoChannels)
16551 if (NewUser != User) {
16561 Idx = AMDGPU::sub1;
16564 Idx = AMDGPU::sub2;
16567 Idx = AMDGPU::sub3;
16570 Idx = AMDGPU::sub4;
16581 Op = Op.getOperand(0);
16602 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
16606 Node->getOperand(0), SL, VReg, SrcVal,
16612 return ToResultReg.getNode();
16617 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
16619 Ops.push_back(Node->getOperand(i));
16625 Node->getOperand(i).getValueType(),
16626 Node->getOperand(i)),
16638 unsigned Opcode = Node->getMachineOpcode();
16640 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
16641 !TII->isGather4(Opcode) &&
16643 return adjustWritemask(Node, DAG);
16646 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
16652 case AMDGPU::V_DIV_SCALE_F32_e64:
16653 case AMDGPU::V_DIV_SCALE_F64_e64: {
16663 (Src0 == Src1 || Src0 == Src2))
16719 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
16720 unsigned InitIdx = 0;
16722 if (TII->isImage(MI)) {
16730 unsigned TFEVal = TFE ? TFE->getImm() : 0;
16731 unsigned LWEVal = LWE ? LWE->getImm() : 0;
16732 unsigned D16Val = D16 ? D16->getImm() : 0;
16734 if (!TFEVal && !LWEVal)
16745 assert(MO_Dmask && "Expected dmask operand in instruction");
16747 unsigned dmask = MO_Dmask->getImm();
16752 bool Packed = !Subtarget->hasUnpackedD16VMem();
16754 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
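// Illustrative sketch (restating the expression above): with TFE/LWE the
// destination needs one extra dword after the image data, and packed D16 data
// occupies half as many dwords (rounded up).  InitIdx is therefore the total
// number of 32-bit registers that must be pre-initialized.
static unsigned imageDstDwordsSketch(unsigned ActiveLanes, bool D16,
                                     bool PackedD16) {
  unsigned DataDwords =
      (D16 && PackedD16) ? (ActiveLanes + 1) / 2 : ActiveLanes;
  return DataDwords + 1; // +1 dword for the TFE/LWE status write
}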
16760 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
16761 if (DstSize < InitIdx)
16764 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
16772 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
16773 unsigned NewDst = 0;
16778 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
16779 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
16782 for (; SizeLeft; SizeLeft--, CurrIdx++) {
16783 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
16803 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
16816 if (TII->isVOP3(MI.getOpcode())) {
16818 TII->legalizeOperandsVOP3(MRI, MI);
16823 if (!MI.getDesc().operands().empty()) {
16824 unsigned Opc = MI.getOpcode();
16825 bool HasAGPRs = Info->mayNeedAGPRs();
16827 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
16829 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
16830 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
16833 if ((I == Src2Idx) && (HasAGPRs))
16836 if (!Op.isReg() || !Op.getReg().isVirtual())
16838 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
16839 if (!TRI->hasAGPRs(RC))
16841 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
16842 if (!Src || !Src->isCopy() ||
16843 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
16845 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
16849 MRI.setRegClass(Op.getReg(), NewRC);
16852 if (TII->isMAI(MI)) {
16857 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
16858 AMDGPU::OpName::scale_src0);
16859 if (Src0Idx != -1) {
16860 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
16861 AMDGPU::OpName::scale_src1);
16862 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
16863 TII->usesConstantBus(MRI, MI, Src1Idx))
16864 TII->legalizeOpWithMove(MI, Src1Idx);
16872 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
16873 if (Src2->isReg() && Src2->getReg().isVirtual()) {
16874 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
16875 if (TRI->isVectorSuperClass(RC)) {
16876 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
16877 MRI.setRegClass(Src2->getReg(), NewRC);
16878 if (Src2->isTied())
16879 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
16888 if (TII->isImage(MI))
16889 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
16963std::pair<unsigned, const TargetRegisterClass *>
16970 if (Constraint.size() == 1) {
16974 if (VT == MVT::Other)
16977 switch (Constraint[0]) {
16984 RC = &AMDGPU::SReg_32RegClass;
16987 RC = &AMDGPU::SGPR_64RegClass;
16992 return std::pair(0U, nullptr);
16999 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17000 : &AMDGPU::VGPR_32_Lo256RegClass;
17003 RC = Subtarget->has1024AddressableVGPRs()
17004 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17007 return std::pair(0U, nullptr);
17012 if (!Subtarget->hasMAIInsts())
17016 RC = &AMDGPU::AGPR_32RegClass;
17021 return std::pair(0U, nullptr);
17026 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17030 RC = &AMDGPU::AV_32RegClass;
17033 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17035 return std::pair(0U, nullptr);
17044 return std::pair(0U, RC);
17047 if (Kind != '\0') {
17049 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17050 } else if (Kind == 's') {
17051 RC = &AMDGPU::SGPR_32RegClass;
17052 } else if (Kind == 'a') {
17053 RC = &AMDGPU::AGPR_32RegClass;
17059 return std::pair(0U, nullptr);
17065 return std::pair(0U, nullptr);
17069 RC = TRI->getVGPRClassForBitWidth(Width);
17071 RC = TRI->getSGPRClassForBitWidth(Width);
17073 RC = TRI->getAGPRClassForBitWidth(Width);
17075 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17080 return std::pair(0U, nullptr);
17082 return std::pair(Reg, RC);
17088 return std::pair(0U, nullptr);
17089 if (Idx < RC->getNumRegs())
17091 return std::pair(0U, nullptr);
17097 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17103 if (Constraint.size() == 1) {
17104 switch (Constraint[0]) {
17114 } else if (Constraint == "DA" || Constraint == "DB") {
17122 if (Constraint.size() == 1) {
17123 switch (Constraint[0]) {
17131 } else if (Constraint.size() == 2) {
17132 if (Constraint == "VA")
17150                                                     std::vector<SDValue> &Ops,
17165   unsigned Size = Op.getScalarValueSizeInBits();
17169   if (Size == 16 && !Subtarget->has16BitInsts())
17173     Val = C->getSExtValue();
17177     Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17181   if (Size != 16 || Op.getNumOperands() != 2)
17183   if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17186     Val = C->getSExtValue();
17190     Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17200   if (Constraint.size() == 1) {
17201     switch (Constraint[0]) {
17216   } else if (Constraint.size() == 2) {
17217     if (Constraint == "DA") {
17218       int64_t HiBits = static_cast<int32_t>(Val >> 32);
17219       int64_t LoBits = static_cast<int32_t>(Val);
17223     if (Constraint == "DB") {
17231                                               unsigned MaxSize) const {
17232   unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17233   bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17235   MVT VT = Op.getSimpleValueType();
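// --- Usage sketch (added; not part of SIISelLowering.cpp) ---
// From the user's side, the constraints classified above come from inline asm.
// Hypothetical HIP device-code snippet: "v" is mapped to a VGPR class and "s" to
// an SGPR class by the code above (an "a" operand would likewise select an AGPR
// class, and "{v0}" would name one specific physical register).
__device__ unsigned copyThroughVGPR(unsigned X) {
  unsigned Out;
  __asm__ volatile("v_mov_b32 %0, %1"
                   : "=v"(Out)   // any 32-bit VGPR
                   : "s"(X));    // any 32-bit SGPR
  return Out;
}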
17260 switch (UnalignedClassID) {
17261 case AMDGPU::VReg_64RegClassID:
17262 return AMDGPU::VReg_64_Align2RegClassID;
17263 case AMDGPU::VReg_96RegClassID:
17264 return AMDGPU::VReg_96_Align2RegClassID;
17265 case AMDGPU::VReg_128RegClassID:
17266 return AMDGPU::VReg_128_Align2RegClassID;
17267 case AMDGPU::VReg_160RegClassID:
17268 return AMDGPU::VReg_160_Align2RegClassID;
17269 case AMDGPU::VReg_192RegClassID:
17270 return AMDGPU::VReg_192_Align2RegClassID;
17271 case AMDGPU::VReg_224RegClassID:
17272 return AMDGPU::VReg_224_Align2RegClassID;
17273 case AMDGPU::VReg_256RegClassID:
17274 return AMDGPU::VReg_256_Align2RegClassID;
17275 case AMDGPU::VReg_288RegClassID:
17276 return AMDGPU::VReg_288_Align2RegClassID;
17277 case AMDGPU::VReg_320RegClassID:
17278 return AMDGPU::VReg_320_Align2RegClassID;
17279 case AMDGPU::VReg_352RegClassID:
17280 return AMDGPU::VReg_352_Align2RegClassID;
17281 case AMDGPU::VReg_384RegClassID:
17282 return AMDGPU::VReg_384_Align2RegClassID;
17283 case AMDGPU::VReg_512RegClassID:
17284 return AMDGPU::VReg_512_Align2RegClassID;
17285 case AMDGPU::VReg_1024RegClassID:
17286 return AMDGPU::VReg_1024_Align2RegClassID;
17287 case AMDGPU::AReg_64RegClassID:
17288 return AMDGPU::AReg_64_Align2RegClassID;
17289 case AMDGPU::AReg_96RegClassID:
17290 return AMDGPU::AReg_96_Align2RegClassID;
17291 case AMDGPU::AReg_128RegClassID:
17292 return AMDGPU::AReg_128_Align2RegClassID;
17293 case AMDGPU::AReg_160RegClassID:
17294 return AMDGPU::AReg_160_Align2RegClassID;
17295 case AMDGPU::AReg_192RegClassID:
17296 return AMDGPU::AReg_192_Align2RegClassID;
17297 case AMDGPU::AReg_256RegClassID:
17298 return AMDGPU::AReg_256_Align2RegClassID;
17299 case AMDGPU::AReg_512RegClassID:
17300 return AMDGPU::AReg_512_Align2RegClassID;
17301 case AMDGPU::AReg_1024RegClassID:
17302 return AMDGPU::AReg_1024_Align2RegClassID;
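// Note (added): the switch above is a mechanical rename from each unaligned
// VReg_*/AReg_* register class ID to its *_Align2 counterpart, e.g.
// VReg_96RegClassID -> VReg_96_Align2RegClassID. These aligned classes are what
// the register-class fixup loop further below (the ST.needsAlignedVGPRs() path)
// switches virtual registers to on subtargets that require even VGPR alignment.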
17318   if (Info->isEntryFunction()) {
17325   unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17327                       ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17328                       : TRI->getAlignedHighSGPRForRC(MF, 2,
17329                                                      &AMDGPU::SGPR_64RegClass);
17330   Info->setSGPRForEXECCopy(SReg);
17332   assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17333                              Info->getStackPtrOffsetReg()));
17334   if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17335     MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17339   if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17340     MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17342   if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17343     MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17345   Info->limitOccupancy(MF);
17347   if (ST.isWave32() && !MF.empty()) {
17348     for (auto &MBB : MF) {
17349       for (auto &MI : MBB) {
17350         TII->fixImplicitOperands(MI);
17360   if (ST.needsAlignedVGPRs()) {
17361     for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17367       if (NewClassID != -1)
17368         MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17377                                                     const APInt &DemandedElts,
17379                                                     unsigned Depth) const {
17381   unsigned Opc = Op.getOpcode();
17384     unsigned IID = Op.getConstantOperandVal(0);
17386     case Intrinsic::amdgcn_mbcnt_lo:
17387     case Intrinsic::amdgcn_mbcnt_hi: {
17393           IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17403         Op, Known, DemandedElts, DAG, Depth);
17419   unsigned MaxValue =
17426                              unsigned BFEWidth, bool SExt, unsigned Depth) {
17430   unsigned Src1Cst = 0;
17431   if (Src1.isImm()) {
17432     Src1Cst = Src1.getImm();
17433   } else if (Src1.isReg()) {
17437     Src1Cst = Cst->Value.getZExtValue();
17448   if (Width >= BFEWidth)
17457     Known = Known.sext(BFEWidth);
17459     Known = Known.zext(BFEWidth);
17465                                                       unsigned Depth) const {
17468   switch (MI->getOpcode()) {
17469   case AMDGPU::S_BFE_I32:
17472   case AMDGPU::S_BFE_U32:
17475   case AMDGPU::S_BFE_I64:
17478   case AMDGPU::S_BFE_U64:
17481   case AMDGPU::G_INTRINSIC:
17482   case AMDGPU::G_INTRINSIC_CONVERGENT: {
17485     case Intrinsic::amdgcn_workitem_id_x:
17488     case Intrinsic::amdgcn_workitem_id_y:
17491     case Intrinsic::amdgcn_workitem_id_z:
17494     case Intrinsic::amdgcn_mbcnt_lo:
17495     case Intrinsic::amdgcn_mbcnt_hi: {
17507     case Intrinsic::amdgcn_groupstaticsize: {
17518   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
17521   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
17524   case AMDGPU::G_AMDGPU_SMED3:
17525   case AMDGPU::G_AMDGPU_UMED3: {
17526     auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
17553                                                        unsigned Depth) const {
17560   AttributeList Attrs =
17562   if (MaybeAlign RetAlign = Attrs.getRetAlignment())
17589   if (Header->getAlignment() != PrefAlign)
17590     return Header->getAlignment();
17592   unsigned LoopSize = 0;
17597     LoopSize += MBB->getAlignment().value() / 2;
17600       LoopSize += TII->getInstSizeInBytes(MI);
17601       if (LoopSize > 192)
17606   if (LoopSize <= 64)
17609   if (LoopSize <= 128)
17610     return CacheLineAlign;
17616     auto I = Exit->getFirstNonDebugInstr();
17617     if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
17618       return CacheLineAlign;
17627   if (PreTerm == Pre->begin() ||
17628       std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
17632   auto ExitHead = Exit->getFirstNonDebugInstr();
17633   if (ExitHead == Exit->end() ||
17634       ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
17639   return CacheLineAlign;
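// Reading of the fragment above (added paraphrase, not source comments): the loop
// size is estimated by summing TII->getInstSizeInBytes(MI) over the body, plus
// half of any inner block alignment as expected nop padding. Bodies larger than
// 192 bytes keep the default preferred alignment; the 64- and 128-byte thresholds
// decide whether 64-byte (cache-line) alignment is worthwhile; and the
// S_INST_PREFETCH checks on the preheader and exit block confirm that prefetch
// instructions bracket the loop before CacheLineAlign is returned.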
17647     N = N->getOperand(0).getNode();
17648   if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
17657   switch (N->getOpcode()) {
17665     if (Reg.isPhysical() || MRI.isLiveIn(Reg))
17666       return !TRI->isSGPRReg(MRI, Reg);
17672     return !TRI->isSGPRReg(MRI, Reg);
17676     unsigned AS = L->getAddressSpace();
17680   case ISD::CALLSEQ_END:
17709     return A->readMem() && A->writeMem();
17730   switch (Ty.getScalarSizeInBits()) {
17742                                                    const APInt &DemandedElts,
17745                                                    unsigned Depth) const {
17750   if (Info->getMode().DX10Clamp)
17762   if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
17782          << "Hardware instruction generated for atomic "
17784          << " operation at memory scope " << MemScope;
17789     Type *EltTy = VT->getElementType();
17790     return VT->getNumElements() == 2 &&
17810     unsigned BW = IT->getBitWidth();
17811     return BW == 32 || BW == 64;
17825     unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
17826     return BW == 32 || BW == 64;
17829   if (Ty->isFloatTy() || Ty->isDoubleTy())
17833   return VT->getNumElements() == 2 &&
17834          VT->getElementType()->getPrimitiveSizeInBits() == 16;
17844                                        bool HasSystemScope) {
17851   if (HasSystemScope) {
17860   return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
17873   const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
17899       DL.getTypeSizeInBits(RMW->getType()) == 64 &&
17912   bool HasSystemScope =
17938   if (Subtarget->hasEmulatedSystemScopeAtomics())
17954   if (!HasSystemScope &&
17955       Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
17967     if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
17975         ConstVal && ConstVal->isNullValue())
18013     if (Ty->isFloatTy()) {
18018     if (Ty->isDoubleTy()) {
18039     if (Ty->isFloatTy() &&
18040         !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18053       if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18057       if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18061       if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18066       if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18071     if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18075     if (Ty->isFloatTy()) {
18078       if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18081       if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18086         Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18094     if (Subtarget->hasFlatAtomicFaddF32Inst())
18103     if (Subtarget->hasLDSFPAtomicAddF32()) {
18104       if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18106       if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18134     if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18136     if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18140     if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18142     if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18195   if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18196     return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18197                                  : &AMDGPU::SReg_32RegClass;
18198   if (!TRI->isSGPRClass(RC) && !isDivergent)
18199     return TRI->getEquivalentSGPRClass(RC);
18200   if (TRI->isSGPRClass(RC) && isDivergent)
18201     return TRI->getEquivalentVGPRClass(RC);
18213                       unsigned WaveSize) {
18218   if (!IT || IT->getBitWidth() != WaveSize)
18223   if (!Visited.insert(V).second)
18225   bool Result = false;
18226   for (const auto *U : V->users()) {
18228       if (V == U->getOperand(1)) {
18233         case Intrinsic::amdgcn_if_break:
18234         case Intrinsic::amdgcn_if:
18235         case Intrinsic::amdgcn_else:
18240       if (V == U->getOperand(0)) {
18245         case Intrinsic::amdgcn_end_cf:
18246         case Intrinsic::amdgcn_loop:
18252       Result = hasCFUser(U, Visited, WaveSize);
18261                                                 const Value *V) const {
18263   if (CI->isInlineAsm()) {
18272     for (auto &TC : TargetConstraints) {
18286   return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18314   return MRI.hasOneNonDBGUse(N0);
18321   if (I.getMetadata("amdgpu.noclobber"))
18323   if (I.getMetadata("amdgpu.last.use"))
18333   if (!Def->isMachineOpcode())
18343   if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18344     PhysReg = AMDGPU::SCC;
18346         TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18412     Alignment = RMW->getAlign();
18425   bool FullFlatEmulation =
18427       ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18428        (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
18429         RMW->getType()->isDoubleTy()));
18432   bool ReturnValueIsUsed = !AI->use_empty();
18441   if (FullFlatEmulation) {
18452   std::prev(BB->end())->eraseFromParent();
18453   Builder.SetInsertPoint(BB);
18455   Value *LoadedShared = nullptr;
18456   if (FullFlatEmulation) {
18457     CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
18458                                                  {Addr}, nullptr, "is.shared");
18459     Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
18460     Builder.SetInsertPoint(SharedBB);
18461     Value *CastToLocal = Builder.CreateAddrSpaceCast(
18467     LoadedShared = Clone;
18469     Builder.CreateBr(PhiBB);
18470     Builder.SetInsertPoint(CheckPrivateBB);
18473   CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
18474                                                 {Addr}, nullptr, "is.private");
18475   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
18477   Builder.SetInsertPoint(PrivateBB);
18479   Value *CastToPrivate = Builder.CreateAddrSpaceCast(
18482   Value *LoadedPrivate;
18484     LoadedPrivate = Builder.CreateAlignedLoad(
18485         RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
18488         LoadedPrivate, RMW->getValOperand());
18490     Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
18492     auto [ResultLoad, Equal] =
18498     LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
18501   Builder.CreateBr(PhiBB);
18503   Builder.SetInsertPoint(GlobalBB);
18507   if (FullFlatEmulation) {
18508     Value *CastToGlobal = Builder.CreateAddrSpaceCast(
18517   if (!FullFlatEmulation) {
18522     MDNode *RangeNotPrivate =
18525     LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
18529   Builder.CreateBr(PhiBB);
18531   Builder.SetInsertPoint(PhiBB);
18533   if (ReturnValueIsUsed) {
18536     if (FullFlatEmulation)
18543   Builder.CreateBr(ExitBB);
18547                                               unsigned PtrOpIdx) {
18548   Value *PtrOp = I->getOperand(PtrOpIdx);
18555   I->setOperand(PtrOpIdx, ASCast);
18567       ConstVal && ConstVal->isNullValue()) {
18597          "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
18605          "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
18620   LoadInst *LI = Builder.CreateAlignedLoad(
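// Reading of the fragment above (added paraphrase, not source comments): a flat
// atomic that might touch private or LDS memory is expanded into an
// address-space dispatch. llvm.amdgcn.is.shared routes the pointer to SharedBB,
// where the original atomic is cloned onto an addrspace(3) cast of the address;
// llvm.amdgcn.is.private routes it to PrivateBB, which does a plain aligned
// load, applies the operation, and stores the result back; the remaining
// GlobalBB path keeps a real atomic and, when full emulation is not needed, tags
// it with !noalias.addrspace metadata so later passes know it cannot alias
// private memory. PhiBB merges the per-path results when the original return
// value is used. The trailing lines belong to the scratch-to-flat helpers, which
// addrspacecast the pointer operand of an expanded atomic load/store from
// SCRATCH to FLAT.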
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
iv Induction Variable Users
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static cl::opt< bool > UseSelectionDAGPTRADD("amdgpu-use-sdag-ptradd", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " "SelectionDAG ISel"), cl::init(false))
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static uint32_t getIdentityValueForWaveReduction(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
const Function * getParent() const
Return the enclosing method, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
A parsed version of the target data layout string in and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node N can be combined with its neighboring nodes to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
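A small sketch of getSetCC from the entry above; the DAG, debug location, and operands are assumed to come from a surrounding lowering routine, and the helper name is made up:
  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // Sketch: emit (LHS == RHS) as an i1 comparison directly from an ISD::CondCode.
  static SDValue emitIsEqual(SelectionDAG &DAG, const SDLoc &DL, SDValue LHS, SDValue RHS) {
    return DAG.getSetCC(DL, MVT::i1, LHS, RHS, ISD::SETEQ);
  }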
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
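Paired with getNode (listed later in this section), getConstant is how most literal operands enter the DAG; a hedged sketch in which the helper name and the choice of i32 are assumptions:
  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // Sketch: compute V + 4 for a 32-bit value.
  static SDValue addFour(SelectionDAG &DAG, const SDLoc &DL, SDValue V) {
    SDValue Four = DAG.getConstant(4, DL, MVT::i32);     // ConstantSDNode wrapping 4
    return DAG.getNode(ISD::ADD, DL, MVT::i32, V, Four); // V + 4
  }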
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
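getLoad and getStore (both listed in this section) thread a chain value through memory operations; a simplified sketch in which the pointer info, alignment, and helper name are assumptions:
  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // Sketch: load an i32, add one, and store it back; the store is chained on the
  // load's output chain (result #1 of the load node).
  static SDValue incrementInMemory(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
                                   SDValue Ptr, MachinePointerInfo PtrInfo) {
    SDValue Load = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo, Align(4));
    SDValue Inc = DAG.getNode(ISD::ADD, DL, MVT::i32, Load,
                              DAG.getConstant(1, DL, MVT::i32));
    return DAG.getStore(Load.getValue(1), DL, Inc, Ptr, PtrInfo, Align(4));
  }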
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
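A one-line sketch of getSplatBuildVector; the v4i32 result type and the wrapper name are assumptions:
  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // Sketch: replicate a scalar into all four lanes of a v4i32 BUILD_VECTOR.
  static SDValue splatToV4I32(SelectionDAG &DAG, const SDLoc &DL, SDValue Scalar) {
    return DAG.getSplatBuildVector(MVT::v4i32, DL, Scalar);
  }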
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
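The two containers above are commonly combined for de-duplicated collection; a sketch under an assumed element type, with made-up function and variable names:
  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/SmallPtrSet.h"
  #include "llvm/ADT/SmallVector.h"
  using namespace llvm;

  // Sketch: copy pointers into Out, skipping duplicates; insert().second is true
  // only the first time a pointer is seen.
  static void collectUnique(ArrayRef<int *> In, SmallVectorImpl<int *> &Out) {
    SmallPtrSet<int *, 8> Seen;
    for (int *P : In)
      if (Seen.insert(P).second)
        Out.push_back(P);
  }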
An instruction for storing to memory.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
StringRef - Represent a constant reference to a string, i.e.
constexpr size_t size() const
size - Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
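StringSwitch chains Case calls and ends with a Default; the keyword strings and return values below are made up for illustration:
  #include "llvm/ADT/StringSwitch.h"
  using namespace llvm;

  // Sketch: classify a register-class-like keyword (values are hypothetical).
  static int classifyKeyword(StringRef S) {
    return StringSwitch<int>(S)
        .Case("sgpr", 0)
        .Case("vgpr", 1)
        .Case("agpr", 2)
        .Default(-1);
  }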
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
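Legality queries like isTypeLegal and isOperationLegalOrCustom are usually asked through an existing TargetLowering reference (for example the one returned by getTargetLoweringInfo, listed earlier); a hedged sketch with an assumed helper name:
  #include "llvm/CodeGen/TargetLowering.h"
  using namespace llvm;

  // Sketch: decide whether an FMA node can be formed directly for this value type.
  static bool canUseNativeFMA(const TargetLoweringBase &TLI, EVT VT) {
    return TLI.isTypeLegal(VT) && TLI.isOperationLegalOrCustom(ISD::FMA, VT);
  }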
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isVoidTy() const
Return true if this is 'void'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
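Iterating users() is the usual way to inspect everything that consumes a Value; the counting helper below is an assumed example, not code from this file:
  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/Value.h"
  using namespace llvm;

  // Sketch: count how many users of V are call instructions.
  static unsigned countCallUsers(const Value *V) {
    unsigned N = 0;
    for (const User *U : V->users())
      if (isa<CallInst>(U))
        ++N;
    return N;
  }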
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
constexpr bool isZero() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ TC_RETURN_GFX_WholeWave
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ SSUBO
Overflow-aware nodes for subtraction; same scheme as [SU]ADDO.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ SMULO
Overflow-aware nodes for multiplication; same scheme as [SU]ADDO.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
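A tiny sketch of the two CondCode helpers above; the wrapper function is illustrative:
  #include "llvm/CodeGen/ISDOpcodes.h"
  #include <cassert>
  using namespace llvm;

  // Sketch: swapping the operands of a signed less-than yields a signed greater-than.
  static ISD::CondCode swapLessThan() {
    assert(ISD::isSignedIntSetCC(ISD::SETLT) && "SETLT compares signed integers");
    return ISD::getSetCCSwappedOperands(ISD::SETLT); // ISD::SETGT
  }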
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
@ System
Synchronized with respect to all concurrently executing threads.
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
NodeAddr< NodeBase * > Node
friend class Instruction
Iterator for Instructions in a BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int popcount(T Value) noexcept
Count the number of set bits in a value.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
unsigned getUndefRegState(bool B)
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
void resetAll()
Resets the known state of all bits.
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const