40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
52#define DEBUG_TYPE "si-lower"
58 cl::desc(
"Do not align and prefetch loops"),
62 "amdgpu-use-divergent-register-indexing",
cl::Hidden,
63 cl::desc(
"Use indirect register addressing for divergent indexes"),
70 cl::desc(
"Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
  if (Subtarget->has16BitInsts()) {
    if (Subtarget->useRealTrue16Insts()) {

      {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
       MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
       MVT::i1, MVT::v32i32},

      {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
       MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
       MVT::i1, MVT::v32i32},

      ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
      ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
      ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
      ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
      ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,

      {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

      {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
       MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},

      {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
       MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
       MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},

      {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
       MVT::v3i16, MVT::v4i16, MVT::Other},

      {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
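  // The wide vector types listed below are only directly usable for loads,
  // stores, and a few shuffle/insert/extract style operations; the loop they
  // feed marks essentially everything else on these types as Expand so it is
  // split or scalarized.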
      {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
       MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
       MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
       MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
       MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
       MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
       MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
       MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {

      {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
       MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},

  if (Subtarget->hasPkMovB32()) {

      {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
       MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},

      {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

      {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
       MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
       MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
       MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

  if (Subtarget->hasSMemRealTime() ||

  if (Subtarget->has16BitInsts()) {

  if (Subtarget->hasMadMacF32Insts())
  if (!Subtarget->hasBFI())
  if (!Subtarget->hasBCNT(32))
  if (!Subtarget->hasBCNT(64))
  if (Subtarget->hasFFBH())
  if (Subtarget->hasFFBL())
  if (Subtarget->hasBFE())
  if (Subtarget->hasIntClamp())
  if (Subtarget->hasAddNoCarry())

      {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
      {MVT::f32, MVT::f64}, Custom);

      {MVT::f32, MVT::f64}, Legal);

  if (Subtarget->haveRoundOpsF64())

  if (Subtarget->has16BitInsts()) {

      ISD::FSIN, ISD::FROUND},

  if (Subtarget->hasBF16TransInsts())

      {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
       MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
       MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
      {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
       MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
       MVT::v32f16, MVT::v32bf16},

      {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},

      {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
      {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

      {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
       MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

  if (Subtarget->hasVOP3PInsts()) {

      {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

      {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
       MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
       MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})

      {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
      {MVT::v2f16, MVT::v4f16}, Custom);

  if (Subtarget->hasPackedFP32Ops()) {

      {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},

  if (Subtarget->has16BitInsts()) {

      {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
       MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
       MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
       MVT::v32f16, MVT::v32bf16},

  if (Subtarget->hasVectorMulU64())
  else if (Subtarget->hasScalarSMulU64())

  if (Subtarget->hasMad64_32())

  if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())

  if (Subtarget->hasIEEEMinimumMaximumInsts()) {
      {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

  if (Subtarget->hasMinimum3Maximum3F32())
  if (Subtarget->hasMinimum3Maximum3PKF16()) {
    if (!Subtarget->hasMinimum3Maximum3F16())

  if (Subtarget->hasVOP3PInsts()) {
      {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

  if (Subtarget->hasIntMinMax64())

      {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
       MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,

      {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
       MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
       MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
       MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

      {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
       MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
       MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
       MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

  if (Subtarget->hasBF16ConversionInsts()) {
  if (Subtarget->hasBF16PackedInsts()) {
  if (Subtarget->hasBF16TransInsts()) {
  if (Subtarget->hasCvtPkF16F32Inst()) {
      {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
  if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())

                       ISD::ATOMIC_CMP_SWAP,
                       ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
                       ISD::ATOMIC_LOAD_ADD,
                       ISD::ATOMIC_LOAD_SUB,
                       ISD::ATOMIC_LOAD_AND,
                       ISD::ATOMIC_LOAD_OR,
                       ISD::ATOMIC_LOAD_XOR,
                       ISD::ATOMIC_LOAD_NAND,
                       ISD::ATOMIC_LOAD_MIN,
                       ISD::ATOMIC_LOAD_MAX,
                       ISD::ATOMIC_LOAD_UMIN,
                       ISD::ATOMIC_LOAD_UMAX,
                       ISD::ATOMIC_LOAD_FADD,
                       ISD::ATOMIC_LOAD_FMIN,
                       ISD::ATOMIC_LOAD_FMAX,
                       ISD::ATOMIC_LOAD_UINC_WRAP,
                       ISD::ATOMIC_LOAD_UDEC_WRAP,

static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
                                           EVT DestVT, EVT SrcVT) const {
         ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
            (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
          (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&

                                           LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
         SrcTy.getScalarSizeInBits() == 16 &&

  if (Subtarget->has16BitInsts()) {
      return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
    return VT.isInteger() ? MVT::i32 : MVT::f32;

  return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;

  if (Size == 16 && Subtarget->has16BitInsts())
    return (NumElts + 1) / 2;

  return NumElts * ((Size + 31) / 32);
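// For calling-convention register counting and type breakdown, vectors of
// 16-bit elements are packed two per 32-bit register (v2i16/v2f16/v2bf16
// intermediates) when 16-bit instructions are available; other element types
// are broken into 32-bit pieces.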
1151 unsigned &NumIntermediates,
MVT &RegisterVT)
const {
1159 if (
Size == 16 && Subtarget->has16BitInsts()) {
1160 if (ScalarVT == MVT::bf16) {
1161 RegisterVT = MVT::i32;
1162 IntermediateVT = MVT::v2bf16;
1164 RegisterVT = VT.
isInteger() ? MVT::v2i16 : MVT::v2f16;
1165 IntermediateVT = RegisterVT;
1167 NumIntermediates = (NumElts + 1) / 2;
1168 return NumIntermediates;
1173 IntermediateVT = RegisterVT;
1174 NumIntermediates = NumElts;
1175 return NumIntermediates;
1180 RegisterVT = MVT::i16;
1181 IntermediateVT = ScalarVT;
1182 NumIntermediates = NumElts;
1183 return NumIntermediates;
1187 RegisterVT = MVT::i32;
1188 IntermediateVT = ScalarVT;
1189 NumIntermediates = NumElts;
1190 return NumIntermediates;
1194 RegisterVT = MVT::i32;
1195 IntermediateVT = RegisterVT;
1196 NumIntermediates = NumElts * ((
Size + 31) / 32);
1197 return NumIntermediates;
1202 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
                                  unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

                                  unsigned MaxNumLanes) {
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

    return MVT::amdgpuBufferFatPointer;
      DL.getPointerSizeInBits(AS) == 192)
    return MVT::amdgpuBufferStridedPointer;
       DL.getPointerSizeInBits(AS) == 160) ||
       DL.getPointerSizeInBits(AS) == 192))

  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:

  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:

  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:

  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
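// The case groups above bucket the async LDS-DMA and cooperative-atomic
// intrinsics by the width of the memory access they perform: 8, 32, 64, or
// 128 bits respectively.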
                                          unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

  if (RsrcIntr->IsImage) {
    Info.ptrVal = RsrcArg;

  bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;

    if (RsrcIntr->IsImage) {
      unsigned MaxNumLanes = 4;
                           std::numeric_limits<unsigned>::max());
    if (RsrcIntr->IsImage) {
    if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
      Info.memVT = MVT::i32;

  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {

  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
                           std::numeric_limits<unsigned>::max());

  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {

  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;

  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {

  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
    Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
    Info.memVT = MVT::i64;

  case Intrinsic::amdgcn_global_atomic_csub: {

  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
        MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
                       ->getElementType(0));

  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_atomic_cond_sub_u32: {

  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64: {

  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {

  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {

  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.memVT = MVT::i32;
    Info.align = Align(4);
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {

  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128: {

  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds: {

  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
    Info.memVT = MVT::i32;
    Info.align = Align(4);

  case Intrinsic::amdgcn_s_prefetch_data:
  case Intrinsic::amdgcn_flat_prefetch:
  case Intrinsic::amdgcn_global_prefetch: {

  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
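// getTgtMemIntrinsic fills in the IntrinsicInfo record (SDNode opcode, memVT,
// pointer value, alignment, and load/store flags) for the target memory
// intrinsics handled above so that selection can attach an accurate
// MachineMemOperand.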
1701 Type *&AccessTy)
const {
1703 switch (
II->getIntrinsicID()) {
1704 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1705 case Intrinsic::amdgcn_cluster_load_b128:
1706 case Intrinsic::amdgcn_cluster_load_b64:
1707 case Intrinsic::amdgcn_cluster_load_b32:
1708 case Intrinsic::amdgcn_ds_append:
1709 case Intrinsic::amdgcn_ds_consume:
1710 case Intrinsic::amdgcn_ds_load_tr8_b64:
1711 case Intrinsic::amdgcn_ds_load_tr16_b128:
1712 case Intrinsic::amdgcn_ds_load_tr4_b64:
1713 case Intrinsic::amdgcn_ds_load_tr6_b96:
1714 case Intrinsic::amdgcn_ds_read_tr4_b64:
1715 case Intrinsic::amdgcn_ds_read_tr6_b96:
1716 case Intrinsic::amdgcn_ds_read_tr8_b64:
1717 case Intrinsic::amdgcn_ds_read_tr16_b64:
1718 case Intrinsic::amdgcn_ds_ordered_add:
1719 case Intrinsic::amdgcn_ds_ordered_swap:
1720 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1721 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1722 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1723 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1724 case Intrinsic::amdgcn_flat_load_monitor_b128:
1725 case Intrinsic::amdgcn_flat_load_monitor_b32:
1726 case Intrinsic::amdgcn_flat_load_monitor_b64:
1727 case Intrinsic::amdgcn_global_atomic_csub:
1728 case Intrinsic::amdgcn_global_atomic_fmax_num:
1729 case Intrinsic::amdgcn_global_atomic_fmin_num:
1730 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1731 case Intrinsic::amdgcn_global_load_monitor_b128:
1732 case Intrinsic::amdgcn_global_load_monitor_b32:
1733 case Intrinsic::amdgcn_global_load_monitor_b64:
1734 case Intrinsic::amdgcn_global_load_tr_b64:
1735 case Intrinsic::amdgcn_global_load_tr_b128:
1736 case Intrinsic::amdgcn_global_load_tr4_b64:
1737 case Intrinsic::amdgcn_global_load_tr6_b96:
1738 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1739 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1740 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1741 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1742 Ptr =
II->getArgOperand(0);
1744 case Intrinsic::amdgcn_load_to_lds:
1745 case Intrinsic::amdgcn_global_load_lds:
1746 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1747 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1748 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1749 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1750 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1751 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1752 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1753 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1754 Ptr =
II->getArgOperand(1);
1759 AccessTy =
II->getType();
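// getAddrModeArguments reports which operand the addressing mode applies to:
// operand 0 for the direct load/atomic intrinsics in the first group and
// operand 1 for the LDS-DMA style forms, with AccessTy taken from the
// intrinsic's result type.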
                                              unsigned AddrSpace) const {
  if (!Subtarget->hasFlatInstOffsets()) {

  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
                                  AM.BaseOffs, AddrSpace, FlatVariant));

  if (Subtarget->hasFlatGlobalInsts())

  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {

  return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
  if (AM.HasBaseReg) {

    return isLegalMUBUFAddressingMode(AM);

  if (!Subtarget->hasScalarSubwordLoads()) {
    if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)

    return Subtarget->enableFlatScratch()
               : isLegalMUBUFAddressingMode(AM);
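// Private (scratch) accesses follow the flat-scratch offset rules when flat
// scratch is enabled and otherwise fall back to the MUBUF addressing checks.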
    unsigned Size, unsigned AddrSpace, Align Alignment,

    if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))

    Align RequiredAlignment(
    if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
        Alignment < RequiredAlignment)

      if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))

      RequiredAlignment = Align(4);
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 64
                    : (Alignment < Align(4)) ? 32

      if (!Subtarget->hasDS96AndDS128())
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 96
                    : (Alignment < Align(4)) ? 32

      if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())

      RequiredAlignment = Align(8);
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 128
                    : (Alignment < Align(4)) ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||
           Subtarget->hasUnalignedDSAccessEnabled();

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;

    return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();

    return Alignment >= Align(4) ||
           Subtarget->hasUnalignedBufferAccessEnabled();

  if (!Subtarget->hasRelaxedBufferOOBMode() &&

  return Size >= 32 && Alignment >= Align(4);

                                                    unsigned *IsFast) const {
                                            Alignment, Flags, IsFast);
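// getOptimalMemOpType below favors wide accesses for memcpy/memset expansion:
// 16-byte chunks when at least 16 bytes remain, 8-byte chunks when at least
// 8 bytes remain and the destination is 4-byte aligned.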
                                 const AttributeList &FuncAttributes) const {
  if (Op.size() >= 16 &&

  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

                                           unsigned DestAS) const {
      Subtarget->hasGloballyAddressableScratch()) {

                                              unsigned Index) const {

  if (Subtarget->has16BitInsts() && VT == MVT::i16) {

  auto [InputPtrReg, RC, ArgTy] =
      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                         const SDLoc &SL) const {
                                         const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(
    int64_t OffsetDiff = Offset - AlignDownOffset;
    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);

      ExtType, SL, VA.getLocVT(), Chain, FIN,

SDValue SITargetLowering::getPreloadedValue(
  const ArgDescriptor *Reg = nullptr;
  const TargetRegisterClass *RC;

  const ArgDescriptor WorkGroupIDX =
  const ArgDescriptor WorkGroupIDZ =
  if (Subtarget->hasArchitectedSGPRs() &&
      Reg = &WorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &WorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &WorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;

  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
           "vector type argument should have been split");

    bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
             "unexpected vector split in ps argument type");

      Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);
  if (Info.hasWorkItemIDX()) {
        (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;

  if (Info.hasWorkItemIDY()) {
    assert(Info.hasWorkItemIDX());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDY(
      unsigned Reg = AMDGPU::VGPR1;

  if (Info.hasWorkItemIDZ()) {
    assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDZ(
      unsigned Reg = AMDGPU::VGPR2;

  if (RegIdx == ArgVGPRs.size()) {
  unsigned Reg = ArgVGPRs[RegIdx];

                              unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())
  unsigned Reg = ArgSGPRs[RegIdx];

  const unsigned Mask = 0x3ff;
  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);
  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);
  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;

  auto &ArgInfo = Info.getArgInfo();

  if (Info.hasImplicitArgPtr())
  if (Info.hasWorkGroupIDX())
  if (Info.hasWorkGroupIDY())
  if (Info.hasWorkGroupIDZ())
  if (Info.hasLDSKernelId())

    Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);

    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);

    Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
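// User SGPRs are registered in the ABI order shown above (implicit buffer
// pointer / private segment buffer, dispatch pointer, queue pointer, kernarg
// segment pointer, dispatch ID, flat scratch init, private segment size), and
// each one is added as a function live-in.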
  unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();

  bool InPreloadSequence = true;
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    unsigned ArgIdx = Arg.getArgNo();
    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
                      Subtarget->getAlignmentForImplicitArgPtr()) -
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        ArgOffset += ImplicitArgOffset;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;

  if (Info.hasLDSKernelId()) {
    Register Reg = Info.addLDSKernelId();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
                                          bool IsShader) const {
  bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
  if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      Register Reg = Info.addReservedUserSGPR();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      Register Reg = Info.addWorkGroupIDX();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDY()) {
      Register Reg = Info.addWorkGroupIDY();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDZ()) {
      Register Reg = Info.addWorkGroupIDZ();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {
    Register Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

  assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
         Info.getNumPreloadedSGPRs() >= 16);
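// On subtargets with the user-SGPR-init-16 bug, non-shader entry points pad
// the user SGPRs up to 16 with reserved registers before the system SGPRs
// (workgroup IDs, workgroup info, private segment wave byte offset) are
// added; the trailing assert checks that invariant.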
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {

      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);

      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);

  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);

  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);

    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)
  bool IsError = false;

        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));

           !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
           !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());

    if (!Subtarget->enableFlatScratch())

      !Subtarget->hasArchitectedSGPRs())
    assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
           !Info->hasWorkGroupIDZ());

  bool IsWholeWaveFunc = Info->isWholeWaveFunction();

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    if (Subtarget->isAmdPalOS()) {
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
  } else if (IsKernel) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());

    Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),

    if (IsKernel && Subtarget->hasKernargPreload())
  } else if (!IsGraphics) {

    if (!Subtarget->enableFlatScratch())

    Info->setNumWaveDispatchSGPRs(
    Info->setNumWaveDispatchVGPRs(
  } else if (Info->getNumKernargPreloadedSGPRs()) {
    Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());

  if (IsWholeWaveFunc) {
                                  {MVT::i1, MVT::Other}, Chain);

  for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;

    if (IsEntryFunc && VA.isMemLoc()) {

      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
          int64_t OffsetDiff = Offset - AlignDownOffset;

              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];

          ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;

          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
                                    TRI->getRegSizeInBits(*RC)));

            for (auto Reg : PreloadRegs) {
                                             PreloadRegs.size()),

          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

            "hidden argument in kernel signature was not preloaded",

            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);

    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

    if (AMDGPU::VGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::VGPR_32RegClass;
    else if (AMDGPU::SGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::SGPR_32RegClass;

      Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain

                                           const Type *RetTy) const {
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);

  unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
    if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  Info->setIfReturnsVoid(Outs.empty());
  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {

    SDValue Arg = OutVals[RealRVLocIdx];

                        ReadFirstLane, Arg);

  if (!Info->isEntryFunction()) {
      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))

    auto &ArgUsageInfo =
    CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

    const auto [OutgoingArg, ArgRC, ArgTy] =
    const auto [IncomingArg, IncomingArgRC, Ty] =
    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;

      InputReg = getImplicitArgPtr(DAG, DL);
      std::optional<uint32_t> Id =
      if (Id.has_value()) {

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      unsigned SpecialArgOffset =

  auto [OutgoingArg, ArgRC, Ty] =
    std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

    if (Subtarget->getMaxWorkitemID(F, 0) != 0) {

      NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {

      NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
                     : IncomingArgY ? *IncomingArgY

  if (OutgoingArg->isRegister()) {
    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);

  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
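  // The loop below vetoes the tail call if any outgoing argument assigned to
  // a physical SGPR has a divergent value, since the callee expects uniform
  // values in SGPR argument registers.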
  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
    if (!CCVA.isRegLoc())

    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
        dbgs() << "Cannot tail call due to divergent outgoing argument in "
enum ChainCallArgIdx {

  bool UsesDynamicVGPRs = false;
  if (IsChainCallConv) {

    auto RequestedExecIt =
          return Arg.OrigArgIndex == 2;
    assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");

    size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
    CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());

           "Haven't popped all the special args");

        CLI.Args[ChainCallArgIdx::Exec];
    if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))

            ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
        ChainCallSpecialArgs.push_back(Arg.Node);

    PushNodeOrTargetConstant(RequestedExecArg);

    if (FlagsValue.isZero()) {
      if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
                          "no additional args allowed if flags == 0");

      if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {

      if (!Subtarget->isWave32()) {
            CLI, InVals, "dynamic VGPR mode is only supported for wave32");

      UsesDynamicVGPRs = true;
      std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
                    CLI.Args.end(), PushNodeOrTargetConstant);

  bool IsSibCall = false;

                          "unsupported call to variadic function ");

                              "unsupported required tail call to function ");

                                                   Outs, OutVals, Ins, DAG);

             "site marked musttail or on llvm.amdgcn.cs.chain");

  if (!TailCallOpt && IsTailCall)

  auto *TRI = Subtarget->getRegisterInfo();

  if (!IsSibCall || IsChainCallConv) {
    if (!Subtarget->enableFlatScratch()) {

      RegsToPass.emplace_back(IsChainCallConv
                                  ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                  : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  const unsigned NumSpecialInputs = RegsToPass.size();

  MVT PtrVT = MVT::i32;

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {

      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));

      int32_t Offset = LocMemOffset;

      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
                        ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
            Outs[i].Flags.getNonZeroByValAlign(),
            nullptr, std::nullopt, DstInfo,

            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

    TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops({Chain});

    Ops.push_back(Callee);
    Ops.push_back(Callee);

  if (IsChainCallConv)

  for (auto &[Reg, Val] : RegsToPass)

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

                                     MVT::Glue, GlueOps),

    Ops.push_back(InGlue);

  if (Info->isWholeWaveFunction())

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
  EVT VT = Op.getValueType();

         "Stack grows upwards for AMDGPU");

  Chain = BaseAddr.getValue(1);

  if (Alignment > StackAlign) {
        << Subtarget->getWavefrontSizeLog2();
    uint64_t StackAlignMask = ScaledAlignment - 1;

  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");

      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));

  if (Op.getValueType() != MVT::i32)

  assert(Op.getValueType() == MVT::i32);

                                 Op.getOperand(0), IntrinID, GetRoundBothImm);

  SDValue RoundModeTimesNumBits =
                                   TableEntry, EnumOffset);

          static_cast<uint32_t>(ConstMode->getZExtValue()),

  if (UseReducedTable) {
    SDValue RoundModeTimesNumBits =
    SDValue RoundModeTimesNumBits =
    NewMode = TruncTable;

                          ReadFirstLaneID, NewMode);

                  IntrinID, RoundBothImm, NewMode);

  if (Op->isDivergent() &&
      (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))

  if (Subtarget->hasSafeSmemPrefetch())

  if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();

  EVT DstVT = Op.getValueType();

    return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);

  if (Op.getValueType() != MVT::i64)

                              Op.getOperand(0), IntrinID, ModeHwRegImm);
                              Op.getOperand(0), IntrinID, TrapHwRegImm);

  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);

  if (Op.getOperand(1).getValueType() != MVT::i64)

                           ReadFirstLaneID, NewModeReg);
                           ReadFirstLaneID, NewTrapReg);

  unsigned ModeHwReg =
  unsigned TrapHwReg =

                             IntrinID, ModeHwRegImm, NewModeReg);
                             IntrinID, TrapHwRegImm, NewTrapReg);

          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
  if (!Subtarget->hasFlatScrRegister() &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
                             "\" for subtarget."));

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
  case AMDGPU::FLAT_SCR:

    MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));

static std::pair<MachineBasicBlock *, MachineBasicBlock *>

  auto Next = std::next(I);

  MBB.addSuccessor(LoopBB);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

  Src->setIsKill(false);

    BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)

    unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
    unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
      MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                 : AMDGPU::S_AND_SAVEEXEC_B64),

  MRI.setSimpleHint(NewExec, CondReg);
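  // Waterfall loop body: V_READFIRSTLANE_B32 picks the index from the first
  // active lane, V_CMP_EQ_U32 finds all lanes with the same index, and
  // S_AND_SAVEEXEC restricts exec to those lanes for the indexed access; the
  // XOR-term at the bottom of the loop re-enables the remaining lanes for the
  // next iteration.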
  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;
      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
          TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                 : AMDGPU::S_XOR_B64_term),

    unsigned InitResultReg, unsigned PhiReg, int Offset,
    bool UseGPRIdxMode, Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);

  LoopBB->removeSuccessor(RemainderBB);
  LoopBB->addSuccessor(LandingPad);

static std::pair<unsigned, int>
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

    return std::pair(AMDGPU::sub0, Offset);
  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();

  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {

    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(VecRC);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)

  MI.eraseFromParent();
  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
  if (ST.hasScalarAddSub64()) {
    unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  MI.eraseFromParent();

  case AMDGPU::S_MIN_U32:
    return std::numeric_limits<uint32_t>::max();
  case AMDGPU::S_MIN_I32:
    return std::numeric_limits<int32_t>::max();
  case AMDGPU::S_MAX_U32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_MAX_I32:
    return std::numeric_limits<int32_t>::min();
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_AND_B32:
    return std::numeric_limits<uint32_t>::max();
      "Unexpected opcode in getIdentityValueFor32BitWaveReduction");

  case AMDGPU::V_CMP_LT_U64_e64:
    return std::numeric_limits<uint64_t>::max();
  case AMDGPU::V_CMP_LT_I64_e64:
    return std::numeric_limits<int64_t>::max();
  case AMDGPU::V_CMP_GT_U64_e64:
    return std::numeric_limits<uint64_t>::min();
  case AMDGPU::V_CMP_GT_I64_e64:
    return std::numeric_limits<int64_t>::min();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
    return std::numeric_limits<uint64_t>::min();
  case AMDGPU::S_AND_B64:
    return std::numeric_limits<uint64_t>::max();
      "Unexpected opcode in getIdentityValueFor64BitWaveReduction");

  return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
         Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
         Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
         Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
         Opc == AMDGPU::S_XOR_B32;
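// Identity values for the wave-reduction accumulator: min reductions start
// from the type's maximum, max reductions from its minimum, add/sub/or/xor
// from zero, and AND from all-ones.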
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));

    case AMDGPU::S_MIN_U32:
    case AMDGPU::S_MIN_I32:
    case AMDGPU::S_MAX_U32:
    case AMDGPU::S_MAX_I32:
    case AMDGPU::S_AND_B32:
    case AMDGPU::S_OR_B32: {

    case AMDGPU::V_CMP_LT_U64_e64:
    case AMDGPU::V_CMP_LT_I64_e64:
    case AMDGPU::V_CMP_GT_U64_e64:
    case AMDGPU::V_CMP_GT_I64_e64:
    case AMDGPU::S_AND_B64:
    case AMDGPU::S_OR_B64: {

    case AMDGPU::S_XOR_B32:
    case AMDGPU::S_XOR_B64:
    case AMDGPU::S_ADD_I32:
    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::S_SUB_I32:
    case AMDGPU::S_SUB_U64_PSEUDO: {

      Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

      bool IsWave32 = ST.isWave32();
      unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
      unsigned BitCountOpc =
          IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;

      auto NewAccumulator =

      case AMDGPU::S_XOR_B32:
      case AMDGPU::S_XOR_B64: {
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
                .addReg(NewAccumulator->getOperand(0).getReg())

        if (Opc == AMDGPU::S_XOR_B32) {
              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
              TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);

              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);

          BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)

      case AMDGPU::S_SUB_I32: {
        Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
            .addReg(NewAccumulator->getOperand(0).getReg());

      case AMDGPU::S_ADD_I32: {
            .addReg(NewAccumulator->getOperand(0).getReg());

      case AMDGPU::S_ADD_U64_PSEUDO:
      case AMDGPU::S_SUB_U64_PSEUDO: {
        Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

            TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);

        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
              .addReg(NewAccumulator->getOperand(0).getReg())

        Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
                                 : NewAccumulator->getOperand(0).getReg();

        Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;

        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {

        BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
  Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
  Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
  Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
  Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
  Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
  Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

  bool IsWave32 = ST.isWave32();
  unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)

  I = ComputeLoop->begin();
      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)

  I = ComputeLoop->end();

  unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;

        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),

        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

        TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
        MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
        MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);

        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),

    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                             TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)

    case AMDGPU::S_OR_B64:
    case AMDGPU::S_AND_B64:
    case AMDGPU::S_XOR_B64: {
              .addReg(LaneValue->getOperand(0).getReg())

    case AMDGPU::V_CMP_GT_I64_e64:
    case AMDGPU::V_CMP_GT_U64_e64:
    case AMDGPU::V_CMP_LT_I64_e64:
    case AMDGPU::V_CMP_LT_U64_e64: {
      Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
          MRI.createVirtualRegister(WaveMaskRegClass);
          TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
      Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
          VregClass, AMDGPU::sub0, VSubRegClass);
          VregClass, AMDGPU::sub1, VSubRegClass);
      BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
          .addReg(LaneValue->getOperand(0).getReg())
          .addReg(AccumulatorVReg);

      unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
      BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)

      NewAccumulator = BuildMI(*ComputeLoop, I, DL,
                               TII->get(AMDGPU::S_CSELECT_B64), DstReg)
                           .addReg(LaneValue->getOperand(0).getReg())

    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::S_SUB_U64_PSEUDO: {
          .addReg(LaneValue->getOperand(0).getReg());

  unsigned BITSETOpc =
      IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
  BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)

  ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);

  unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
      .addReg(NewActiveBitsReg)
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))

  MI.eraseFromParent();
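  // General wave-reduction path: the compute loop keeps the still-unprocessed
  // lanes in a scalar mask, finds the lowest set bit with S_FF1, reads that
  // lane's value with V_READLANE_B32, folds it into the accumulator, clears
  // the bit with S_BITSET0, and branches back with S_CBRANCH_SCC1 until the
  // mask is empty.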
  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:

  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       : AMDGPU::S_SUB_I32;

    MI.eraseFromParent();

  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {

  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {

    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);

    if (ST.hasAddSubU64Insts()) {
                   TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
                                  : AMDGPU::V_SUB_U64_e64),
      TII->legalizeOperands(*I);
      MI.eraseFromParent();

    if (IsAdd && ST.hasLshlAddU64Inst()) {
      TII->legalizeOperands(*Add);
      MI.eraseFromParent();

    const auto *CarryRC = TRI->getWaveMaskRegClass();

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);

                                         : &AMDGPU::VReg_64RegClass;
                                         : &AMDGPU::VReg_64RegClass;

        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;

    TII->legalizeOperands(*LoHalf);
    TII->legalizeOperands(*HiHalf);
    MI.eraseFromParent();

  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {

    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;

      Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)

      Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)

    Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)

    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
    assert(WaveSize == 64 || WaveSize == 32);

    if (WaveSize == 64) {
      if (ST.hasScalarCompareEq64()) {
            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)

        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;

    MI.eraseFromParent();
  case AMDGPU::SI_INIT_M0: {
    BuildMI(*BB, MI.getIterator(), DL,
            TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
            AMDGPU::M0)
    MI.eraseFromParent();
  }
  case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
  }
  case AMDGPU::GET_GROUPSTATICSIZE: {
    BuildMI(*BB, MI, DL, TII->get(Opc))
        .add(MI.getOperand(0))
    MI.eraseFromParent();
  }
  case AMDGPU::GET_SHADERCYCLESHILO: {
    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
        .add(MI.getOperand(0))
    MI.eraseFromParent();
  }
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:
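// V_CNDMASK_B64_PSEUDO below is a 64-bit select controlled by a wave mask.
// It is presumably expanded into two 32-bit selects on the sub0/sub1 halves,
// with the condition first copied into a register of the wave-mask class.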
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    Register SrcCond = MI.getOperand(3).getReg();
    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getWaveMaskRegClass();
    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
    const TargetRegisterClass *Src0RC = Src0.isReg()
                                            ? MRI.getRegClass(Src0.getReg())
                                            : &AMDGPU::VReg_64RegClass;
    const TargetRegisterClass *Src1RC = Src1.isReg()
                                            ? MRI.getRegClass(Src1.getReg())
                                            : &AMDGPU::VReg_64RegClass;
    const TargetRegisterClass *Src0SubRC =
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
    const TargetRegisterClass *Src1SubRC =
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
    MI.eraseFromParent();
  }
  case AMDGPU::SI_BR_UNDEF: {
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
        .add(MI.getOperand(0));
    MI.eraseFromParent();
  }
  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {
  case AMDGPU::SI_CALL_ISEL: {
    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
    MI.eraseFromParent();
  }
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {
    unsigned Opc = MI.getOpcode();
    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      NeedClampOperand = true;
    }
    if (TII->isVOP3(*I)) {
    }
    I.add(MI.getOperand(1)).add(MI.getOperand(2));
    if (NeedClampOperand)
      I.addImm(0); // clamp bit
    TII->legalizeOperands(*I);
    MI.eraseFromParent();
  }
  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBBREV_U32_e32:
    TII->legalizeOperands(MI);
  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_BARRIER:
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
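// S_SETREG_B32 writes a field of the MODE hardware register. In the case
// below, when the written field is the FP rounding or denormal mode and the
// source value is a known immediate, the write is rewritten into the
// dedicated S_ROUND_MODE / S_DENORM_MODE instructions; otherwise it falls
// back to the S_SETREG_B32_mode form.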
  case AMDGPU::S_SETREG_B32: {
    const unsigned SetMask = WidthMask << Offset;
    unsigned SetDenormOp = 0;
    unsigned SetRoundOp = 0;
      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;
      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;
    if (SetRoundOp || SetDenormOp) {
      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
        unsigned ImmVal = Def->getOperand(1).getImm();
        MI.eraseFromParent();
    MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
  }
  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64:
    MI.setDesc(TII->get(AMDGPU::COPY));
  case AMDGPU::ENDPGM_TRAP: {
    MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
    MI.eraseFromParent();
  }
  case AMDGPU::SIMULATED_TRAP: {
    assert(Subtarget->hasPrivEnabledTrap2NopBug());
    TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
    MI.eraseFromParent();
  }
  case AMDGPU::SI_TCRETURN_GFX_WholeWave:
  case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
    assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
    Register OriginalExec = Setup->getOperand(0).getReg();
    MI.getOperand(0).setReg(OriginalExec);
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;

  return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())

  if (!Subtarget->hasMadMacF32Insts())
    return Subtarget->hasFastFMAF32();

  return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();

  return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();

  switch (Ty.getScalarSizeInBits()) {
  if (Ty.getScalarSizeInBits() == 16)
  if (Ty.getScalarSizeInBits() == 32)
    return Subtarget->hasMadMacF32Insts() &&

  EVT VT = N->getValueType(0);
  return Subtarget->hasMadMacF32Insts() &&
  if (VT == MVT::f16) {
    return Subtarget->hasMadF16() &&

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
         VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
         VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
         VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
         VT == MVT::v32bf16);
  SDValue OpLo =
      DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
  SDValue OpHi =
      DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
         VT == MVT::v32bf16);
          : std::pair(Op0, Op0);
  SDValue OpLo =
      DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
  SDValue OpHi =
      DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
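// The dispatch switch that follows is presumably SITargetLowering::
// LowerOperation: each custom-lowered ISD opcode is routed to its lowering
// helper (LowerLOAD/LowerSTORE, the LowerINTRINSIC_* entry points,
// lowerFP_ROUND, the vector insert/extract/shuffle lowerings, and so on).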
  switch (Op.getOpcode()) {
    return LowerBRCOND(Op, DAG);
    return LowerRETURNADDR(Op, DAG);
    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    EVT VT = Op.getValueType();
      return lowerFSQRTF32(Op, DAG);
      return lowerFSQRTF64(Op, DAG);
    return LowerTrig(Op, DAG);
    return LowerSELECT(Op, DAG);
    return LowerFDIV(Op, DAG);
    return LowerFFREXP(Op, DAG);
  case ISD::ATOMIC_CMP_SWAP:
    return LowerATOMIC_CMP_SWAP(Op, DAG);
    return LowerSTORE(Op, DAG);
    return LowerGlobalAddress(MFI, Op, DAG);
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
    return LowerINTRINSIC_VOID(Op, DAG);
  case ISD::ADDRSPACECAST:
    return lowerADDRSPACECAST(Op, DAG);
    return lowerINSERT_SUBVECTOR(Op, DAG);
    return lowerINSERT_VECTOR_ELT(Op, DAG);
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    return lowerVECTOR_SHUFFLE(Op, DAG);
    return lowerSCALAR_TO_VECTOR(Op, DAG);
    return lowerBUILD_VECTOR(Op, DAG);
    return lowerFP_ROUND(Op, DAG);
    return lowerTRAP(Op, DAG);
  case ISD::DEBUGTRAP:
    return lowerDEBUGTRAP(Op, DAG);
    return lowerFMINNUM_FMAXNUM(Op, DAG);
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
    return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
    return lowerFMINIMUM_FMAXIMUM(Op, DAG);
    return lowerFLDEXP(Op, DAG);
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
    return lowerFCOPYSIGN(Op, DAG);
    return lowerMUL(Op, DAG);
    return lowerXMULO(Op, DAG);
    return lowerXMUL_LOHI(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
  case ISD::STACKSAVE:
  case ISD::SET_ROUNDING:
  case ISD::FP_EXTEND:
  case ISD::GET_FPENV:
  case ISD::SET_FPENV:
  EVT FittingLoadVT = LoadVT;
    return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
    return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);

SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
                                              bool IsIntrinsic) const {
  bool Unpacked = Subtarget->hasUnpackedD16VMem();
  EVT LoadVT = M->getValueType(0);
  EVT EquivLoadVT = LoadVT;
  SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
      M->getMemoryVT(), M->getMemOperand());

  EVT LoadVT = M->getValueType(0);
  assert(M->getNumValues() == 2 || M->getNumValues() == 3);
  bool IsTFE = M->getNumValues() == 3;
    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
  return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
                             M->getMemOperand(), DAG);
  SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
      M->getMemOperand(), DAG);
  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);
  EVT CmpVT = LHS.getValueType();
  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
    unsigned PromoteOp =

  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);
  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
    Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
    Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);

  EVT VT = N->getValueType(0);
      Src.getOperand(1), Src.getOperand(2));
    Exec = AMDGPU::EXEC_LO;
    Exec = AMDGPU::EXEC;
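// lowerLaneOp below handles the cross-lane intrinsics (readlane, writelane,
// permlane16/x16, update_dpp, set_inactive, mov_dpp8, ...). Values wider than
// the split size (32 bits, or 64 with DPALU_DPP for update_dpp) are
// decomposed into pieces, each piece is run through createLaneOp, and the
// results are reassembled into the original value type.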
  EVT VT = N->getValueType(0);
  unsigned IID = N->getConstantOperandVal(0);
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
      ST->hasDPALU_DPP() &&
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_update_dpp:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane64:
  if (SDNode *GL = N->getGluedNode()) {
    assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
    GL = GL->getOperand(0).getNode();
    Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_mov_dpp8 ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = N->getOperand(2);
    if (IID == Intrinsic::amdgcn_writelane ||
        IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
      Src2 = N->getOperand(3);
  if (ValSize == SplitSize) {
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    if (IID == Intrinsic::amdgcn_writelane) {
    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
    return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
  if (ValSize % SplitSize != 0)
    EVT VT = N->getValueType(0);
    unsigned NumOperands = N->getNumOperands();
    SDNode *GL = N->getGluedNode();
    for (unsigned i = 0; i != NE; ++i) {
      for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
        SDValue Operand = N->getOperand(j);
          DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
  if (SplitSize == 32) {
    return unrollLaneOp(LaneOp.getNode());
  unsigned SubVecNumElt =
  SDValue Src0SubVec, Src1SubVec, Src2SubVec;
  for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
    if (IID == Intrinsic::amdgcn_writelane)
        IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
            ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
            : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
    EltIdx += SubVecNumElt;
  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
  if (IID == Intrinsic::amdgcn_writelane)
  SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
  switch (N->getOpcode()) {
    unsigned IID = N->getConstantOperandVal(0);
    case Intrinsic::amdgcn_make_buffer_rsrc:
      Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
    case Intrinsic::amdgcn_cvt_pkrtz: {
      Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {
      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
      EVT VT = N->getValueType(0);
      Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
    case Intrinsic::amdgcn_s_buffer_load: {
      if (!Subtarget->hasScalarSubwordLoads())
      EVT VT = Op.getValueType();
      assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
      if (!Offset->isDivergent()) {
      LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
    case Intrinsic::amdgcn_dead: {
      for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
    for (unsigned I = 0; I < Res.getNumOperands(); I++) {
      Results.push_back(Res.getOperand(I));
    Results.push_back(Res.getValue(1));
    EVT VT = N->getValueType(0);
    EVT SelectVT = NewVT;
    if (NewVT.bitsLT(MVT::i32)) {
      SelectVT = MVT::i32;
    if (NewVT != SelectVT)
    if (N->getValueType(0) != MVT::v2f16)
    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
    if (N->getValueType(0) != MVT::v2f16)
    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
    if (N->getValueType(0) != MVT::f16)
    if (U.get() != Value)
    if (U.getUser()->getOpcode() == Opcode)

unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else:
  case Intrinsic::amdgcn_loop:
  case Intrinsic::amdgcn_end_cf:

  if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())

  SDNode *Intr = BRCOND.getOperand(1).getNode();
  assert(BR && "brcond missing unconditional branch user");
  unsigned CFNode = isCFIntrinsic(Intr);
  Ops.push_back(Target);
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
  MVT VT = Op.getSimpleValueType();
  if (Op.getConstantOperandVal(0) != 0)
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  if (Info->isEntryFunction())

  return Op.getValueType().bitsLE(VT)

  EVT DstVT = Op.getValueType();
  unsigned Opc = Op.getOpcode();

  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();
  assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
  return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);

  if (DstVT == MVT::f16) {
    if (!Subtarget->has16BitInsts()) {
      return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
    if (Op->getFlags().hasApproximateFuncs()) {
      return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
         "custom lower FP_ROUND for f16 or bf16");
  assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");

  EVT VT = Op.getValueType();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  bool IsIEEEMode = Info->getMode().IEEE;
  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||

SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
  EVT VT = Op.getValueType();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  bool IsIEEEMode = Info->getMode().IEEE;
  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||

  EVT VT = Op.getValueType();
  assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
         !Subtarget->hasMinimum3Maximum3F16() &&
         Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
         "should not need to widen f16 minimum/maximum to v2f16");
  DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);

  EVT VT = Op.getValueType();
  EVT ExpVT = Exp.getValueType();
  if (ExpVT == MVT::i16)
               {Op.getOperand(0), Op.getOperand(1), TruncExp});
  return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
  switch (Op->getOpcode()) {

                                              DAGCombinerInfo &DCI) const {
  const unsigned Opc = Op.getOpcode();
          : Op->getOperand(0).getValueType();
  if (DCI.isBeforeLegalizeOps() ||
  auto &DAG = DCI.DAG;
    LHS = Op->getOperand(1);
    RHS = Op->getOperand(2);
    LHS = Op->getOperand(0);
    RHS = Op->getOperand(1);
  if (MagVT == SignVT)
  SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
  SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);

  EVT VT = Op.getValueType();
  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
  if (Op->isDivergent())
  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
  if (Op0SignBits >= 33 && Op1SignBits >= 33)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
  EVT VT = Op.getValueType();
  const APInt &C = RHSC->getAPIntValue();
  if (C.isPowerOf2()) {
    bool UseArithShift = isSigned && !C.isMinSignedValue();
  if (Op->isDivergent()) {
  if (Subtarget->hasSMulHi()) {

  if (!Subtarget->isTrapHandlerEnabled() ||
    return lowerTrapEndpgm(Op, DAG);
  return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
                                            : lowerTrapHsaQueuePtr(Op, DAG);
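// Trap lowering on HSA targets may need the queue pointer. When no user SGPR
// carries it, loadImplicitKernelArgument below reloads it from the implicit
// kernel-argument area instead; getSegmentAperture similarly prefers the
// aperture registers (SRC_SHARED_BASE / SRC_PRIVATE_BASE) when the subtarget
// exposes them, and otherwise falls back to an implicit kernel argument.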
SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
                                             ImplicitParameter Param) const {

      loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  if (UserSGPR == AMDGPU::NoRegister) {

  if (Subtarget->hasPrivEnabledTrap2NopBug())
  if (!Subtarget->isTrapHandlerEnabled() ||
      "debugtrap handler not supported",

SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
  if (Subtarget->hasApertureRegs()) {
        ? AMDGPU::SRC_SHARED_BASE
        : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !Subtarget->hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");
        {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  if (UserSGPR == AMDGPU::NoRegister) {
  return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);

  const AMDGPUTargetMachine &TM =
  unsigned DestAS, SrcAS;
  bool IsNonNull = false;
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(0);
    DestAS = ASC->getDestAddressSpace();
           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(1);
    SrcAS = Op->getConstantOperandVal(2);
    DestAS = Op->getConstantOperandVal(3);
        Subtarget->hasGloballyAddressableScratch()) {
          AMDGPU::S_MOV_B32, SL, MVT::i32,
          DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
    unsigned NullVal = TM.getNullPointerValue(DestAS);
        Subtarget->hasGloballyAddressableScratch()) {
      if (Subtarget->isWave64())
          57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
      CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
              AMDGPU::S_MOV_B64, SL, MVT::i64,
              DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
      CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
      SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
      CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
    unsigned NullVal = TM.getNullPointerValue(SrcAS);
      Op.getValueType() == MVT::i64) {
    const SIMachineFunctionInfo *Info =
    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
      Src.getValueType() == MVT::i64)
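// INSERT_VECTOR_ELT / EXTRACT_VECTOR_ELT on small (<= 64-bit) vectors are
// lowered below as integer bit operations: the vector is bitcast to an
// integer, the element is placed with shift/mask (a BFI pattern for inserts)
// and the result is bitcast back, avoiding a trip through memory for dynamic
// indices.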
  EVT InsVT = Ins.getValueType();
  assert(InsNumElts % 2 == 0 && "expect legal vector types");
  EVT NewInsVT = InsNumElts == 2 ? MVT::i32
                                 : EVT::getVectorVT(*DAG.getContext(),
                                                    MVT::i32, InsNumElts / 2);
  Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
  Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
  for (unsigned I = 0; I != InsNumElts / 2; ++I) {
    if (InsNumElts == 2) {
  return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
  for (unsigned I = 0; I != InsNumElts; ++I) {

  if (NumElts == 4 && EltSize == 16 && KIdx) {
    SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
    SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
    unsigned Idx = KIdx->getZExtValue();
    bool InsertLo = Idx < 2;
        DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
        DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
    InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
        : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
  assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
  return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
  EVT ResultVT = Op.getValueType();
  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
    if (VecSize == 128) {
    } else if (VecSize == 256) {
      for (unsigned P = 0; P < 4; ++P) {
          Parts[0], Parts[1]));
          Parts[2], Parts[3]));
      for (unsigned P = 0; P < 8; ++P) {
          Parts[0], Parts[1], Parts[2], Parts[3]));
          Parts[4], Parts[5], Parts[6], Parts[7]));
  Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
    return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);

  return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);

  return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
         !(Mask[Elt + 1] & 1);
  EVT ResultVT = Op.getValueType();
  const int NewSrcNumElts = 2;
  int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
  const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
    if (ShouldUseConsecutiveExtract &&
      int VecIdx = Idx < SrcNumElts ? 0 : 1;
      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
      if (Idx0 >= SrcNumElts) {
      if (Idx1 >= SrcNumElts) {
      int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
      int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
      int NewMaskIdx0 = Idx0 - AlignedIdx0;
      int NewMaskIdx1 = Idx1 - AlignedIdx1;
      if (SubVec0 != SubVec1) {
        NewMaskIdx1 += NewSrcNumElts;
                                       {NewMaskIdx0, NewMaskIdx1});
      int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
      int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
      int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
      int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;

  EVT ResultVT = Op.getValueType();

  EVT VT = Op.getValueType();
  if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
    assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
    return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
    return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
    return DAG.getNode(ISD::BITCAST, SL, VT, Or);
  for (unsigned P = 0; P < NumParts; ++P) {
        PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
  return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
  if (!Subtarget->isAmdHsaOS())

  EVT PtrVT = Op.getValueType();
  const GlobalValue *GV = GSD->getGlobal();
  assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
  if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
    if (Subtarget->has64BitLiterals()) {
  MachinePointerInfo PtrInfo =
  SDValue Param = lowerKernargMemParameter(
      "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
      "intrinsic not supported on subtarget", DL.getDebugLoc()));

  unsigned NumElts = Elts.size();
  if (NumElts <= 12) {
  for (unsigned i = 0; i < Elts.size(); ++i) {
  for (unsigned i = Elts.size(); i < NumElts; ++i)
  EVT SrcVT = Src.getValueType();

                                 bool Unpacked, bool IsD16, int DMaskPop,
                                 int NumVDataDwords, bool IsAtomicPacked16Bit,
  EVT ReqRetVT = ResultTypes[0];
  int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
                          ? (ReqRetNumElts + 1) / 2
  int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
  if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
  if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
                          NumDataDwords - MaskPopDwords);
  EVT LegalReqRetVT = ReqRetVT;
  if (!Data.getValueType().isInteger())
                       Data.getValueType().changeTypeToInteger(), Data);
  if (Result->getNumValues() == 1)

                         SDValue *LWE, bool &IsTexFail) {

                              unsigned DimIdx, unsigned EndIdx,
                              unsigned NumGradients) {
  for (unsigned I = DimIdx; I < EndIdx; I++) {
    if (((I + 1) >= EndIdx) ||
        ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
                                         I == DimIdx + NumGradients - 1))) {
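// lowerImage, which follows, is the common path for the MIMG image
// intrinsics: it derives the dmask and the VData/VAddr dword counts, packs
// 16-bit addresses and gradients (A16/G16), chooses between NSA and
// contiguous VAddr encodings, appends the TFE/LWE and D16 flags, and finally
// picks the MIMG opcode variant for the current subtarget generation.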
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
  int NumVDataDwords = 0;
  bool AdjustRetType = false;
  bool IsAtomicPacked16Bit = false;
  const unsigned ArgOffset = WithChain ? 2 : 1;
  unsigned DMaskLanes = 0;
  if (BaseOpcode->Atomic) {
    VData = Op.getOperand(2);
    IsAtomicPacked16Bit =
        (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
         Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
    if (BaseOpcode->AtomicX2) {
      ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
    if (BaseOpcode->Store) {
      VData = Op.getOperand(2);
      if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
        VData = handleD16VData(VData, DAG, true);
      NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
    } else if (!BaseOpcode->NoReturn) {
      if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
          (!LoadVT.isVector() && DMaskLanes > 1))
      if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
          !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
        NumVDataDwords = (DMaskLanes + 1) / 2;
        NumVDataDwords = DMaskLanes;
      AdjustRetType = true;
  unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
  MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
  IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
  VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
  MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
  IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
  if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
        {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
           "Bias needs to be converted to 16 bit in A16 mode");
  if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
    dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
              "require 16 bit args for both gradients and addresses");
  if (!ST->hasA16()) {
    LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
                         "support 16 bit addresses\n");
  if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
    IntrOpcode = G16MappingInfo->G16;
  for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
  const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
  const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
  const bool UseNSA = ST->hasNSAEncoding() &&
                      VAddrs.size() >= ST->getNSAThreshold(MF) &&
                      (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
  const bool UsePartialNSA =
      UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
  if (UsePartialNSA) {
                    ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
  } else if (!UseNSA) {
  if (!BaseOpcode->Sampler) {
    uint64_t UnormConst =
        Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
    Unorm = UnormConst ? True : False;
  bool IsTexFail = false;
  if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
    NumVDataDwords += 1;
    AdjustRetType = true;
  if (AdjustRetType) {
    if (DMaskLanes == 0 && !BaseOpcode->Store) {
                            MVT::i32, NumVDataDwords)
    ResultTypes[0] = NewVT;
    if (ResultTypes.size() == 3) {
      ResultTypes.erase(&ResultTypes[1]);
  if (BaseOpcode->Atomic)
  if (BaseOpcode->Store || BaseOpcode->Atomic)
    Ops.push_back(VData);
  if (UsePartialNSA) {
    Ops.push_back(VAddr);
    Ops.push_back(VAddr);
  if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
  Ops.push_back(Rsrc);
  if (BaseOpcode->Sampler) {
    Ops.push_back(Samp);
  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
    Ops.push_back(Unorm);
  Ops.push_back(IsA16 && ST->hasFeature(AMDGPU::FeatureR128A16)
  Ops.push_back(IsA16 ? True : False);
  if (!Subtarget->hasGFX90AInsts())
        "TFE is not supported on this GPU", DL.getDebugLoc()));
  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
  Ops.push_back(DimInfo->DA ? True : False);
  if (BaseOpcode->HasD16)
    Ops.push_back(IsD16 ? True : False);
  Ops.push_back(Op.getOperand(0));
  int NumVAddrDwords =
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX11Plus) {
                                   UseNSA ? AMDGPU::MIMGEncGfx11NSA
                                          : AMDGPU::MIMGEncGfx11Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX10Plus) {
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
    if (Subtarget->hasGFX90AInsts()) {
                                     NumVDataDwords, NumVAddrDwords);
          "requested image instruction is not supported on this GPU",
    for (EVT VT : OrigResultTypes) {
      if (VT == MVT::Other)
        RetValues[Idx++] = Op.getOperand(0);
                                   NumVDataDwords, NumVAddrDwords);
                                   NumVDataDwords, NumVAddrDwords);
  MachineMemOperand *MemRef = MemOp->getMemOperand();
  if (BaseOpcode->AtomicX2) {
  if (BaseOpcode->NoReturn)
                           Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
                           NumVDataDwords, IsAtomicPacked16Bit, DL);
                       MachinePointerInfo(),
  if (!Offset->isDivergent()) {
    if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
        !Subtarget->hasScalarDwordx3Loads()) {
  if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
    return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
  unsigned NumLoads = 1;
  if (NumElts == 8 || NumElts == 16) {
    NumLoads = NumElts / 4;
  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
      NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
  uint64_t InstOffset = Ops[5]->getAsZExtVal();
  for (unsigned i = 0; i < NumLoads; ++i) {
  if (NumElts == 8 || NumElts == 16)

  if (!Subtarget->hasArchitectedSGPRs())

  auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  EVT VT = Op.getValueType();
  unsigned IntrinsicID = Op.getConstantOperandVal(0);
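// LowerINTRINSIC_WO_CHAIN: the switch below maps each side-effect-free AMDGPU
// intrinsic either to a preloaded argument register (dispatch/queue pointer,
// workgroup and workitem ids, ...), to a dedicated ISD/AMDGPUISD node, or to
// a specialised helper such as lowerSBuffer, lowerLaneOp, or lowerImage.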
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_implicit_buffer_ptr: {
    return getPreloadedValue(DAG, *MFI, VT,
  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_queue_ptr: {
    if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
          MF.getFunction(), "unsupported hsa intrinsic without hsa target",
    auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
    return getPreloadedValue(DAG, *MFI, VT, RegID);
  case Intrinsic::amdgcn_implicitarg_ptr: {
      return getImplicitArgPtr(DAG, DL);
    return getPreloadedValue(DAG, *MFI, VT,
  case Intrinsic::amdgcn_kernarg_segment_ptr: {
    return getPreloadedValue(DAG, *MFI, VT,
  case Intrinsic::amdgcn_dispatch_id: {
  case Intrinsic::amdgcn_rcp:
  case Intrinsic::amdgcn_rsq:
  case Intrinsic::amdgcn_rsq_legacy:
  case Intrinsic::amdgcn_rcp_legacy:
  case Intrinsic::amdgcn_rsq_clamp: {
    return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
  case Intrinsic::r600_read_ngroups_x:
    if (Subtarget->isAmdHsaOS())
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
  case Intrinsic::r600_read_ngroups_y:
    if (Subtarget->isAmdHsaOS())
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
  case Intrinsic::r600_read_ngroups_z:
    if (Subtarget->isAmdHsaOS())
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
  case Intrinsic::r600_read_local_size_x:
    if (Subtarget->isAmdHsaOS())
    return lowerImplicitZextParam(DAG, Op, MVT::i16,
  case Intrinsic::r600_read_local_size_y:
    if (Subtarget->isAmdHsaOS())
    return lowerImplicitZextParam(DAG, Op, MVT::i16,
  case Intrinsic::r600_read_local_size_z:
    if (Subtarget->isAmdHsaOS())
    return lowerImplicitZextParam(DAG, Op, MVT::i16,
  case Intrinsic::amdgcn_workgroup_id_x:
    return getPreloadedValue(DAG, *MFI, VT,
  case Intrinsic::amdgcn_workgroup_id_y:
    return getPreloadedValue(DAG, *MFI, VT,
  case Intrinsic::amdgcn_workgroup_id_z:
    return getPreloadedValue(DAG, *MFI, VT,
  case Intrinsic::amdgcn_wave_id:
    return lowerWaveID(DAG, Op);
  case Intrinsic::amdgcn_lds_kernel_id: {
      return getLDSKernelId(DAG, DL);
    return getPreloadedValue(DAG, *MFI, VT,
  case Intrinsic::amdgcn_workitem_id_x:
    return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
  case Intrinsic::amdgcn_workitem_id_y:
    return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
  case Intrinsic::amdgcn_workitem_id_z:
    return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
  case Intrinsic::amdgcn_wavefrontsize:
                           SDLoc(Op), MVT::i32);
  case Intrinsic::amdgcn_s_buffer_load: {
    unsigned CPol = Op.getConstantOperandVal(3);
    return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3), DAG);
  case Intrinsic::amdgcn_fdiv_fast:
    return lowerFDIV_FAST(Op, DAG);
  case Intrinsic::amdgcn_sin:
  case Intrinsic::amdgcn_cos:
  case Intrinsic::amdgcn_mul_u24:
  case Intrinsic::amdgcn_mul_i24:
  case Intrinsic::amdgcn_log_clamp: {
  case Intrinsic::amdgcn_fract:
  case Intrinsic::amdgcn_class:
  case Intrinsic::amdgcn_div_fmas:
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
  case Intrinsic::amdgcn_div_fixup:
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_div_scale: {
    SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
                       Denominator, Numerator);
  case Intrinsic::amdgcn_icmp: {
    if (Op.getOperand(1).getValueType() == MVT::i1 &&
        Op.getConstantOperandVal(2) == 0 &&
  case Intrinsic::amdgcn_fcmp: {
  case Intrinsic::amdgcn_ballot:
  case Intrinsic::amdgcn_fmed3:
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_fdot2:
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
  case Intrinsic::amdgcn_fmul_legacy:
  case Intrinsic::amdgcn_sffbh:
  case Intrinsic::amdgcn_sbfe:
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_ubfe:
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_cvt_pkrtz:
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    EVT VT = Op.getValueType();
    if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
      return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
        DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
    return DAG.getNode(ISD::BITCAST, DL, VT, Node);
  case Intrinsic::amdgcn_fmad_ftz:
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_if_break:
                                    Op->getOperand(1), Op->getOperand(2)),
  case Intrinsic::amdgcn_groupstaticsize: {
    const GlobalValue *GV =
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
        DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
    unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
        Subtarget->hasGloballyAddressableScratch()) {
              AMDGPU::S_MOV_B32, DL, MVT::i32,
              DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
    SDValue Aperture = getSegmentAperture(AS, SL, DAG);
  case Intrinsic::amdgcn_perm:
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_reloc_constant: {
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    if (Op.getOperand(4).getValueType() == MVT::i32)
                       Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                       Op.getOperand(3), IndexKeyi32);
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
    if (Op.getOperand(4).getValueType() == MVT::i64)
                       {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
  case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
    EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
    if (Op.getOperand(6).getValueType() == IndexKeyTy)
                       {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
                        IndexKey, Op.getOperand(7),
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    if (Op.getOperand(6).getValueType() == MVT::i32)
                       {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
                        IndexKeyi32, Op.getOperand(7)});
  case Intrinsic::amdgcn_addrspacecast_nonnull:
    return lowerADDRSPACECAST(Op, DAG);
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_update_dpp:
  case Intrinsic::amdgcn_dead: {
    for (const EVT ValTy : Op.getNode()->values())
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
      return lowerImage(Op, ImageDimIntr, DAG, false);
  return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
                                                 unsigned NewOpcode) const {
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
                                 M->getMemOperand());

                                                    unsigned NewOpcode) const {
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
                                 M->getMemOperand());
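// LowerINTRINSIC_W_CHAIN: memory intrinsics are lowered below. Buffer
// resource operands are first normalised with bufferRsrcPtrToVector, and the
// offset operand is split by splitBufferOffsets into a voffset part and an
// immediate part before the target memory node is built.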
  unsigned IntrID = Op.getConstantOperandVal(1);
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    unsigned IndexOperand = M->getConstantOperandVal(7);
    unsigned WaveRelease = M->getConstantOperandVal(8);
    unsigned WaveDone = M->getConstantOperandVal(9);
    unsigned OrderedCountIndex = IndexOperand & 0x3f;
    IndexOperand &= ~0x3f;
    unsigned CountDw = 0;
      CountDw = (IndexOperand >> 24) & 0xf;
      IndexOperand &= ~(0xf << 24);
      if (CountDw < 1 || CountDw > 4) {
            Fn, "ds_ordered_count: dword count must be between 1 and 4",
            DL.getDebugLoc()));
    if (IndexOperand) {
          Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
    if (WaveDone && !WaveRelease) {
          Fn, "ds_ordered_count: wave_done requires wave_release",
          DL.getDebugLoc()));
    unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
    unsigned ShaderType =
    unsigned Offset0 = OrderedCountIndex << 2;
    unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
      Offset1 |= (CountDw - 1) << 6;
      Offset1 |= ShaderType << 2;
    unsigned Offset = Offset0 | (Offset1 << 8);
        M->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand());
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
    const bool IsFormat =
        IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
        IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
    return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
    const bool IsFormat =
        IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
        IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
    EVT LoadVT = Op.getValueType();
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
        Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
    EVT LoadVT = Op.getValueType();
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
        Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return lowerStructBufferAtomicIntrin(Op, DAG,
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
    return lowerRawBufferAtomicIntrin(Op, DAG,
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return lowerStructBufferAtomicIntrin(Op, DAG,
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
    return lowerStructBufferAtomicIntrin(Op, DAG,
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
    EVT VT = Op.getValueType();
        Op->getVTList(), Ops, VT,
        M->getMemOperand());
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
    SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
    EVT VT = Op.getValueType();
        Op->getVTList(), Ops, VT,
        M->getMemOperand());
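// The BVH intersect-ray intrinsics below are lowered directly to MIMG
// IMAGE_BVH* instructions: ray origin/direction lanes are packed according to
// the chosen encoding (NSA vs. contiguous VAddr), and the opcode is selected
// per generation via the MIMG opcode tables with the required VData/VAddr
// dword counts.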
  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
    SDValue NodePtr = M->getOperand(2);
    SDValue RayExtent = M->getOperand(3);
    SDValue InstanceMask = M->getOperand(4);
    SDValue RayOrigin = M->getOperand(5);
    SDValue RayDir = M->getOperand(6);
    SDValue TDescr = M->getOperand(8);
    if (!Subtarget->hasBVHDualAndBVH8Insts()) {
    bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
    const unsigned NumVDataDwords = 10;
    const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
        IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
               : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
        AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
    Ops.push_back(NodePtr);
        {DAG.getBitcast(MVT::i32, RayExtent),
         DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
    Ops.push_back(RayOrigin);
    Ops.push_back(RayDir);
    Ops.push_back(Offsets);
    Ops.push_back(TDescr);
    Ops.push_back(M->getChain());
    MachineMemOperand *MemRef = M->getMemOperand();
  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
    SDValue NodePtr = M->getOperand(2);
    SDValue RayExtent = M->getOperand(3);
    SDValue RayOrigin = M->getOperand(4);
    SDValue RayDir = M->getOperand(5);
    SDValue RayInvDir = M->getOperand(6);
    SDValue TDescr = M->getOperand(7);
    if (!Subtarget->hasGFX10_AEncoding()) {
    const unsigned NumVDataDwords = 4;
    const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
    const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
    const bool UseNSA = (Subtarget->hasNSAEncoding() &&
    const unsigned BaseOpcodes[2][2] = {
        {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
        {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
         AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
          IsGFX12Plus ? AMDGPU::MIMGEncGfx12
          : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                      : AMDGPU::MIMGEncGfx10NSA,
          NumVDataDwords, NumVAddrDwords);
          IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                  : AMDGPU::MIMGEncGfx10Default,
          NumVDataDwords, NumVAddrDwords);
    auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
      if (Lanes[0].getValueSizeInBits() == 32) {
        for (unsigned I = 0; I < 3; ++I)
        Ops.push_back(Lanes[2]);
    if (UseNSA && IsGFX11Plus) {
      Ops.push_back(NodePtr);
      Ops.push_back(RayOrigin);
      for (unsigned I = 0; I < 3; ++I) {
                                 {DirLanes[I], InvDirLanes[I]})));
      Ops.push_back(RayDir);
      Ops.push_back(RayInvDir);
      Ops.push_back(NodePtr);
      packLanes(RayOrigin, true);
      packLanes(RayDir, true);
      packLanes(RayInvDir, false);
      if (NumVAddrDwords > 12) {
        Ops.append(16 - Ops.size(), Undef);
      Ops.push_back(MergedOps);
    Ops.push_back(TDescr);
    Ops.push_back(M->getChain());
    MachineMemOperand *MemRef = M->getMemOperand();
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
    unsigned Opcode = 0;
    case Intrinsic::amdgcn_global_atomic_fmin_num:
    case Intrinsic::amdgcn_flat_atomic_fmin_num: {
      Opcode = ISD::ATOMIC_LOAD_FMIN;
    case Intrinsic::amdgcn_global_atomic_fmax_num:
    case Intrinsic::amdgcn_flat_atomic_fmax_num: {
      Opcode = ISD::ATOMIC_LOAD_FMAX;
    return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
                         Ops, M->getMemOperand());
  case Intrinsic::amdgcn_s_get_barrier_state:
  case Intrinsic::amdgcn_s_get_named_barrier_state: {
      if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
        BarID = (BarID >> 4) & 0x3F;
      Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
      Ops.push_back(Chain);
      Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
      if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
      Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
    EVT VT = Op->getValueType(0);
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
      return lowerImage(Op, ImageDimIntr, DAG, true);
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
  EVT VT = VTList.VTs[0];
  bool IsTFE = VTList.NumVTs == 3;
    unsigned NumOpDWords = NumValueDWords + 1;
    SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
    MachineMemOperand *OpDWordsMMO =
    SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
                                     OpDWordsVT, OpDWordsMMO, DAG);
        NumValueDWords == 1
  if (!Subtarget->hasDwordx3LoadStores() &&
      (VT == MVT::v3i32 || VT == MVT::v3f32)) {
    SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
                                 WidenedMemVT, WidenedMMO);

                                            bool ImageStore) const {
  if (Subtarget->hasUnpackedD16VMem()) {
  if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
    for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
    if ((NumElements % 2) == 1) {
      unsigned I = Elts.size() / 2;
    if (NumElements == 3) {
    return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
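// LowerINTRINSIC_VOID: the store-like intrinsics. D16 (f16) payloads are
// first repacked by handleD16VData, buffer resources and offsets are
// normalised as on the load path, and 8/16-bit payloads are routed through
// handleByteShortBufferStores.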
10736 unsigned IntrinsicID =
Op.getConstantOperandVal(1);
10739 switch (IntrinsicID) {
10740 case Intrinsic::amdgcn_exp_compr: {
10741 if (!Subtarget->hasCompressedExport()) {
10744 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
10756 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32, Src0),
10757 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32, Src1),
10766 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10770 case Intrinsic::amdgcn_struct_tbuffer_store:
10771 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10773 bool IsD16 = (VData.
getValueType().getScalarType() == MVT::f16);
10775 VData = handleD16VData(VData, DAG);
10776 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10777 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10795 M->getMemoryVT(),
M->getMemOperand());
10798 case Intrinsic::amdgcn_raw_tbuffer_store:
10799 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
10801     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10803       VData = handleD16VData(VData, DAG);
10804     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10805     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10823                                    M->getMemoryVT(), M->getMemOperand());
10826 case Intrinsic::amdgcn_raw_buffer_store:
10827 case Intrinsic::amdgcn_raw_ptr_buffer_store:
10828 case Intrinsic::amdgcn_raw_buffer_store_format:
10829 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
10830 const bool IsFormat =
10831 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
10832 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
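    // For illustration (signature sketched from the intrinsic family, not this
    // file), a store reaching this path typically originates from IR such as:
    //   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc,
    //                                               i32 %voffset, i32 %soffset,
    //                                               i32 0)
    // The *_format variants additionally go through the buffer-format path.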
10839 VData = handleD16VData(VData, DAG);
10849     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10850     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10870       return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
10873                                    M->getMemoryVT(), M->getMemOperand());
10876 case Intrinsic::amdgcn_struct_buffer_store:
10877 case Intrinsic::amdgcn_struct_ptr_buffer_store:
10878 case Intrinsic::amdgcn_struct_buffer_store_format:
10879 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
10880 const bool IsFormat =
10881 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
10882 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
10890 VData = handleD16VData(VData, DAG);
10900     auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10901     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10920     EVT VDataType = VData.getValueType().getScalarType();
10922       return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
10925                                    M->getMemoryVT(), M->getMemOperand());
10927 case Intrinsic::amdgcn_raw_buffer_load_lds:
10928 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
10929 case Intrinsic::amdgcn_struct_buffer_load_lds:
10930 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
10931 if (!Subtarget->hasVMemToLDSLoad())
10935 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
10936 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
10937 unsigned OpOffset = HasVIndex ? 1 : 0;
10938     SDValue VOffset = Op.getOperand(5 + OpOffset);
10940     unsigned Size = Op->getConstantOperandVal(4);
10946 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
10947 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
10948 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
10949 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
10952 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
10953 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
10954 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
10955 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
10958 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
10959 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
10960 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
10961 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
10964 if (!Subtarget->hasLDSLoadB96_B128())
10966 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
10967 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
10968 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
10969 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
10972 if (!Subtarget->hasLDSLoadB96_B128())
10974 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
10975 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
10976 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
10977 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
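    // Opcode selection above: the payload size (1, 2, 4, 12 or 16 bytes) picks
    // the UBYTE/USHORT/DWORD/DWORDX3/DWORDX4 LDS form, and the BOTHEN/IDXEN/
    // OFFEN/OFFSET suffix is chosen by which of VIndex/VOffset are present.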
10985 if (HasVIndex && HasVOffset)
10989 else if (HasVIndex)
10990       Ops.push_back(Op.getOperand(5));
10991     else if (HasVOffset)
10992       Ops.push_back(VOffset);
10994     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10995     Ops.push_back(Rsrc);
10996     Ops.push_back(Op.getOperand(6 + OpOffset));
10997     Ops.push_back(Op.getOperand(7 + OpOffset));
10999     unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11012     MachineMemOperand *LoadMMO = M->getMemOperand();
11017 MachinePointerInfo StorePtrI = LoadPtrI;
11041 case Intrinsic::amdgcn_load_to_lds:
11042 case Intrinsic::amdgcn_global_load_lds: {
11043 if (!Subtarget->hasVMemToLDSLoad())
11047 unsigned Size =
Op->getConstantOperandVal(4);
11052 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11055 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11058 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11061 if (!Subtarget->hasLDSLoadB96_B128())
11063 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11066 if (!Subtarget->hasLDSLoadB96_B128())
11068 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11084       if (LHS->isDivergent())
11088           RHS.getOperand(0).getValueType() == MVT::i32) {
11091         VOffset = RHS.getOperand(0);
11095 Ops.push_back(Addr);
11103 Ops.push_back(VOffset);
11106     Ops.push_back(Op.getOperand(5));
11107     Ops.push_back(Op.getOperand(6));
11112     MachineMemOperand *LoadMMO = M->getMemOperand();
11114     LoadPtrI.Offset = Op->getConstantOperandVal(5);
11115 MachinePointerInfo StorePtrI = LoadPtrI;
11134 case Intrinsic::amdgcn_end_cf:
11136 Op->getOperand(2), Chain),
11138 case Intrinsic::amdgcn_s_barrier_init:
11139 case Intrinsic::amdgcn_s_barrier_signal_var: {
11146 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11147 ? AMDGPU::S_BARRIER_INIT_M0
11148 : AMDGPU::S_BARRIER_SIGNAL_M0;
11163 constexpr unsigned ShAmt = 16;
11170     Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11175 case Intrinsic::amdgcn_s_barrier_join: {
11184 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11187 unsigned BarID = (BarVal >> 4) & 0x3F;
11190 Ops.push_back(Chain);
11192 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11202 Ops.push_back(
copyToM0(DAG, Chain,
DL, M0Val).getValue(0));
11208 case Intrinsic::amdgcn_s_prefetch_data: {
11211 return Op.getOperand(0);
11214 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11216         Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11223                                    Op->getVTList(), Ops, M->getMemoryVT(),
11224                                    M->getMemOperand());
11226 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11227 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11228 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11237     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11239       return lowerImage(Op, ImageDimIntr, DAG, true);
11264std::pair<SDValue, SDValue>
11294 unsigned Overflow = ImmOffset & ~MaxImm;
11295 ImmOffset -= Overflow;
11296 if ((int32_t)Overflow < 0) {
11297 Overflow += ImmOffset;
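  // Example of the split (assuming a 12-bit immediate field, i.e. MaxImm = 4095):
  // a combined constant offset of 5000 becomes ImmOffset = 904 with
  // Overflow = 4096, and the overflow is folded into the register offset below.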
11302     auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11321 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11323                                         Align Alignment) const {
11325   SDLoc DL(CombinedOffset);
11327     uint32_t Imm = C->getZExtValue();
11328 uint32_t SOffset, ImmOffset;
11329     if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11339     uint32_t SOffset, ImmOffset;
11342         TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11350 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11359 SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11362     return MaybePointer;
11376   SDValue NumRecords = Op->getOperand(3);
11379   auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11382 std::optional<uint32_t> ConstStride = std::nullopt;
11384 ConstStride = ConstNode->getZExtValue();
11387 if (!ConstStride || *ConstStride != 0) {
11390     ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
11401                              NewHighHalf, NumRecords, Flags);
11402   SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11411                                               bool IsTFE) const {
11420     SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11435   SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11439   LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11449 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11453 Ops[1] = BufferStoreExt;
11458 M->getMemOperand());
11483                                                  DAGCombinerInfo &DCI) const {
11484   SelectionDAG &DAG = DCI.DAG;
11499   if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11506 "unexpected vector extload");
11519 "unexpected fp extload");
11537     DCI.AddToWorklist(Cvt.getNode());
11542     DCI.AddToWorklist(Cvt.getNode());
11545     Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11553   if (Info.isEntryFunction())
11554     return Info.getUserSGPRInfo().hasFlatScratchInit();
11562   EVT MemVT = Load->getMemoryVT();
11563   MachineMemOperand *MMO = Load->getMemOperand();
11575 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11603   assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
11604          "Custom lowering for non-i32 vectors hasn't been implemented.");
11607   unsigned AS = Load->getAddressSpace();
11614   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11618 !Subtarget->hasMultiDwordFlatScratchAddressing())
11628       Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
11631       Alignment >= Align(4) && NumElements < 32) {
11633 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11645 if (NumElements > 4)
11648 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11658 switch (Subtarget->getMaxPrivateElementSize()) {
11664 if (NumElements > 2)
11669 if (NumElements > 4)
11672 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11681     auto Flags = Load->getMemOperand()->getFlags();
11683                                        Load->getAlign(), Flags, &Fast) &&
11692                                       MemVT, *Load->getMemOperand())) {
11701   EVT VT = Op.getValueType();
11728   return DAG.getNode(ISD::BITCAST, DL, VT, Res);
11738   EVT VT = Op.getValueType();
11739   const SDNodeFlags Flags = Op->getFlags();
11741   bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
11747 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11750 if (CLHS->isExactlyValue(1.0)) {
11767 if (CLHS->isExactlyValue(-1.0)) {
11776 if (!AllowInaccurateRcp &&
11777 ((VT != MVT::f16 && VT != MVT::bf16) || !
Flags.hasAllowReciprocal()))
11791   EVT VT = Op.getValueType();
11792   const SDNodeFlags Flags = Op->getFlags();
11794   bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
11795 if (!AllowInaccurateDiv)
11816   return DAG.getNode(Opcode, SL, VT, A, B, Flags);
11830   return DAG.getNode(Opcode, SL, VTList,
11839   return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
11853   return DAG.getNode(Opcode, SL, VTList,
11859   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11860     return FastLowered;
11863   EVT VT = Op.getValueType();
11870 if (VT == MVT::bf16) {
11893 unsigned FMADOpCode =
11895   SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
11900   SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
11902   Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
11903   Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
11909   Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
11919   SDNodeFlags Flags = Op->getFlags();
11926 const APFloat K0Val(0x1p+96f);
11929 const APFloat K1Val(0x1p-32f);
11956 assert(ST->hasDenormModeInst() &&
"Requires S_DENORM_MODE");
11957 uint32_t DPDenormModeDefault =
Info->getMode().fpDenormModeDPValue();
11958 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
11963   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11964     return FastLowered;
11970   SDNodeFlags Flags = Op->getFlags();
11971   Flags.setNoFPExcept(true);
11979   SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
11990       DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
11992 using namespace AMDGPU::Hwreg;
11993 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
11997   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
11998   const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12001 const bool HasDynamicDenormals =
12007 if (!PreservesDenormals) {
12012     SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12015     if (HasDynamicDenormals) {
12019       SavedDenormMode = SDValue(GetReg, 0);
12025 SDNode *EnableDenorm;
12026 if (Subtarget->hasDenormModeInst()) {
12027 const SDValue EnableDenormValue =
12034 const SDValue EnableDenormValue =
12036 EnableDenorm = DAG.
getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12037 {EnableDenormValue,
BitField, Glue});
12047 ApproxRcp, One, NegDivScale0, Flags);
12050 ApproxRcp, Fma0, Flags);
12056 NumeratorScaled,
Mul, Flags);
12062 NumeratorScaled, Fma3, Flags);
12064 if (!PreservesDenormals) {
12065 SDNode *DisableDenorm;
12066 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12070       SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12076       assert(HasDynamicDenormals == (bool)SavedDenormMode);
12077       const SDValue DisableDenormValue =
12078           HasDynamicDenormals
12083           AMDGPU::S_SETREG_B32, SL, MVT::Other,
12094                              {Fma4, Fma1, Fma3, Scale}, Flags);
12100   if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12101     return FastLowered;
12109   SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12113   SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12133 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12142     SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12143     SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
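// FDIV lowering dispatch below: f32 and f64 use the div_scale/div_fmas/
// div_fixup based sequences above, while f16/bf16 are handled via the
// extended single-precision path.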
12169   EVT VT = Op.getValueType();
12171   if (VT == MVT::f32)
12172     return LowerFDIV32(Op, DAG);
12174   if (VT == MVT::f64)
12175     return LowerFDIV64(Op, DAG);
12177   if (VT == MVT::f16 || VT == MVT::bf16)
12178     return LowerFDIV16(Op, DAG);
12187   EVT ResultExpVT = Op->getValueType(1);
12188 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12198 if (Subtarget->hasFractBug()) {
12216   EVT VT = Store->getMemoryVT();
12218   if (VT == MVT::i1) {
12222         Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12226          Store->getValue().getValueType().getScalarType() == MVT::i32);
12228   unsigned AS = Store->getAddressSpace();
12236   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12240 !Subtarget->hasMultiDwordFlatScratchAddressing())
12247 if (NumElements > 4)
12250 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12254 VT, *
Store->getMemOperand()))
12260 switch (Subtarget->getMaxPrivateElementSize()) {
12264 if (NumElements > 2)
12268 if (NumElements > 4 ||
12269 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12277 auto Flags =
Store->getMemOperand()->getFlags();
12296 assert(!Subtarget->has16BitInsts());
12297   SDNodeFlags Flags = Op->getFlags();
12299       DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12311   SDNodeFlags Flags = Op->getFlags();
12312   MVT VT = Op.getValueType().getSimpleVT();
12342   SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12345       DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12354   SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12420   SDNodeFlags Flags = Op->getFlags();
12466   SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12483   EVT VT = Op.getValueType();
12493   if (Subtarget->hasTrigReducedRange()) {
12500   switch (Op.getOpcode()) {
12527   EVT VT = Op.getValueType();
12535                                  Op->getVTList(), Ops, VT,
12544 SITargetLowering::performUCharToFloatCombine(SDNode *N,
12545                                              DAGCombinerInfo &DCI) const {
12546   EVT VT = N->getValueType(0);
12548 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12551 SelectionDAG &DAG = DCI.DAG;
12555 EVT SrcVT = Src.getValueType();
12561 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12564 DCI.AddToWorklist(Cvt.
getNode());
12567 if (ScalarVT != MVT::f32) {
12579 DAGCombinerInfo &DCI)
const {
12586 if (SignOp.
getOpcode() == ISD::FP_EXTEND ||
12590 SelectionDAG &DAG = DCI.DAG;
12609   for (unsigned I = 0; I != NumElts; ++I) {
12633   if (NewElts.size() == 1)
12655   for (unsigned I = 0; I != NumElts; ++I) {
12690 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12692                                                DAGCombinerInfo &DCI) const {
12710 SelectionDAG &DAG = DCI.DAG;
12723   AM.BaseOffs = Offset.getSExtValue();
12728   EVT VT = N->getValueType(0);
12734   Flags.setNoUnsignedWrap(
12735       N->getFlags().hasNoUnsignedWrap() &&
12745   switch (N->getOpcode()) {
12756                                                 DAGCombinerInfo &DCI) const {
12757 SelectionDAG &DAG = DCI.DAG;
12764 SDValue NewPtr = performSHLPtrCombine(
Ptr.getNode(),
N->getAddressSpace(),
12765 N->getMemoryVT(), DCI);
12769 NewOps[PtrIdx] = NewPtr;
12778   return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
12779          (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
12788SDValue SITargetLowering::splitBinaryBitConstantOp(
12792 uint32_t ValLo =
Lo_32(Val);
12793 uint32_t ValHi =
Hi_32(Val);
12800 if (Subtarget->has64BitLiterals() && CRHS->
hasOneUse() &&
12814 if (V.getValueType() != MVT::i1)
12816 switch (V.getOpcode()) {
12833 return V.getResNo() == 1;
12835 unsigned IntrinsicID = V.getConstantOperandVal(0);
12836 switch (IntrinsicID) {
12837 case Intrinsic::amdgcn_is_shared:
12838 case Intrinsic::amdgcn_is_private:
12855   if (!(C & 0x000000ff))
12856     ZeroByteMask |= 0x000000ff;
12857   if (!(C & 0x0000ff00))
12858     ZeroByteMask |= 0x0000ff00;
12859   if (!(C & 0x00ff0000))
12860     ZeroByteMask |= 0x00ff0000;
12861   if (!(C & 0xff000000))
12862     ZeroByteMask |= 0xff000000;
12863   uint32_t NonZeroByteMask = ~ZeroByteMask;
12864   if ((NonZeroByteMask & C) != NonZeroByteMask)
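// Byte-select masks below use the V_PERM_B32 convention of this file: each
// mask byte picks one source byte (0-7 across the two dword sources) and 0x0c
// denotes a constant zero byte, so 0x03020100 is the identity selection of a
// single dword.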
12877 assert(V.getValueSizeInBits() == 32);
12879 if (V.getNumOperands() != 2)
12888 switch (V.getOpcode()) {
12893 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
12898 return (0x03020100 & ~ConstMask) | ConstMask;
12905     return uint32_t((0x030201000c0c0c0cull << C) >> 32);
12911     return uint32_t(0x0c0c0c0c03020100ull >> C);
12918                                           DAGCombinerInfo &DCI) const {
12919   if (DCI.isBeforeLegalize())
12922   SelectionDAG &DAG = DCI.DAG;
12923   EVT VT = N->getValueType(0);
12928 if (VT == MVT::i64 && CRHS) {
12930 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::AND,
LHS, CRHS))
12934 if (CRHS && VT == MVT::i32) {
12944 unsigned Shift = CShift->getZExtValue();
12946 unsigned Offset = NB + Shift;
12947         if ((Offset & (Bits - 1)) == 0) {
12971       Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
12986       if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
12991       if (X != LHS.getOperand(1))
12995 const ConstantFPSDNode *C1 =
13029         (RHS.getOperand(0) == LHS.getOperand(0) &&
13030          LHS.getOperand(0) == LHS.getOperand(1))) {
13032       unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13033                                           : Mask->getZExtValue() & OrdMask;
13054       N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13057 if (LHSMask != ~0u && RHSMask != ~0u) {
13060 if (LHSMask > RHSMask) {
13067 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13068 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13071 if (!(LHSUsedLanes & RHSUsedLanes) &&
13074 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13080         uint32_t Mask = LHSMask & RHSMask;
13081         for (unsigned I = 0; I < 32; I += 8) {
13082           uint32_t ByteSel = 0xff << I;
13083           if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13084             Mask &= (0x0c << I) & 0xffffffff;
13089         uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13142 static const std::optional<ByteProvider<SDValue>>
13144                                        unsigned Depth = 0) {
13147     return std::nullopt;
13149   if (Op.getValueSizeInBits() < 8)
13150     return std::nullopt;
13152   if (Op.getValueType().isVector())
13155   switch (Op->getOpcode()) {
13167 NarrowVT = VTSign->getVT();
13170 return std::nullopt;
13173 if (SrcIndex >= NarrowByteWidth)
13174 return std::nullopt;
13182 return std::nullopt;
13184 uint64_t BitShift = ShiftOp->getZExtValue();
13186 if (BitShift % 8 != 0)
13187 return std::nullopt;
13189 SrcIndex += BitShift / 8;
13207 static const std::optional<ByteProvider<SDValue>>
13209                                         unsigned StartingIndex = 0) {
13213     return std::nullopt;
13215   unsigned BitWidth = Op.getScalarValueSizeInBits();
13217     return std::nullopt;
13219     return std::nullopt;
13221   bool IsVec = Op.getValueType().isVector();
13222   switch (Op.getOpcode()) {
13225 return std::nullopt;
13230 return std::nullopt;
13234 return std::nullopt;
13237     if (!LHS->isConstantZero() && !RHS->isConstantZero())
13238       return std::nullopt;
13239     if (!LHS || LHS->isConstantZero())
13241     if (!RHS || RHS->isConstantZero())
13243     return std::nullopt;
13248 return std::nullopt;
13252 return std::nullopt;
13254 uint32_t BitMask = BitMaskOp->getZExtValue();
13256 uint32_t IndexMask = 0xFF << (Index * 8);
13258 if ((IndexMask & BitMask) != IndexMask) {
13261 if (IndexMask & BitMask)
13262 return std::nullopt;
13271 return std::nullopt;
13275 if (!ShiftOp ||
Op.getValueType().isVector())
13276 return std::nullopt;
13278 uint64_t BitsProvided =
Op.getValueSizeInBits();
13279 if (BitsProvided % 8 != 0)
13280 return std::nullopt;
13282 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13284 return std::nullopt;
13286 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13287 uint64_t ByteShift = BitShift / 8;
13289 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13290 uint64_t BytesProvided = BitsProvided / 8;
13291 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13292 NewIndex %= BytesProvided;
13299 return std::nullopt;
13303 return std::nullopt;
13305 uint64_t BitShift = ShiftOp->getZExtValue();
13307 return std::nullopt;
13309 auto BitsProvided =
Op.getScalarValueSizeInBits();
13310 if (BitsProvided % 8 != 0)
13311 return std::nullopt;
13313 uint64_t BytesProvided = BitsProvided / 8;
13314 uint64_t ByteShift = BitShift / 8;
13319 return BytesProvided - ByteShift > Index
13327 return std::nullopt;
13331 return std::nullopt;
13333 uint64_t BitShift = ShiftOp->getZExtValue();
13334 if (BitShift % 8 != 0)
13335 return std::nullopt;
13336 uint64_t ByteShift = BitShift / 8;
13342 return Index < ByteShift
13345 Depth + 1, StartingIndex);
13354 return std::nullopt;
13362 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13364 if (NarrowBitWidth % 8 != 0)
13365 return std::nullopt;
13366 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13368 if (Index >= NarrowByteWidth)
13370 ? std::optional<ByteProvider<SDValue>>(
13378 return std::nullopt;
13382 if (NarrowByteWidth >= Index) {
13387 return std::nullopt;
13394 return std::nullopt;
13400 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13401 if (NarrowBitWidth % 8 != 0)
13402 return std::nullopt;
13403 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13408 if (Index >= NarrowByteWidth) {
13410 ? std::optional<ByteProvider<SDValue>>(
13415 if (NarrowByteWidth > Index) {
13419 return std::nullopt;
13424 return std::nullopt;
13427 Depth + 1, StartingIndex);
13433 return std::nullopt;
13434 auto VecIdx = IdxOp->getZExtValue();
13435 auto ScalarSize =
Op.getScalarValueSizeInBits();
13436 if (ScalarSize < 32)
13437 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13439 StartingIndex, Index);
13444 return std::nullopt;
13448 return std::nullopt;
13451 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13452 if (IdxMask > 0x07 && IdxMask != 0x0c)
13453 return std::nullopt;
13455 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13456 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13458 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
13464 return std::nullopt;
13479 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
13486 auto MemVT = L->getMemoryVT();
13489 return L->getMemoryVT().getSizeInBits() == 16;
13499 int Low8 = Mask & 0xff;
13500 int Hi8 = (Mask & 0xff00) >> 8;
13502 assert(Low8 < 8 && Hi8 < 8);
13504 bool IsConsecutive = (Hi8 - Low8 == 1);
13509 bool Is16Aligned = !(Low8 % 2);
13511 return IsConsecutive && Is16Aligned;
13519 int Low16 = PermMask & 0xffff;
13520 int Hi16 = (PermMask & 0xffff0000) >> 16;
13530 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13532 if (!OtherOpIs16Bit)
13540 unsigned DWordOffset) {
13545 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13550 if (Src.getValueType().isVector()) {
13551 auto ScalarTySize = Src.getScalarValueSizeInBits();
13552 auto ScalarTy = Src.getValueType().getScalarType();
13553 if (ScalarTySize == 32) {
13557 if (ScalarTySize > 32) {
13560 DAG.
getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13561 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13568 assert(ScalarTySize < 32);
13569 auto NumElements =
TypeSize / ScalarTySize;
13570 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13571 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13572 auto NumElementsIn32 = 32 / ScalarTySize;
13573 auto NumAvailElements = DWordOffset < Trunc32Elements
13575 : NumElements - NormalizedTrunc;
13588 auto ShiftVal = 32 * DWordOffset;
13596   [[maybe_unused]] EVT VT = N->getValueType(0);
13601   for (int i = 0; i < 4; i++) {
13603     std::optional<ByteProvider<SDValue>> P =
13606     if (!P || P->isConstantZero())
13611   if (PermNodes.size() != 4)
13614   std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13615   std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13617   for (size_t i = 0; i < PermNodes.size(); i++) {
13618 auto PermOp = PermNodes[i];
13621 int SrcByteAdjust = 4;
13625 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13626 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13628 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13629 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13633 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13634 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13637 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13639 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13642 SDValue Op = *PermNodes[FirstSrc.first].Src;
13644 assert(
Op.getValueSizeInBits() == 32);
13648 int Low16 = PermMask & 0xffff;
13649 int Hi16 = (PermMask & 0xffff0000) >> 16;
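  // A "well formed" half selects an aligned 16-bit chunk of a single source
  // (0x0100/0x0504 for the low half, 0x0302/0x0706 for the high half); when
  // both halves are well formed the value is presumably representable with
  // 16-bit pieces instead of a full V_PERM_B32.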
13651 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13652 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13655 if (WellFormedLow && WellFormedHi)
13659 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src :
Op;
13668 assert(
Op.getValueType().isByteSized() &&
13686                                          DAGCombinerInfo &DCI) const {
13687   SelectionDAG &DAG = DCI.DAG;
13691   EVT VT = N->getValueType(0);
13692   if (VT == MVT::i1) {
13697       if (Src != RHS.getOperand(0))
13702 if (!CLHS || !CRHS)
13706 static const uint32_t MaxMask = 0x3ff;
13726 Sel |=
LHS.getConstantOperandVal(2);
13735 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13739 auto usesCombinedOperand = [](SDNode *OrUse) {
13741 if (OrUse->getOpcode() != ISD::BITCAST ||
13742 !OrUse->getValueType(0).isVector())
13746 for (
auto *VUser : OrUse->users()) {
13747 if (!VUser->getValueType(0).isVector())
13754 if (VUser->getOpcode() == VectorwiseOp)
13760 if (!
any_of(
N->users(), usesCombinedOperand))
13766 if (LHSMask != ~0u && RHSMask != ~0u) {
13769 if (LHSMask > RHSMask) {
13776 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13777 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13780 if (!(LHSUsedLanes & RHSUsedLanes) &&
13783 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13785 LHSMask &= ~RHSUsedLanes;
13786 RHSMask &= ~LHSUsedLanes;
13788 LHSMask |= LHSUsedLanes & 0x04040404;
13790 uint32_t Sel = LHSMask | RHSMask;
13798 if (LHSMask == ~0u || RHSMask == ~0u) {
13804 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
13819 if (SrcVT == MVT::i32) {
13824 DCI.AddToWorklist(LowOr.
getNode());
13825 DCI.AddToWorklist(HiBits.getNode());
13829 return DAG.
getNode(ISD::BITCAST, SL, MVT::i64, Vec);
13836 N->getOperand(0), CRHS))
13844 DAGCombinerInfo &DCI)
const {
13845 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
13852 SelectionDAG &DAG = DCI.DAG;
13854 EVT VT =
N->getValueType(0);
13855 if (CRHS && VT == MVT::i64) {
13857 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::XOR,
LHS, CRHS))
13871         DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
13873         DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
13877                                     LHS->getOperand(0), FNegLHS, FNegRHS);
13878     return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
13886 DAGCombinerInfo &DCI)
const {
13887 if (!Subtarget->has16BitInsts() ||
13891 EVT VT =
N->getValueType(0);
13892 if (VT != MVT::i32)
13896 if (Src.getValueType() != MVT::i16)
13903 SITargetLowering::performSignExtendInRegCombine(SDNode *N,
13904                                                 DAGCombinerInfo &DCI) const {
13911 VTSign->getVT() == MVT::i8) ||
13913 VTSign->getVT() == MVT::i16))) {
13914 assert(Subtarget->hasScalarSubwordLoads() &&
13915 "s_buffer_load_{u8, i8} are supported "
13916 "in GFX12 (or newer) architectures.");
13917 EVT VT = Src.getValueType();
13922 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
13929 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
13930 Opc,
DL, ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
13935 VTSign->getVT() == MVT::i8) ||
13937 VTSign->getVT() == MVT::i16)) &&
13946 Src.getOperand(6), Src.getOperand(7)};
13949 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
13953     SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
13954         Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
13955     return DCI.DAG.getMergeValues(
13956         {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
13962                                                 DAGCombinerInfo &DCI) const {
13963   SelectionDAG &DAG = DCI.DAG;
13970   if (N->getOperand(0).isUndef())
13977                                              DAGCombinerInfo &DCI) const {
13978   EVT VT = N->getValueType(0);
13993   if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14003                                             unsigned MaxDepth) const {
14004   unsigned Opcode = Op.getOpcode();
14009     const auto &F = CFP->getValueAPF();
14010     if (F.isNaN() && F.isSignaling())
14012     if (!F.isDenormal())
14038 case ISD::FP_EXTEND:
14039 case ISD::FP16_TO_FP:
14040 case ISD::FP_TO_FP16:
14041 case ISD::BF16_TO_FP:
14042 case ISD::FP_TO_BF16:
14075 if (
Op.getValueType() == MVT::i32) {
14081 if (RHS->getZExtValue() == 0xffff0000) {
14091 return Op.getValueType().getScalarType() != MVT::f16;
14095 case ISD::FMINNUM_IEEE:
14096 case ISD::FMAXNUM_IEEE:
14097 case ISD::FMINIMUM:
14098 case ISD::FMAXIMUM:
14099 case ISD::FMINIMUMNUM:
14100 case ISD::FMAXIMUMNUM:
14112 if (Subtarget->supportsMinMaxDenormModes() ||
14122     for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14134     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14161     if (Op.getValueType() == MVT::i16) {
14164         TruncSrc.getOpcode() == ISD::BITCAST &&
14172     unsigned IntrinsicID = Op.getConstantOperandVal(0);
14174 switch (IntrinsicID) {
14175 case Intrinsic::amdgcn_cvt_pkrtz:
14176 case Intrinsic::amdgcn_cubeid:
14177 case Intrinsic::amdgcn_frexp_mant:
14178 case Intrinsic::amdgcn_fdot2:
14179 case Intrinsic::amdgcn_rcp:
14180 case Intrinsic::amdgcn_rsq:
14181 case Intrinsic::amdgcn_rsq_clamp:
14182 case Intrinsic::amdgcn_rcp_legacy:
14183 case Intrinsic::amdgcn_rsq_legacy:
14184 case Intrinsic::amdgcn_trig_preop:
14185 case Intrinsic::amdgcn_tanh:
14186 case Intrinsic::amdgcn_log:
14187 case Intrinsic::amdgcn_exp2:
14188 case Intrinsic::amdgcn_sqrt:
14206 unsigned MaxDepth)
const {
14209 unsigned Opcode =
MI->getOpcode();
14211 if (Opcode == AMDGPU::G_FCANONICALIZE)
14214 std::optional<FPValueAndVReg> FCR;
14217 if (FCR->Value.isSignaling())
14219 if (!FCR->Value.isDenormal())
14230 case AMDGPU::G_FADD:
14231 case AMDGPU::G_FSUB:
14232 case AMDGPU::G_FMUL:
14233 case AMDGPU::G_FCEIL:
14234 case AMDGPU::G_FFLOOR:
14235 case AMDGPU::G_FRINT:
14236 case AMDGPU::G_FNEARBYINT:
14237 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14238 case AMDGPU::G_INTRINSIC_TRUNC:
14239 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14240 case AMDGPU::G_FMA:
14241 case AMDGPU::G_FMAD:
14242 case AMDGPU::G_FSQRT:
14243 case AMDGPU::G_FDIV:
14244 case AMDGPU::G_FREM:
14245 case AMDGPU::G_FPOW:
14246 case AMDGPU::G_FPEXT:
14247 case AMDGPU::G_FLOG:
14248 case AMDGPU::G_FLOG2:
14249 case AMDGPU::G_FLOG10:
14250 case AMDGPU::G_FPTRUNC:
14251 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14252 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14253 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14254 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14255 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14257 case AMDGPU::G_FNEG:
14258 case AMDGPU::G_FABS:
14259 case AMDGPU::G_FCOPYSIGN:
14261 case AMDGPU::G_FMINNUM:
14262 case AMDGPU::G_FMAXNUM:
14263 case AMDGPU::G_FMINNUM_IEEE:
14264 case AMDGPU::G_FMAXNUM_IEEE:
14265 case AMDGPU::G_FMINIMUM:
14266 case AMDGPU::G_FMAXIMUM:
14267 case AMDGPU::G_FMINIMUMNUM:
14268 case AMDGPU::G_FMAXIMUMNUM: {
14269 if (Subtarget->supportsMinMaxDenormModes() ||
14276 case AMDGPU::G_BUILD_VECTOR:
14281 case AMDGPU::G_INTRINSIC:
14282 case AMDGPU::G_INTRINSIC_CONVERGENT:
14284 case Intrinsic::amdgcn_fmul_legacy:
14285 case Intrinsic::amdgcn_fmad_ftz:
14286 case Intrinsic::amdgcn_sqrt:
14287 case Intrinsic::amdgcn_fmed3:
14288 case Intrinsic::amdgcn_sin:
14289 case Intrinsic::amdgcn_cos:
14290 case Intrinsic::amdgcn_log:
14291 case Intrinsic::amdgcn_exp2:
14292 case Intrinsic::amdgcn_log_clamp:
14293 case Intrinsic::amdgcn_rcp:
14294 case Intrinsic::amdgcn_rcp_legacy:
14295 case Intrinsic::amdgcn_rsq:
14296 case Intrinsic::amdgcn_rsq_clamp:
14297 case Intrinsic::amdgcn_rsq_legacy:
14298 case Intrinsic::amdgcn_div_scale:
14299 case Intrinsic::amdgcn_div_fmas:
14300 case Intrinsic::amdgcn_div_fixup:
14301 case Intrinsic::amdgcn_fract:
14302 case Intrinsic::amdgcn_cvt_pkrtz:
14303 case Intrinsic::amdgcn_cubeid:
14304 case Intrinsic::amdgcn_cubema:
14305 case Intrinsic::amdgcn_cubesc:
14306 case Intrinsic::amdgcn_cubetc:
14307 case Intrinsic::amdgcn_frexp_mant:
14308 case Intrinsic::amdgcn_fdot2:
14309 case Intrinsic::amdgcn_trig_preop:
14310 case Intrinsic::amdgcn_tanh:
14329   if (C.isDenormal()) {
14343   if (C.isSignaling()) {
14366 SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14367                                               DAGCombinerInfo &DCI) const {
14368   SelectionDAG &DAG = DCI.DAG;
14370   EVT VT = N->getValueType(0);
14379     EVT VT = N->getValueType(0);
14380     return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14396     EVT EltVT = Lo.getValueType();
14399 for (
unsigned I = 0;
I != 2; ++
I) {
14403 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14404 }
else if (
Op.isUndef()) {
14438 case ISD::FMAXNUM_IEEE:
14439 case ISD::FMAXIMUMNUM:
14441 case ISD::FMAXIMUM:
14448 case ISD::FMINNUM_IEEE:
14449 case ISD::FMINIMUMNUM:
14451 case ISD::FMINIMUM:
14477 if (!MinK || !MaxK)
14490 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14491 return DAG.
getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14550 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
14556 if (
Info->getMode().DX10Clamp) {
14565 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14593 case ISD::FMINNUM_IEEE:
14594 case ISD::FMAXNUM_IEEE:
14595 case ISD::FMINIMUMNUM:
14596 case ISD::FMAXIMUMNUM:
14599     return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
14601   case ISD::FMINIMUM:
14602   case ISD::FMAXIMUM:
14610     return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
14619                                            DAGCombinerInfo &DCI) const {
14620 SelectionDAG &DAG = DCI.DAG;
14652     if (SDValue Med3 = performIntMed3ImmCombine(
14657     if (SDValue Med3 = performIntMed3ImmCombine(
14663     if (SDValue Med3 = performIntMed3ImmCombine(
14668     if (SDValue Med3 = performIntMed3ImmCombine(
14678   if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14679        (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14680        (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14683 (VT == MVT::f32 || VT == MVT::f64 ||
14684 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14685 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14686 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14687 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14689     if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
14696   const SDNodeFlags Flags = N->getFlags();
14697   if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
14698       !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
14700         Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
14701     return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
14711 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
14712 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
14721                                              DAGCombinerInfo &DCI) const {
14722   EVT VT = N->getValueType(0);
14726   SelectionDAG &DAG = DCI.DAG;
14741   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14745   if (Info->getMode().DX10Clamp) {
14765                                        DAGCombinerInfo &DCI) const {
14769     return DCI.DAG.getUNDEF(N->getValueType(0));
14777 bool IsDivergentIdx,
14782 unsigned VecSize = EltSize * NumElem;
14785 if (VecSize <= 64 && EltSize < 32)
14794 if (IsDivergentIdx)
14798   unsigned NumInsts = NumElem + ((EltSize + 31) / 32) * NumElem;
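  // Rough cost model: NumInsts estimates one compare/select per element per
  // 32-bit chunk; the thresholds below (16 with VGPR index mode, 15 with
  // movrel) bound how large an expansion is still considered profitable.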
14803 if (Subtarget->useVGPRIndexMode())
14804 return NumInsts <= 16;
14808 if (Subtarget->hasMovrel())
14809 return NumInsts <= 15;
14815   SDValue Idx = N->getOperand(N->getNumOperands() - 1);
14830 SITargetLowering::performExtractVectorEltCombine(SDNode *N,
14831                                                  DAGCombinerInfo &DCI) const {
14837   EVT ResVT = N->getValueType(0);
14856   if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
14875 case ISD::FMAXNUM_IEEE:
14876 case ISD::FMINNUM_IEEE:
14877 case ISD::FMAXIMUM:
14878 case ISD::FMINIMUM: {
14884 DCI.AddToWorklist(Elt0.
getNode());
14885 DCI.AddToWorklist(Elt1.
getNode());
14907 if (!DCI.isBeforeLegalize())
14915 VecSize > 32 && VecSize % 32 == 0 && Idx) {
14918 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
14919 unsigned EltIdx = BitIndex / 32;
14920 unsigned LeftoverBitIdx = BitIndex % 32;
14924     DCI.AddToWorklist(Cast.getNode());
14928     DCI.AddToWorklist(Elt.getNode());
14931     DCI.AddToWorklist(Srl.getNode());
14935     DCI.AddToWorklist(Trunc.getNode());
14937     if (VecEltVT == ResVT) {
14938       return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
14949 SITargetLowering::performInsertVectorEltCombine(SDNode *N,
14950                                                 DAGCombinerInfo &DCI) const {
14961 SelectionDAG &DAG = DCI.DAG;
14980 if (Src.getOpcode() == ISD::FP_EXTEND &&
14981 Src.getOperand(0).getValueType() == MVT::f16) {
14982 return Src.getOperand(0);
14986 APFloat Val = CFP->getValueAPF();
14987 bool LosesInfo =
true;
14997 DAGCombinerInfo &DCI)
const {
14998 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
14999 "combine only useful on gfx8");
15001 SDValue TruncSrc =
N->getOperand(0);
15002 EVT VT =
N->getValueType(0);
15003 if (VT != MVT::f16)
15010 SelectionDAG &DAG = DCI.DAG;
15038 return DAG.
getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15041unsigned SITargetLowering::getFusedOpcode(
const SelectionDAG &DAG,
15043 const SDNode *N1)
const {
15048 if (((VT == MVT::f32 &&
15050 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15070   EVT VT = N->getValueType(0);
15071   if (VT != MVT::i32 && VT != MVT::i64)
15077   unsigned Opc = N->getOpcode();
15132     if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15151                                           DAGCombinerInfo &DCI) const {
15154   SelectionDAG &DAG = DCI.DAG;
15155   EVT VT = N->getValueType(0);
15165   if (!N->isDivergent() && Subtarget->hasSMulHi())
15169 if (NumBits <= 32 || NumBits > 64)
15180 if (!Subtarget->hasFullRate64Ops()) {
15181 unsigned NumUsers = 0;
15182 for (SDNode *User :
LHS->
users()) {
15185 if (!
User->isAnyAdd())
15209 bool MulSignedLo =
false;
15210 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15219 if (VT != MVT::i64) {
15242 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15244 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15245 auto [AccumLo, AccumHi] = DAG.
SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15247 if (!MulLHSUnsigned32) {
15254 if (!MulRHSUnsigned32) {
15265 if (VT != MVT::i64)
15271 SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15272                                                   DAGCombinerInfo &DCI) const {
15282 SelectionDAG &DAG = DCI.DAG;
15297 unsigned Opcode =
N->getOpcode();
15298 if (Opcode == ISD::PTRADD)
15301 DAG.
getNode(Opcode, SL, MVT::i32,
Hi, ConstHi32,
N->getFlags());
15312static std::optional<ByteProvider<SDValue>>
15315 if (!Byte0 || Byte0->isConstantZero()) {
15316 return std::nullopt;
15319 if (Byte1 && !Byte1->isConstantZero()) {
15320 return std::nullopt;
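// addPermMasks merges two byte-select masks in which 0x0c marks "no byte from
// this source": e.g. combining 0x0c0c0100 with 0x07060c0c yields 0x07060100,
// taking the defined bytes from whichever mask provides them.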
15326   unsigned FirstCs = First & 0x0c0c0c0c;
15327   unsigned SecondCs = Second & 0x0c0c0c0c;
15328   unsigned FirstNoCs = First & ~0x0c0c0c0c;
15329 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15331 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15332 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15333 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15334 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15336 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15360 for (
int BPI = 0; BPI < 2; BPI++) {
15363 BPP = {Src1, Src0};
15365 unsigned ZeroMask = 0x0c0c0c0c;
15366 unsigned FMask = 0xFF << (8 * (3 - Step));
15368 unsigned FirstMask =
15369 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15370 unsigned SecondMask =
15371 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15375 int FirstGroup = -1;
15376   for (int I = 0; I < 2; I++) {
15378     auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15379       return IterElt.SrcOp == *BPP.first.Src &&
15380              (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15384     if (Match != Srcs.end()) {
15385       Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15390 if (FirstGroup != -1) {
15392 auto MatchesSecond = [&BPP](
DotSrc &IterElt) {
15393 return IterElt.SrcOp == *BPP.second.Src &&
15394 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15397 if (Match != Srcs.
end()) {
15398 Match->PermMask =
addPermMasks(SecondMask, Match->PermMask);
15400 Srcs.
push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15408 unsigned ZeroMask = 0x0c0c0c0c;
15409 unsigned FMask = 0xFF << (8 * (3 - Step));
15413 ((Src0.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15417 ((Src1.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15426 if (Srcs.
size() == 1) {
15427 auto *Elt = Srcs.
begin();
15431 if (Elt->PermMask == 0x3020100)
15438 auto *FirstElt = Srcs.
begin();
15439 auto *SecondElt = std::next(FirstElt);
15446 auto FirstMask = FirstElt->PermMask;
15447 auto SecondMask = SecondElt->PermMask;
15449 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15450 unsigned FirstPlusFour = FirstMask | 0x04040404;
15453 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15465 FirstElt = std::next(SecondElt);
15466     if (FirstElt == Srcs.end())
15469     SecondElt = std::next(FirstElt);
15472     if (SecondElt == Srcs.end()) {
15478           DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15484   return Perms.size() == 2
15490   for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15491 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15492 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15493 EntryMask += ZeroMask;
15498 auto Opcode =
Op.getOpcode();
15504static std::optional<bool>
15515 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15518 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15520 assert(!(S0IsUnsigned && S0IsSigned));
15521 assert(!(S1IsUnsigned && S1IsSigned));
15529 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15535 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15536 return std::nullopt;
15548 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15549 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15554 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15560 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15561 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15562 return std::nullopt;
15568                                            DAGCombinerInfo &DCI) const {
15569   SelectionDAG &DAG = DCI.DAG;
15570   EVT VT = N->getValueType(0);
15576   if (Subtarget->hasMad64_32()) {
15577     if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15582   if (SDValue V = reassociateScalarOps(N, DAG)) {
15586   if (VT == MVT::i64) {
15587     if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15592 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
15594 std::optional<bool> IsSigned;
15600 int ChainLength = 0;
15601 for (
int I = 0;
I < 4;
I++) {
15605 auto Src0 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15608 auto Src1 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15613 TempNode->getOperand(MulIdx), *Src0, *Src1,
15614 TempNode->getOperand(MulIdx)->getOperand(0),
15615 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15619 IsSigned = *IterIsSigned;
15620 if (*IterIsSigned != *IsSigned)
15623 auto AddIdx = 1 - MulIdx;
15626 if (
I == 2 &&
isMul(TempNode->getOperand(AddIdx))) {
15627 Src2s.
push_back(TempNode->getOperand(AddIdx));
15637 TempNode->getOperand(AddIdx), *Src0, *Src1,
15638 TempNode->getOperand(AddIdx)->getOperand(0),
15639 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15643 if (*IterIsSigned != *IsSigned)
15647 ChainLength =
I + 2;
15651 TempNode = TempNode->getOperand(AddIdx);
15653 ChainLength =
I + 1;
15654 if (TempNode->getNumOperands() < 2)
15656 LHS = TempNode->getOperand(0);
15657 RHS = TempNode->getOperand(1);
15660 if (ChainLength < 2)
15666 if (ChainLength < 4) {
15676     bool UseOriginalSrc = false;
15677     if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
15678         Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
15679         Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
15680         Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
15681 SmallVector<unsigned, 4> SrcBytes;
15682 auto Src0Mask = Src0s.
begin()->PermMask;
15683 SrcBytes.
push_back(Src0Mask & 0xFF000000);
15684 bool UniqueEntries =
true;
15685 for (
auto I = 1;
I < 4;
I++) {
15686 auto NextByte = Src0Mask & (0xFF << ((3 -
I) * 8));
15689 UniqueEntries =
false;
15695 if (UniqueEntries) {
15696 UseOriginalSrc =
true;
15698 auto *FirstElt = Src0s.
begin();
15702 auto *SecondElt = Src1s.
begin();
15704 SecondElt->DWordOffset);
15713 if (!UseOriginalSrc) {
15720 DAG.
getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
15723 : Intrinsic::amdgcn_udot4,
15733 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
15738   unsigned Opc = LHS.getOpcode();
15750   auto Cond = RHS.getOperand(0);
15755   SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
15772                                               DAGCombinerInfo &DCI) const {
15773   SelectionDAG &DAG = DCI.DAG;
15775   EVT VT = N->getValueType(0);
15788 SDNodeFlags ShlFlags = N1->
getFlags();
15792 SDNodeFlags NewShlFlags =
15797 DCI.AddToWorklist(Inner.
getNode());
15804 if (Subtarget->hasMad64_32()) {
15805     if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15814   if (VT == MVT::i64) {
15815     if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15823     if (const GlobalAddressSDNode *GA =
15828       SDNodeFlags Flags =
15831       DCI.AddToWorklist(Inner.getNode());
15859 SDNodeFlags ReassocFlags =
15862 if (ZIsConstant != YIsConstant) {
15866 DCI.AddToWorklist(Inner.
getNode());
15874 assert(!YIsConstant && !ZIsConstant);
15876 if (!
X->isDivergent() &&
Y->isDivergent() !=
Z->isDivergent()) {
15885 if (
Y->isDivergent())
15888 DCI.AddToWorklist(UniformInner.
getNode());
15896                                            DAGCombinerInfo &DCI) const {
15897   SelectionDAG &DAG = DCI.DAG;
15898   EVT VT = N->getValueType(0);
15900   if (VT == MVT::i64) {
15901     if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15905   if (VT != MVT::i32)
15914   unsigned Opc = RHS.getOpcode();
15921   auto Cond = RHS.getOperand(0);
15926   SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
15944 SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
15945                                                  DAGCombinerInfo &DCI) const {
15947   if (N->getValueType(0) != MVT::i32)
15953   SelectionDAG &DAG = DCI.DAG;
15958   unsigned LHSOpc = LHS.getOpcode();
15959   unsigned Opc = N->getOpcode();
15963     return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
15969                                             DAGCombinerInfo &DCI) const {
15973   SelectionDAG &DAG = DCI.DAG;
15974   EVT VT = N->getValueType(0);
15986     if (A == LHS.getOperand(1)) {
15987       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
15988       if (FusedOp != 0) {
15990         return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
15998 if (
A ==
RHS.getOperand(1)) {
15999 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
16000 if (FusedOp != 0) {
16002 return DAG.
getNode(FusedOp, SL, VT,
A, Two,
LHS);
16011                                             DAGCombinerInfo &DCI) const {
16015   SelectionDAG &DAG = DCI.DAG;
16017   EVT VT = N->getValueType(0);
16030     if (A == LHS.getOperand(1)) {
16031       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16032       if (FusedOp != 0) {
16036         return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16045 if (
A ==
RHS.getOperand(1)) {
16046 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
16047 if (FusedOp != 0) {
16049 return DAG.
getNode(FusedOp, SL, VT,
A, NegTwo,
LHS);
16058                                             DAGCombinerInfo &DCI) const {
16059   SelectionDAG &DAG = DCI.DAG;
16061   EVT VT = N->getValueType(0);
16062   if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
16068   SDNodeFlags Flags = N->getFlags();
16069   SDNodeFlags RHSFlags = RHS->getFlags();
16075   bool IsNegative = false;
16076 if (CLHS->isExactlyValue(1.0) ||
16077 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16080 if (
RHS.getOpcode() == ISD::FSQRT) {
16084 return IsNegative ? DAG.
getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16093 DAGCombinerInfo &DCI)
const {
16094 SelectionDAG &DAG = DCI.DAG;
16095 EVT VT =
N->getValueType(0);
16099 if (!
N->isDivergent() &&
getSubtarget()->hasSALUFloatInsts() &&
16100 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16115 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16120 const ConstantFPSDNode *FalseNode =
16130 if (ScalarVT == MVT::f32 &&
16136 if (TrueNodeExpVal == INT_MIN)
16139 if (FalseNodeExpVal == INT_MIN)
16152 return DAG.
getNode(ISD::FLDEXP, SL, VT,
LHS, SelectNode,
N->getFlags());
16159 DAGCombinerInfo &DCI)
const {
16160 SelectionDAG &DAG = DCI.DAG;
16161 EVT VT =
N->getValueType(0);
16164 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16182 (
N->getFlags().hasAllowContract() &&
16183 FMA->getFlags().hasAllowContract())) {
16198 if (FMAOp1.
getOpcode() != ISD::FP_EXTEND ||
16217 if (Vec1 == Vec2 || Vec3 == Vec4)
16223 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16232                                             DAGCombinerInfo &DCI) const {
16233   SelectionDAG &DAG = DCI.DAG;
16238   EVT VT = LHS.getValueType();
16267     return LHS.getOperand(0);
16275         LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16282       const APInt &CT = LHS.getConstantOperandAPInt(1);
16283       const APInt &CF = LHS.getConstantOperandAPInt(2);
16291 return LHS.getOperand(0);
16295 if (VT != MVT::f32 && VT != MVT::f64 &&
16296 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16304 LHS.getOpcode() == ISD::FABS) {
16311 const unsigned IsInfMask =
16313 const unsigned IsFiniteMask =
16327 SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16328                                              DAGCombinerInfo &DCI) const {
16329   SelectionDAG &DAG = DCI.DAG;
16329 SelectionDAG &DAG = DCI.DAG;
16350 unsigned ShiftOffset = 8 *
Offset;
16352 ShiftOffset -=
C->getZExtValue();
16354 ShiftOffset +=
C->getZExtValue();
16356 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16358 MVT::f32, Shifted);
16369 DCI.AddToWorklist(
N);
16376 return DAG.
getNode(
N->getOpcode(), SL, MVT::f32, DemandedSrc);
16382 DAGCombinerInfo &DCI)
const {
16387 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16391       (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16392     return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16395   APFloat One(F.getSemantics(), "1.0");
16397     return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16403 DAGCombinerInfo &DCI)
const {
16424 bool isFloatingPoint =
LHS.getValueType().isFloatingPoint();
16425 bool isInteger =
LHS.getValueType().isInteger();
16428 if (!isFloatingPoint && !isInteger)
16433 if (!isEquality && !isNonEquality)
16450 if (isFloatingPoint) {
16452 if (!Val.
isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16463 if (!(isEquality && TrueVal == ConstVal) &&
16464 !(isNonEquality && FalseVal == ConstVal))
16471 SelectLHS, SelectRHS);
16476 switch (
N->getOpcode()) {
16492 if (
auto Res = promoteUniformOpToI32(
SDValue(
N, 0), DCI))
16502   switch (N->getOpcode()) {
16504     return performAddCombine(N, DCI);
16506     return performPtrAddCombine(N, DCI);
16508     return performSubCombine(N, DCI);
16511     return performAddCarrySubCarryCombine(N, DCI);
16513     return performFAddCombine(N, DCI);
16515     return performFSubCombine(N, DCI);
16517     return performFDivCombine(N, DCI);
16519     return performFMulCombine(N, DCI);
16521     return performSetCCCombine(N, DCI);
16523     if (auto Res = performSelectCombine(N, DCI))
16528 case ISD::FMAXNUM_IEEE:
16529 case ISD::FMINNUM_IEEE:
16530 case ISD::FMAXIMUM:
16531 case ISD::FMINIMUM:
16532 case ISD::FMAXIMUMNUM:
16533 case ISD::FMINIMUMNUM:
16540 return performMinMaxCombine(
N, DCI);
16542 return performFMACombine(
N, DCI);
16544 return performAndCombine(
N, DCI);
16546 return performOrCombine(
N, DCI);
16549     if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
16550         TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16556     return performXorCombine(N, DCI);
16558     return performZeroExtendCombine(N, DCI);
16560     return performSignExtendInRegCombine(N, DCI);
16562     return performClassCombine(N, DCI);
16564     return performFCanonicalizeCombine(N, DCI);
16566     return performRcpCombine(N, DCI);
16581     return performUCharToFloatCombine(N, DCI);
16583     return performFCopySignCombine(N, DCI);
16588     return performCvtF32UByteNCombine(N, DCI);
16590     return performFMed3Combine(N, DCI);
16592     return performCvtPkRTZCombine(N, DCI);
16594     return performClampCombine(N, DCI);
16597     EVT VT = N->getValueType(0);
16600     if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16603       EVT EltVT = Src.getValueType();
16604       if (EltVT != MVT::i16)
16605         Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
16608       return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
16614     return performExtractVectorEltCombine(N, DCI);
16616     return performInsertVectorEltCombine(N, DCI);
16618     return performFPRoundCombine(N, DCI);
16627 return performMemSDNodeCombine(MemNode, DCI);
16658 unsigned Opcode =
Node->getMachineOpcode();
16661 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16662 if (D16Idx >= 0 &&
Node->getConstantOperandVal(D16Idx))
16665 SDNode *
Users[5] = {
nullptr};
16667 unsigned DmaskIdx =
16668 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16669 unsigned OldDmask =
Node->getConstantOperandVal(DmaskIdx);
16670 unsigned NewDmask = 0;
16671 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16672 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16673 bool UsesTFC = (int(TFEIdx) >= 0 &&
Node->getConstantOperandVal(TFEIdx)) ||
16674 (
int(LWEIdx) >= 0 &&
Node->getConstantOperandVal(LWEIdx));
16675 unsigned TFCLane = 0;
16676 bool HasChain =
Node->getNumValues() > 1;
16678 if (OldDmask == 0) {
16686 TFCLane = OldBitsSet;
16690   for (SDUse &Use : Node->uses()) {
16693     if (Use.getResNo() != 0)
16696     SDNode *User = Use.getUser();
16699     if (!User->isMachineOpcode() ||
16700         User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
16712     if (UsesTFC && Lane == TFCLane) {
16717       for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
16719 Dmask &= ~(1 << Comp);
16727 NewDmask |= 1 << Comp;
16732 bool NoChannels = !NewDmask;
16739 if (OldBitsSet == 1)
16745 if (NewDmask == OldDmask)
16754 unsigned NewChannels = BitsSet + UsesTFC;
16758 assert(NewOpcode != -1 &&
16759 NewOpcode !=
static_cast<int>(
Node->getMachineOpcode()) &&
16760 "failed to find equivalent MIMG op");
16768 MVT SVT =
Node->getValueType(0).getVectorElementType().getSimpleVT();
16770 MVT ResultVT = NewChannels == 1
16773 : NewChannels == 5 ? 8
16775 SDVTList NewVTList =
16778 MachineSDNode *NewNode =
16787 if (NewChannels == 1) {
16797 for (
unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
16802 if (i || !NoChannels)
16807 if (NewUser != User) {
16817 Idx = AMDGPU::sub1;
16820 Idx = AMDGPU::sub2;
16823 Idx = AMDGPU::sub3;
16826 Idx = AMDGPU::sub4;
16837 Op =
Op.getOperand(0);
16858 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
16862 Node->getOperand(0), SL, VReg, SrcVal,
16868 return ToResultReg.
getNode();
16873 for (
unsigned i = 0; i <
Node->getNumOperands(); ++i) {
16875 Ops.push_back(
Node->getOperand(i));
16881 Node->getOperand(i).getValueType(),
16882 Node->getOperand(i)),
16894   unsigned Opcode = Node->getMachineOpcode();
16896   if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
16897       !TII->isGather4(Opcode) &&
16899     return adjustWritemask(Node, DAG);
16902 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
16908 case AMDGPU::V_DIV_SCALE_F32_e64:
16909 case AMDGPU::V_DIV_SCALE_F64_e64: {
16919 (Src0 == Src1 || Src0 == Src2))
16975 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::vdata);
16976 unsigned InitIdx = 0;
16978   if (TII->isImage(MI)) {
16986     unsigned TFEVal = TFE ? TFE->getImm() : 0;
16987     unsigned LWEVal = LWE ? LWE->getImm() : 0;
16988     unsigned D16Val = D16 ? D16->getImm() : 0;
16990     if (!TFEVal && !LWEVal)
17001     assert(MO_Dmask && "Expected dmask operand in instruction");
17003     unsigned dmask = MO_Dmask->getImm();
17008 bool Packed = !Subtarget->hasUnpackedD16VMem();
17010 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17016 TRI.getRegSizeInBits(*
TII->getOpRegClass(
MI, DstIdx)) / 32;
17017 if (DstSize < InitIdx)
17020 InitIdx =
TRI.getRegSizeInBits(*
TII->getOpRegClass(
MI, DstIdx)) / 32;
17028 Register PrevDst =
MRI.cloneVirtualRegister(
MI.getOperand(DstIdx).getReg());
17029 unsigned NewDst = 0;
17034 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17035 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17038 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17039 NewDst =
MRI.createVirtualRegister(
TII->getOpRegClass(
MI, DstIdx));
17059 MI.tieOperands(DstIdx,
MI.getNumOperands() - 1);
17072 if (
TII->isVOP3(
MI.getOpcode())) {
17074 TII->legalizeOperandsVOP3(
MRI,
MI);
17079 if (!
MI.getDesc().operands().empty()) {
17080 unsigned Opc =
MI.getOpcode();
17081 bool HasAGPRs = Info->mayNeedAGPRs();
17083 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src2);
17085 {AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src0),
17086 AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src1), Src2Idx}) {
17089 if ((
I == Src2Idx) && (HasAGPRs))
17092 if (!
Op.isReg() || !
Op.getReg().isVirtual())
17094 auto *RC =
TRI->getRegClassForReg(
MRI,
Op.getReg());
17095 if (!
TRI->hasAGPRs(RC))
17097 auto *Src =
MRI.getUniqueVRegDef(
Op.getReg());
17098 if (!Src || !Src->isCopy() ||
17099 !
TRI->isSGPRReg(
MRI, Src->getOperand(1).getReg()))
17101 auto *NewRC =
TRI->getEquivalentVGPRClass(RC);
17105 MRI.setRegClass(
Op.getReg(), NewRC);
17108 if (
TII->isMAI(
MI)) {
17113 int Src0Idx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
17114 AMDGPU::OpName::scale_src0);
17115 if (Src0Idx != -1) {
17116 int Src1Idx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
17117 AMDGPU::OpName::scale_src1);
17118 if (
TII->usesConstantBus(
MRI,
MI, Src0Idx) &&
17119 TII->usesConstantBus(
MRI,
MI, Src1Idx))
17120 TII->legalizeOpWithMove(
MI, Src1Idx);
17128 if (
auto *Src2 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src2)) {
17129 if (Src2->isReg() && Src2->getReg().isVirtual()) {
17130 auto *RC =
TRI->getRegClassForReg(
MRI, Src2->getReg());
17131 if (
TRI->isVectorSuperClass(RC)) {
17132 auto *NewRC =
TRI->getEquivalentAGPRClass(RC);
17133 MRI.setRegClass(Src2->getReg(), NewRC);
17134 if (Src2->isTied())
17135 MRI.setRegClass(
MI.getOperand(0).getReg(), NewRC);
17144 if (
TII->isImage(
MI))
17145 TII->enforceOperandRCAlignment(
MI, AMDGPU::OpName::vaddr);
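// --- Illustrative sketch (not from SIISelLowering.cpp) ---
// Sketch of the InitIdx computation visible above: when TFE/LWE is enabled an
// image load writes one extra status dword after the data, so the destination
// needs ActiveLanes data dwords (halved, rounded up, when D16 data is packed)
// plus one. countDstDwords is a made-up name for illustration.
#include <cassert>

static unsigned countDstDwords(unsigned ActiveLanes, bool D16, bool Packed) {
  unsigned DataDwords = (D16 && Packed) ? (ActiveLanes + 1) / 2 : ActiveLanes;
  return DataDwords + 1; // +1 for the TFE/LWE status dword
}

int main() {
  assert(countDstDwords(4, /*D16=*/false, /*Packed=*/true) == 5);
  assert(countDstDwords(3, /*D16=*/true, /*Packed=*/true) == 3);  // 3 halves -> 2 dwords + 1
  assert(countDstDwords(3, /*D16=*/true, /*Packed=*/false) == 4); // unpacked d16: one dword per lane
  return 0;
}
// --- End of illustrative sketch ---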
std::pair<unsigned, const TargetRegisterClass *>
// ...
  if (Constraint.size() == 1) {
    // ...
    if (VT == MVT::Other)
    // ...
    switch (Constraint[0]) {
    // ...
      RC = &AMDGPU::SReg_32RegClass;
      // ...
      RC = &AMDGPU::SGPR_64RegClass;
      // ...
        return std::pair(0U, nullptr);
      // ...
      RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
                                           : &AMDGPU::VGPR_32_Lo256RegClass;
      // ...
      RC = Subtarget->has1024AddressableVGPRs()
               ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
      // ...
        return std::pair(0U, nullptr);
      // ...
      if (!Subtarget->hasMAIInsts())
      // ...
      RC = &AMDGPU::AGPR_32RegClass;
      // ...
        return std::pair(0U, nullptr);
    // ...
  } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
    // ...
      RC = &AMDGPU::AV_32RegClass;
      // ...
      RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
      // ...
        return std::pair(0U, nullptr);
  // ...
    return std::pair(0U, RC);
  // ...
  if (Kind != '\0') {
    // ...
      RC = &AMDGPU::VGPR_32_Lo256RegClass;
    } else if (Kind == 's') {
      RC = &AMDGPU::SGPR_32RegClass;
    } else if (Kind == 'a') {
      RC = &AMDGPU::AGPR_32RegClass;
    // ...
      return std::pair(0U, nullptr);
    // ...
      return std::pair(0U, nullptr);
    // ...
      RC = TRI->getVGPRClassForBitWidth(Width);
      // ...
      RC = TRI->getSGPRClassForBitWidth(Width);
      // ...
      RC = TRI->getAGPRClassForBitWidth(Width);
      // ...
      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
      // ...
        return std::pair(0U, nullptr);
    // ...
    return std::pair(Reg, RC);
  // ...
    return std::pair(0U, nullptr);
  if (Idx < RC->getNumRegs())
  // ...
    return std::pair(0U, nullptr);
  // ...
  Ret.second = TRI->getPhysRegBaseClass(Ret.first);
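// --- Illustrative sketch (not from SIISelLowering.cpp) ---
// The constraint handling above picks a register class from the constraint
// letter ('s' scalar, 'v' vector, 'a' accumulator) and the value width. A
// minimal standalone sketch of that dispatch, using a simplified three-way
// enum instead of the real TargetRegisterClass machinery; classifyConstraint
// and regsForWidth are made-up names.
#include <cassert>

enum class RegBank { SGPR, VGPR, AGPR, Unknown };

static RegBank classifyConstraint(char Kind) {
  switch (Kind) {
  case 's': return RegBank::SGPR;
  case 'v': return RegBank::VGPR;
  case 'a': return RegBank::AGPR;
  default:  return RegBank::Unknown;
  }
}

// Round a value width up to the 32-bit register granularity, mirroring the
// "class for bit width" lookups (getSGPRClassForBitWidth etc.) in spirit only.
static unsigned regsForWidth(unsigned BitWidth) {
  return (BitWidth + 31) / 32;
}

int main() {
  assert(classifyConstraint('v') == RegBank::VGPR);
  assert(classifyConstraint('x') == RegBank::Unknown);
  assert(regsForWidth(64) == 2 && regsForWidth(96) == 3);
  return 0;
}
// --- End of illustrative sketch ---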
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    // ...
  } else if (Constraint == "DA" || Constraint == "DB") {
// ...
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    // ...
  } else if (Constraint.size() == 2) {
    if (Constraint == "VA")
                                                  std::vector<SDValue> &Ops,
// ...
  unsigned Size = Op.getScalarValueSizeInBits();
  // ...
  if (Size == 16 && !Subtarget->has16BitInsts())
  // ...
      Val = C->getSExtValue();
      // ...
      Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
    // ...
    if (Size != 16 || Op.getNumOperands() != 2)
    // ...
    if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
    // ...
      Val = C->getSExtValue();
      // ...
      Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
// ...
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    // ...
  } else if (Constraint.size() == 2) {
    if (Constraint == "DA") {
      int64_t HiBits = static_cast<int32_t>(Val >> 32);
      int64_t LoBits = static_cast<int32_t>(Val);
      // ...
    if (Constraint == "DB") {
// ...
                                              unsigned MaxSize) const {
  unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
  bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
  // ...
  MVT VT = Op.getSimpleValueType();
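// --- Illustrative sketch (not from SIISelLowering.cpp) ---
// The "DA"/"DB" handling above validates a 64-bit immediate by splitting it
// into sign-extended 32-bit halves and checking each half separately. Sketch
// with a placeholder per-half predicate: isValidHalf stands in for the real
// inline-constant / literal check, which depends on the constraint kind and
// subtarget features.
#include <cassert>
#include <cstdint>

static bool isValidHalf(int64_t Half) {
  // Placeholder acceptance window for illustration only.
  return Half >= -16 && Half <= 64;
}

static bool checkSplit64(uint64_t Val) {
  int64_t HiBits = static_cast<int32_t>(Val >> 32);
  int64_t LoBits = static_cast<int32_t>(Val);
  return isValidHalf(HiBits) && isValidHalf(LoBits);
}

int main() {
  assert(checkSplit64(0x0000000400000010ULL)); // halves 4 and 16 both pass
  assert(!checkSplit64(0x12345678ULL << 32));  // high half too large
  return 0;
}
// --- End of illustrative sketch ---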
  switch (UnalignedClassID) {
  case AMDGPU::VReg_64RegClassID:
    return AMDGPU::VReg_64_Align2RegClassID;
  case AMDGPU::VReg_96RegClassID:
    return AMDGPU::VReg_96_Align2RegClassID;
  case AMDGPU::VReg_128RegClassID:
    return AMDGPU::VReg_128_Align2RegClassID;
  case AMDGPU::VReg_160RegClassID:
    return AMDGPU::VReg_160_Align2RegClassID;
  case AMDGPU::VReg_192RegClassID:
    return AMDGPU::VReg_192_Align2RegClassID;
  case AMDGPU::VReg_224RegClassID:
    return AMDGPU::VReg_224_Align2RegClassID;
  case AMDGPU::VReg_256RegClassID:
    return AMDGPU::VReg_256_Align2RegClassID;
  case AMDGPU::VReg_288RegClassID:
    return AMDGPU::VReg_288_Align2RegClassID;
  case AMDGPU::VReg_320RegClassID:
    return AMDGPU::VReg_320_Align2RegClassID;
  case AMDGPU::VReg_352RegClassID:
    return AMDGPU::VReg_352_Align2RegClassID;
  case AMDGPU::VReg_384RegClassID:
    return AMDGPU::VReg_384_Align2RegClassID;
  case AMDGPU::VReg_512RegClassID:
    return AMDGPU::VReg_512_Align2RegClassID;
  case AMDGPU::VReg_1024RegClassID:
    return AMDGPU::VReg_1024_Align2RegClassID;
  case AMDGPU::AReg_64RegClassID:
    return AMDGPU::AReg_64_Align2RegClassID;
  case AMDGPU::AReg_96RegClassID:
    return AMDGPU::AReg_96_Align2RegClassID;
  case AMDGPU::AReg_128RegClassID:
    return AMDGPU::AReg_128_Align2RegClassID;
  case AMDGPU::AReg_160RegClassID:
    return AMDGPU::AReg_160_Align2RegClassID;
  case AMDGPU::AReg_192RegClassID:
    return AMDGPU::AReg_192_Align2RegClassID;
  case AMDGPU::AReg_256RegClassID:
    return AMDGPU::AReg_256_Align2RegClassID;
  case AMDGPU::AReg_512RegClassID:
    return AMDGPU::AReg_512_Align2RegClassID;
  case AMDGPU::AReg_1024RegClassID:
    return AMDGPU::AReg_1024_Align2RegClassID;
  if (Info->isEntryFunction()) {
  // ...
  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  // ...
          ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
          : TRI->getAlignedHighSGPRForRC(MF, 2, &AMDGPU::SGPR_64RegClass);
  Info->setSGPRForEXECCopy(SReg);
  // ...
  assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
                             Info->getStackPtrOffsetReg()));
  if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
    MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
  // ...
  if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
    MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
  // ...
  if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
    MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
  // ...
  Info->limitOccupancy(MF);
  // ...
  if (ST.isWave32() && !MF.empty()) {
    for (auto &MBB : MF) {
      for (auto &MI : MBB) {
        TII->fixImplicitOperands(MI);
  // ...
  if (ST.needsAlignedVGPRs()) {
    for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
      // ...
      if (NewClassID != -1)
        MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
                                                     const APInt &DemandedElts,
                                                     // ...
                                                     unsigned Depth) const {
  // ...
  unsigned Opc = Op.getOpcode();
  // ...
    unsigned IID = Op.getConstantOperandVal(0);
    // ...
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi: {
      // ...
          IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
    // ...
      Op, Known, DemandedElts, DAG, Depth);
  // ...
  unsigned MaxValue =
// ...
                             unsigned BFEWidth, bool SExt, unsigned Depth) {
  // ...
  unsigned Src1Cst = 0;
  if (Src1.isImm()) {
    Src1Cst = Src1.getImm();
  } else if (Src1.isReg()) {
    // ...
    Src1Cst = Cst->Value.getZExtValue();
  // ...
  if (Width >= BFEWidth)
  // ...
    Known = Known.sext(BFEWidth);
  // ...
    Known = Known.zext(BFEWidth);
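// --- Illustrative sketch (not from SIISelLowering.cpp) ---
// knownBitsForSBFE above models S_BFE_{I,U}{32,64}: the second source packs a
// bit offset in its low bits and a field width starting at bit 16; the
// selected field is then zero- or sign-extended to the full register width
// (exact field sizes differ between the 32- and 64-bit forms). The sketch
// below evaluates the same extraction on a concrete 64-bit value; bfe is a
// made-up helper, not the KnownBits code.
#include <cassert>
#include <cstdint>

static uint64_t bfe(uint64_t Src, uint32_t OffWidth, bool Signed) {
  unsigned Offset = OffWidth & 0x3f;
  unsigned Width = (OffWidth >> 16) & 0x7f;
  if (Width == 0)
    return 0;
  uint64_t Mask = (Width < 64) ? ((1ULL << Width) - 1) : ~0ULL;
  uint64_t Field = (Src >> Offset) & Mask;
  if (Signed && Width < 64 && (Field & (1ULL << (Width - 1))))
    Field |= ~Mask; // sign-extend the extracted field
  return Field;
}

int main() {
  // Extract 8 bits starting at bit 4 of 0xabcd: the field is 0xbc.
  assert(bfe(0xabcd, (8u << 16) | 4u, /*Signed=*/false) == 0xbc);
  // Same field, signed: 0xbc has its top bit set, so it sign-extends.
  assert(bfe(0xabcd, (8u << 16) | 4u, /*Signed=*/true) == 0xFFFFFFFFFFFFFFBCULL);
  return 0;
}
// --- End of illustrative sketch ---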
                                                      unsigned Depth) const {
  // ...
  switch (MI->getOpcode()) {
  case AMDGPU::S_BFE_I32:
  // ...
  case AMDGPU::S_BFE_U32:
  // ...
  case AMDGPU::S_BFE_I64:
  // ...
  case AMDGPU::S_BFE_U64:
  // ...
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    // ...
    case Intrinsic::amdgcn_workitem_id_x:
    // ...
    case Intrinsic::amdgcn_workitem_id_y:
    // ...
    case Intrinsic::amdgcn_workitem_id_z:
    // ...
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi: {
    // ...
    case Intrinsic::amdgcn_groupstaticsize: {
  // ...
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
  // ...
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
  // ...
  case AMDGPU::G_AMDGPU_SMED3:
  case AMDGPU::G_AMDGPU_UMED3: {
    auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
                                                        unsigned Depth) const {
  // ...
  AttributeList Attrs =
  // ...
  if (MaybeAlign RetAlign = Attrs.getRetAlignment())
// ...
  if (Header->getAlignment() != PrefAlign)
    return Header->getAlignment();
  // ...
  unsigned LoopSize = 0;
  // ...
      LoopSize += MBB->getAlignment().value() / 2;
    // ...
      LoopSize += TII->getInstSizeInBytes(MI);
      if (LoopSize > 192)
  // ...
  if (LoopSize <= 64)
  // ...
  if (LoopSize <= 128)
    return CacheLineAlign;
  // ...
    auto I = Exit->getFirstNonDebugInstr();
    if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
      return CacheLineAlign;
  // ...
    if (PreTerm == Pre->begin() ||
        std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
    // ...
    auto ExitHead = Exit->getFirstNonDebugInstr();
    if (ExitHead == Exit->end() ||
        ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
  // ...
  return CacheLineAlign;
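// --- Illustrative sketch (not from SIISelLowering.cpp) ---
// getPrefLoopAlignment above estimates a loop's code size by summing
// instruction sizes (plus padding for inner-block alignment) and only pads the
// loop header to a cache-line boundary when the whole loop fits in a small
// number of cache lines. A standalone sketch of that size/threshold decision;
// the thresholds mirror the ones visible above, and chooseLoopAlign is a
// made-up name.
#include <cassert>
#include <numeric>
#include <vector>

static unsigned chooseLoopAlign(const std::vector<unsigned> &InstSizes,
                                unsigned PrefAlign, unsigned CacheLineAlign) {
  unsigned LoopSize = std::accumulate(InstSizes.begin(), InstSizes.end(), 0u);
  if (LoopSize > 192)
    return PrefAlign;      // too big: keep the default preferred alignment
  if (LoopSize <= 64)
    return PrefAlign;      // already fits comfortably
  if (LoopSize <= 128)
    return CacheLineAlign; // pad so the loop starts on a cache line
  return CacheLineAlign;   // 129..192 bytes: align and rely on prefetching
}

int main() {
  std::vector<unsigned> Small(10, 4);  // 40 bytes of 4-byte instructions
  std::vector<unsigned> Medium(20, 6); // 120 bytes
  assert(chooseLoopAlign(Small, 4, 64) == 4);
  assert(chooseLoopAlign(Medium, 4, 64) == 64);
  return 0;
}
// --- End of illustrative sketch ---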
    N = N->getOperand(0).getNode();
  if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
// ...
  switch (N->getOpcode()) {
  // ...
    if (Reg.isPhysical() || MRI.isLiveIn(Reg))
      return !TRI->isSGPRReg(MRI, Reg);
    // ...
    return !TRI->isSGPRReg(MRI, Reg);
  // ...
    unsigned AS = L->getAddressSpace();
  // ...
  case ISD::CALLSEQ_END:
  // ...
    return A->readMem() && A->writeMem();
  switch (Ty.getScalarSizeInBits()) {
// ...
                                                  const APInt &DemandedElts,
                                                  // ...
                                                  unsigned Depth) const {
  // ...
  if (Info->getMode().DX10Clamp)
// ...
  if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
// ...
         << "Hardware instruction generated for atomic "
         // ...
         << " operation at memory scope " << MemScope;
// ...
  Type *EltTy = VT->getElementType();
  return VT->getNumElements() == 2 &&
// ...
    unsigned BW = IT->getBitWidth();
    return BW == 32 || BW == 64;
  // ...
    unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
    return BW == 32 || BW == 64;
  // ...
  if (Ty->isFloatTy() || Ty->isDoubleTy())
  // ...
  return VT->getNumElements() == 2 &&
         VT->getElementType()->getPrimitiveSizeInBits() == 16;
                                       bool HasSystemScope) {
  // ...
  if (HasSystemScope) {
  // ...
  return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
// ...
  const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
// ...
      DL.getTypeSizeInBits(RMW->getType()) == 64 &&
// ...
  bool HasSystemScope =
  // ...
  if (Subtarget->hasEmulatedSystemScopeAtomics())
  // ...
  if (!HasSystemScope &&
      Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
  // ...
    if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
    // ...
        ConstVal && ConstVal->isNullValue())
  // ...
    if (Ty->isFloatTy()) {
    // ...
    if (Ty->isDoubleTy()) {
    // ...
    if (Ty->isFloatTy() &&
        !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
    // ...
      if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
      // ...
      if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
      // ...
      if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
      // ...
      if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
      // ...
    if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
    // ...
    if (Ty->isFloatTy()) {
      // ...
      if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
      // ...
      if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
      // ...
          Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
      // ...
      if (Subtarget->hasFlatAtomicFaddF32Inst())
    // ...
    if (Subtarget->hasLDSFPAtomicAddF32()) {
      if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
      // ...
      if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
    // ...
    if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
    // ...
    if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
    // ...
    if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
    // ...
    if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
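// --- Illustrative sketch (not from SIISelLowering.cpp) ---
// The shouldExpandAtomicRMWInIR fragments above choose, per FP type and
// subtarget feature, between keeping the hardware atomic and expanding it to a
// compare-and-swap loop. A reduced sketch of that shape; the Features booleans
// are made-up stand-ins for the Subtarget->has*Insts() queries, and the real
// decision also depends on scope, metadata, and denormal mode.
#include <cassert>

enum class ExpansionKind { None, CmpXChg }; // None = keep the atomicrmw as is

struct Features {
  bool GlobalFaddF32 = false; // stands in for the f32 fadd atomic features
  bool GlobalFaddF64 = false; // stands in for the f64 fadd atomic feature
  bool PkAddF16 = false;      // stands in for the packed-f16 add feature
};

enum class FPType { F32, F64, V2F16 };

static ExpansionKind classifyFAdd(FPType Ty, const Features &F) {
  switch (Ty) {
  case FPType::F32:
    return F.GlobalFaddF32 ? ExpansionKind::None : ExpansionKind::CmpXChg;
  case FPType::F64:
    return F.GlobalFaddF64 ? ExpansionKind::None : ExpansionKind::CmpXChg;
  case FPType::V2F16:
    return F.PkAddF16 ? ExpansionKind::None : ExpansionKind::CmpXChg;
  }
  return ExpansionKind::CmpXChg;
}

int main() {
  Features F;
  F.GlobalFaddF32 = true;
  assert(classifyFAdd(FPType::F32, F) == ExpansionKind::None);
  assert(classifyFAdd(FPType::F64, F) == ExpansionKind::CmpXChg);
  return 0;
}
// --- End of illustrative sketch ---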
  if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
    return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
                                 : &AMDGPU::SReg_32RegClass;
  if (!TRI->isSGPRClass(RC) && !isDivergent)
    return TRI->getEquivalentSGPRClass(RC);
  if (TRI->isSGPRClass(RC) && isDivergent)
    return TRI->getEquivalentVGPRClass(RC);
// ...
                      unsigned WaveSize) {
  // ...
  if (!IT || IT->getBitWidth() != WaveSize)
  // ...
  if (!Visited.insert(V).second)
  // ...
  bool Result = false;
  for (const auto *U : V->users()) {
    // ...
    if (V == U->getOperand(1)) {
      // ...
      case Intrinsic::amdgcn_if_break:
      case Intrinsic::amdgcn_if:
      case Intrinsic::amdgcn_else:
      // ...
    if (V == U->getOperand(0)) {
      // ...
      case Intrinsic::amdgcn_end_cf:
      case Intrinsic::amdgcn_loop:
    // ...
    Result = hasCFUser(U, Visited, WaveSize);
// ...
                                                 const Value *V) const {
  // ...
    if (CI->isInlineAsm()) {
      // ...
      for (auto &TC : TargetConstraints) {
  // ...
  return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
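// --- Illustrative sketch (not from SIISelLowering.cpp) ---
// hasCFUser above walks a value's users depth-first, with a Visited set to cut
// cycles, looking for the wave-mask control-flow intrinsics (amdgcn.if,
// amdgcn.if.break, amdgcn.else, amdgcn.end.cf, amdgcn.loop). The same
// traversal pattern in miniature over a toy use graph; Node and IsCFIntrinsic
// are stand-ins for llvm::Value and the intrinsic checks.
#include <cassert>
#include <unordered_set>
#include <vector>

struct Node {
  bool IsCFIntrinsic = false;      // e.g. amdgcn.if / amdgcn.end.cf
  std::vector<const Node *> Users; // who consumes this node's value
};

static bool hasCFUser(const Node *V, std::unordered_set<const Node *> &Visited) {
  if (!Visited.insert(V).second)
    return false; // already explored
  for (const Node *U : V->Users) {
    if (U->IsCFIntrinsic)
      return true;
    if (hasCFUser(U, Visited))
      return true;
  }
  return false;
}

int main() {
  Node EndCf;
  EndCf.IsCFIntrinsic = true;
  Node Phi;
  Phi.Users = {&EndCf};
  Node Def;
  Def.Users = {&Phi};
  std::unordered_set<const Node *> Visited;
  assert(hasCFUser(&Def, Visited));
  return 0;
}
// --- End of illustrative sketch ---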
  return MRI.hasOneNonDBGUse(N0);
// ...
  if (I.getMetadata("amdgpu.noclobber"))
  // ...
  if (I.getMetadata("amdgpu.last.use"))
// ...
  if (!Def->isMachineOpcode())
  // ...
  if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
    PhysReg = AMDGPU::SCC;
    // ...
        TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
    Alignment = RMW->getAlign();
  // ...
  bool FullFlatEmulation =
      // ...
      ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
       (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
        RMW->getType()->isDoubleTy()));
  // ...
  bool ReturnValueIsUsed = !AI->use_empty();
  // ...
  if (FullFlatEmulation) {
  // ...
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  // ...
  Value *LoadedShared = nullptr;
  if (FullFlatEmulation) {
    CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
                                                 {Addr}, nullptr, "is.shared");
    Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
    Builder.SetInsertPoint(SharedBB);
    Value *CastToLocal = Builder.CreateAddrSpaceCast(
    // ...
    LoadedShared = Clone;
    // ...
    Builder.CreateBr(PhiBB);
    Builder.SetInsertPoint(CheckPrivateBB);
  // ...
  CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
                                                {Addr}, nullptr, "is.private");
  Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
  // ...
  Builder.SetInsertPoint(PrivateBB);
  // ...
  Value *CastToPrivate = Builder.CreateAddrSpaceCast(
  // ...
  Value *LoadedPrivate;
  // ...
    LoadedPrivate = Builder.CreateAlignedLoad(
        RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
    // ...
                                 LoadedPrivate, RMW->getValOperand());
    // ...
    Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
    // ...
    auto [ResultLoad, Equal] =
    // ...
    LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
  // ...
  Builder.CreateBr(PhiBB);
  // ...
  Builder.SetInsertPoint(GlobalBB);
  // ...
  if (FullFlatEmulation) {
    Value *CastToGlobal = Builder.CreateAddrSpaceCast(
  // ...
  if (!FullFlatEmulation) {
    // ...
    MDNode *RangeNotPrivate =
    // ...
    LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
  // ...
  Builder.CreateBr(PhiBB);
  // ...
  Builder.SetInsertPoint(PhiBB);
  // ...
  if (ReturnValueIsUsed) {
    // ...
    if (FullFlatEmulation)
  // ...
  Builder.CreateBr(ExitBB);
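// --- Illustrative sketch (not from SIISelLowering.cpp) ---
// emitExpandAtomicAddrSpacePredicate (above) rewrites a flat atomicrmw into a
// runtime address-space dispatch: if the pointer is LDS take the shared path,
// if it is scratch do a plain load/modify/store, otherwise use the global
// atomic, and merge the results in a phi. The control flow in miniature, with
// lambdas standing in for the cloned atomic, the scalar RMW, and the global
// atomic; expandFlatAtomic and AddrKind are made-up names.
#include <cassert>
#include <functional>

enum class AddrKind { Shared, Private, Global };

static int expandFlatAtomic(AddrKind Kind,
                            const std::function<int()> &SharedPath,
                            const std::function<int()> &PrivatePath,
                            const std::function<int()> &GlobalPath) {
  // Mirrors the is.shared / is.private conditional branches; the returned
  // value plays the role of the phi in the merge block.
  if (Kind == AddrKind::Shared)
    return SharedPath();
  if (Kind == AddrKind::Private)
    return PrivatePath();
  return GlobalPath();
}

int main() {
  int Mem = 5;
  auto Private = [&] { int Old = Mem; Mem += 2; return Old; }; // load+store, no atomic needed
  auto Shared = [&] { return -1; };
  auto Global = [&] { return -2; };
  assert(expandFlatAtomic(AddrKind::Private, Shared, Private, Global) == 5);
  assert(Mem == 7);
  return 0;
}
// --- End of illustrative sketch ---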
                                              unsigned PtrOpIdx) {
  Value *PtrOp = I->getOperand(PtrOpIdx);
  // ...
  I->setOperand(PtrOpIdx, ASCast);
// ...
      ConstVal && ConstVal->isNullValue()) {
// ...
         "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
// ...
         "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
// ...
  LoadInst *LI = Builder.CreateAlignedLoad(
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
iv Induction Variable Users
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static cl::opt< bool > UseSelectionDAGPTRADD("amdgpu-use-sdag-ptradd", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " "SelectionDAG ISel"), cl::init(false))
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
const Function * getParent() const
Return the enclosing method, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
A parsed version of the target data layout string in and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node N can be combined with others to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns whether Op is known to never be any NaN; if SNaN is true, whether it is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
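As a small illustration of getSelectCC (not an excerpt from this lowering), a signed integer max can be built directly from an ISD::CondCode:

#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

// Signed max of two same-typed integer SDValues via a SELECT_CC-style node.
static SDValue emitSMax(SelectionDAG &DAG, const SDLoc &DL, SDValue A,
                        SDValue B) {
  return DAG.getSelectCC(DL, A, B, /*True=*/A, /*False=*/B, ISD::SETGT);
}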
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
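getNode and getConstant are the basic building blocks used throughout the lowering code. A minimal, generic sketch (not specific to this file):

#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

// Add a constant to an integer SDValue, reusing its type and debug location.
static SDValue addImmediate(SelectionDAG &DAG, SDValue V, uint64_t Imm) {
  SDLoc DL(V);
  EVT VT = V.getValueType();
  return DAG.getNode(ISD::ADD, DL, VT, V, DAG.getConstant(Imm, DL, VT));
}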
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
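A tiny usage sketch for the SmallVector operations above; the element type, inline capacity, and contents are arbitrary:

#include "llvm/ADT/SmallVector.h"
#include <iterator>

using namespace llvm;

// The inline capacity (4 here) only controls when heap allocation kicks in.
static SmallVector<int, 4> collectExample() {
  SmallVector<int, 4> V;
  V.push_back(1);                             // append a single element
  int More[] = {2, 3, 4};
  V.append(std::begin(More), std::end(More)); // append a range
  return V;
}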
An instruction for storing to memory.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
StringRef - Represent a constant reference to a string, i.e.
constexpr size_t size() const
size - Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
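StringSwitch usage is just chained Case calls ending in Default; the strings and result values below are made up for illustration:

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

using namespace llvm;

// Map a token to a small code, falling back to -1 for unknown strings.
static int classifyToken(StringRef S) {
  return StringSwitch<int>(S)
      .Case("sgpr", 0)
      .Case("vgpr", 1)
      .Default(-1);
}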
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
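The protected configuration hooks above are normally called from a backend's TargetLowering constructor. A hypothetical, simplified sketch that does not mirror the real SITargetLowering setup; the class name, register class, and chosen actions are placeholders:

#include "llvm/CodeGen/TargetLowering.h"

using namespace llvm;

class ExampleTargetLowering : public TargetLowering {
public:
  ExampleTargetLowering(const TargetMachine &TM,
                        const TargetRegisterClass *GPR32RC,
                        const TargetRegisterInfo *TRI)
      : TargetLowering(TM) {
    addRegisterClass(MVT::i32, GPR32RC);              // i32 is legal in GPR32
    setOperationAction(ISD::CTPOP, MVT::i64, Expand); // no native 64-bit popcount
    setTruncStoreAction(MVT::i64, MVT::i32, Expand);  // expand i64->i32 trunc store
    setBooleanContents(ZeroOrOneBooleanContent);      // setcc produces 0/1
    computeRegisterProperties(TRI);                   // derive legality tables last
  }
};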
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isVoidTy() const
Return true if this is 'void'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
constexpr bool isZero() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ TC_RETURN_GFX_WholeWave
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ SSUBO
Same for subtraction.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ SMULO
Same for multiplication.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
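A hedged usage sketch of these Intrinsic:: helpers; the intrinsic chosen (llvm.amdgcn.workitem.id.x) is only an example, and the exact signatures assume a recent LLVM tree:

#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Returns true if M already declares llvm.amdgcn.workitem.id.x with the
// function type that Intrinsic::getType would produce for it.
static bool hasWorkitemIdX(const Module &M) {
  Function *F =
      Intrinsic::getDeclarationIfExists(&M, Intrinsic::amdgcn_workitem_id_x);
  if (!F)
    return false;
  FunctionType *FT =
      Intrinsic::getType(F->getContext(), Intrinsic::amdgcn_workitem_id_x);
  return F->getFunctionType() == FT;
}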
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
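A small sketch of the IR-level PatternMatch style used by m_Shl / m_Value (the GlobalISel matchers mi_match / m_Neg / m_GFCstOrSplat follow the same idiom on MachineRegisterInfo):

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Returns true if V is a shift-left, binding its operands to LHS and RHS.
static bool matchShl(Value *V, Value *&LHS, Value *&RHS) {
  return match(V, m_Shl(m_Value(LHS), m_Value(RHS)));
}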
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
@ System
Synchronized with respect to all concurrently executing threads.
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
NodeAddr< NodeBase * > Node
friend class Instruction
Iterator for Instructions in a BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition code.
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions starting from FirstMI to LastMI (exclusive).
int popcount(T Value) noexcept
Count the number of set bits in a value.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
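For example, isInt / isUInt are the usual way to test whether an immediate fits a given encoding field (the field widths below are illustrative only):

#include "llvm/Support/MathExtras.h"
#include <cstdint>

// Does Imm fit in a signed 16-bit field? In an unsigned 20-bit field?
static bool fitsSImm16(int64_t Imm) { return llvm::isInt<16>(Imm); }
static bool fitsUImm20(uint64_t Imm) { return llvm::isUInt<20>(Imm); }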
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
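A short sketch of the usual isa / cast / dyn_cast idiom, here applied to a SelectionDAG value (the -1 sentinel is only illustrative):

#include "llvm/CodeGen/SelectionDAGNodes.h"

using namespace llvm;

// Return the sign-extended immediate if V is a constant integer node.
static int64_t getImmOrSentinel(SDValue V) {
  if (auto *C = dyn_cast<ConstantSDNode>(V)) // null if V is not a ConstantSDNode
    return C->getSExtValue();
  return -1;
}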
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
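A hedged sketch showing how a few of these ADT range helpers compose (the values are arbitrary):

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

using namespace llvm;

static void rangeHelpersDemo() {
  SmallVector<int, 8> Vals = {1, 2, 3, 4};
  SmallVector<int, 8> Copy;

  append_range(Copy, Vals);                       // Copy == {1, 2, 3, 4}
  bool HasEven = any_of(Vals, [](int V) { return V % 2 == 0; });
  (void)HasEven;

  for (int V : drop_begin(Vals))                  // skips the first element
    (void)V;

  for (auto [A, B] : zip_equal(Vals, Copy))       // lockstep iteration
    (void)(A + B);
}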
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit version).
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
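A small sketch exercising several of the bit-query helpers listed above (the constant is an arbitrary example):

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

static void bitQueriesDemo() {
  uint64_t V = 0x0000000FF0000000ULL; // one contiguous run of eight set bits

  assert(llvm::popcount(V) == 8);     // set bits
  assert(llvm::countr_zero(V) == 28); // trailing zeros
  assert(llvm::countl_zero(V) == 28); // leading zeros
  assert(llvm::isShiftedMask_64(V));

  assert(llvm::Hi_32(V) == 0x0000000Fu); // top 32 bits
  assert(llvm::Lo_32(V) == 0xF0000000u); // bottom 32 bits

  assert(llvm::isPowerOf2_32(64u));
  assert(llvm::Log2_32(64u) == 6);       // floor log base 2
}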
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point condition code.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtual registers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
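A hedged sketch of how a DAG combine typically uses this helper to treat a scalar constant and a constant splat vector uniformly (the predicate itself is illustrative; the declaration is assumed to come from SelectionDAGNodes.h):

#include "llvm/CodeGen/SelectionDAGNodes.h"

using namespace llvm;

// True if V is the scalar constant 1 or a splat BUILD_VECTOR of 1.
static bool isOneOrOneSplat(SDValue V) {
  ConstantSDNode *C = isConstOrConstSplat(V, /*AllowUndefs=*/false);
  return C && C->isOne();
}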
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its APInt value and def register.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
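A small sketch of the alignment and mask arithmetic helpers above (all values are arbitrary examples):

#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

static void alignmentDemo() {
  assert(llvm::alignTo(/*Size=*/10, llvm::Align(8)) == 16);  // round up
  assert(llvm::alignDown(/*Value=*/10u, /*Align=*/8u) == 8); // round down
  assert(llvm::divideCeil(10u, 4u) == 3);                    // ceil(10 / 4)
  assert(llvm::PowerOf2Ceil(10u) == 16);                     // next power of two

  // An offset of 4 limits a 16-byte alignment to 4 bytes.
  assert(llvm::commonAlignment(llvm::Align(16), 4) == llvm::Align(4));

  assert(llvm::maskTrailingOnes<uint32_t>(5) == 0x1Fu);      // low five bits set
}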
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
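A hedged sketch of how these DenormalMode values are typically compared when deciding whether denormal inputs may be flushed; the policy shown is illustrative, not the backend's actual rule:

#include "llvm/ADT/FloatingPointMode.h"

using namespace llvm;

static bool canFlushDenormalInputs(DenormalMode Mode) {
  if (Mode == DenormalMode::getIEEE()) // denormals must be preserved
    return false;
  // PreserveSign / PositiveZero allow flushing; Dynamic means the mode is
  // unknown at compile time, so be conservative.
  return Mode.Input == DenormalMode::PreserveSign ||
         Mode.Input == DenormalMode::PositiveZero;
}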
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
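A sketch of the EVT queries listed above on a couple of example types:

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

using namespace llvm;

static void evtDemo(LLVMContext &Ctx) {
  EVT V4F32 = MVT::v4f32;

  assert(V4F32.isVector() && V4F32.isFloatingPoint());
  assert(V4F32.getVectorNumElements() == 4);
  assert(V4F32.getVectorElementType() == MVT::f32);
  assert(V4F32.getSizeInBits() == 128);
  assert(V4F32.getStoreSize().getFixedValue() == 16); // bytes written by a store

  EVT V4I32 = V4F32.changeTypeToInteger(); // same shape, integer elements
  assert(V4I32.getScalarType() == MVT::i32);

  EVT I48 = EVT::getIntegerVT(Ctx, 48);    // arbitrary-width integer
  assert(!I48.isSimple() && I48.isScalarInteger());
}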
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
void resetAll()
Resets the known state of all bits.
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
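A sketch of the KnownBits operations above (the bit patterns are arbitrary):

#include "llvm/Support/KnownBits.h"
#include <cassert>

using namespace llvm;

static void knownBitsDemo() {
  // An i8 value whose top nibble is known zero and whose bit 2 is known one;
  // bits 0, 1 and 3 are unknown.
  KnownBits K(8);
  K.Zero = APInt(8, 0xF0);
  K.One = APInt(8, 0x04);

  assert(K.countMinLeadingZeros() == 4);

  KnownBits Wide = K.zext(16);              // adds 8 known-zero high bits
  assert(Wide.countMinLeadingZeros() == 12);

  KnownBits Sum = KnownBits::add(K, KnownBits::makeConstant(APInt(8, 1)));
  (void)Sum;                                // partially known result of K + 1

  K.resetAll();                             // forget everything
  assert(K.isUnknown());
}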
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const