#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"

    cl::desc("Do not align and prefetch loops"),

    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),

    cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "

  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
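// SITargetLowering constructor: register which operations are Legal, Custom,
// Expand, or Promote for the scalar and vector types the subtarget supports.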
  if (Subtarget->has16BitInsts()) {
    if (Subtarget->useRealTrue16Insts()) {

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},

       ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
       ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
       ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
       ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
       ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,

                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},

                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},

                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},

                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);

       {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
        MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
        MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
        MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
        MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {

                     {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
                      MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},

  if (Subtarget->hasPkMovB32()) {

                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},

                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

  if (Subtarget->hasSMemRealTime() ||

  if (Subtarget->has16BitInsts()) {

  if (Subtarget->hasMadMacF32Insts())

  if (!Subtarget->hasBFI())

  if (!Subtarget->hasBCNT(32))

  if (!Subtarget->hasBCNT(64))

  if (Subtarget->hasFFBH())

  if (Subtarget->hasFFBL())

  if (Subtarget->hasBFE())

  if (Subtarget->hasIntClamp())

  if (Subtarget->hasAddNoCarry())

      {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
      {MVT::f32, MVT::f64}, Custom);

                     {MVT::f32, MVT::f64}, Legal);

  if (Subtarget->haveRoundOpsF64())

  if (Subtarget->has16BitInsts()) {

       ISD::FSIN, ISD::FROUND},

  if (Subtarget->hasBF16TransInsts())

       {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
        MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
        MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {

       {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
        MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
        MVT::v32f16, MVT::v32bf16},

      {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},

      {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

      {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

       {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

  if (Subtarget->hasVOP3PInsts()) {

                       {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

                       {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
                        MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
                        MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})

    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})

        {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
        {MVT::v2f16, MVT::v4f16}, Custom);

  if (Subtarget->hasPackedFP32Ops()) {

                       {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},

  if (Subtarget->has16BitInsts()) {

                       {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                        MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                        MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                        MVT::v32f16, MVT::v32bf16},

  if (Subtarget->hasVectorMulU64())
  else if (Subtarget->hasScalarSMulU64())

  if (Subtarget->hasMad64_32())

  if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())

  if (Subtarget->hasIEEEMinimumMaximumInsts()) {
                       {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

  if (Subtarget->hasMinimum3Maximum3F32())

  if (Subtarget->hasMinimum3Maximum3PKF16()) {
    if (!Subtarget->hasMinimum3Maximum3F16())

  if (Subtarget->hasVOP3PInsts()) {
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

  if (Subtarget->hasIntMinMax64())

                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,

                     {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
                      MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
                      MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
                      MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
                      MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

  if (Subtarget->hasBF16ConversionInsts()) {

  if (Subtarget->hasBF16PackedInsts()) {

  if (Subtarget->hasBF16TransInsts()) {

  if (Subtarget->hasCvtPkF16F32Inst()) {
                       {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},

  if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
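  // Atomic node kinds registered for target DAG combining: folding on the
  // pointer operand helps match the constant offsets supported by the
  // addressing modes.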
      ISD::ATOMIC_CMP_SWAP,
      ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
      ISD::ATOMIC_LOAD_ADD,
      ISD::ATOMIC_LOAD_SUB,
      ISD::ATOMIC_LOAD_AND,
      ISD::ATOMIC_LOAD_OR,
      ISD::ATOMIC_LOAD_XOR,
      ISD::ATOMIC_LOAD_NAND,
      ISD::ATOMIC_LOAD_MIN,
      ISD::ATOMIC_LOAD_MAX,
      ISD::ATOMIC_LOAD_UMIN,
      ISD::ATOMIC_LOAD_UMAX,
      ISD::ATOMIC_LOAD_FADD,
      ISD::ATOMIC_LOAD_FMIN,
      ISD::ATOMIC_LOAD_FMAX,
      ISD::ATOMIC_LOAD_UINC_WRAP,
      ISD::ATOMIC_LOAD_UDEC_WRAP,

  static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
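// isFPExtFoldable: an FP_EXTEND of a 16-bit source can be folded into FMAD/FMA
// when the subtarget provides the mixed-precision mad_mix/fma_mix instructions.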
                                           EVT DestVT, EVT SrcVT) const {
         ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
            (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
          (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&

                                           LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
         SrcTy.getScalarSizeInBits() == 16 &&

  if (Subtarget->has16BitInsts()) {
      return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
    return VT.isInteger() ? MVT::i32 : MVT::f32;

  return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;

  if (Size == 16 && Subtarget->has16BitInsts())
    return (NumElts + 1) / 2;

  return NumElts * ((Size + 31) / 32);
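// getVectorTypeBreakdownForCallingConv: split vector arguments into
// register-sized pieces; 16-bit element vectors are packed two elements per
// 32-bit register when the subtarget has 16-bit instructions.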
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (Size == 16 && Subtarget->has16BitInsts()) {
    if (ScalarVT == MVT::bf16) {
      RegisterVT = MVT::i32;
      IntermediateVT = MVT::v2bf16;
      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;
    NumIntermediates = (NumElts + 1) / 2;
    return NumIntermediates;

    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

    RegisterVT = MVT::i16;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);

                                          unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);

    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

                                            unsigned MaxNumLanes) {

  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

    return MVT::amdgpuBufferFatPointer;

      DL.getPointerSizeInBits(AS) == 192)
    return MVT::amdgpuBufferStridedPointer;

       DL.getPointerSizeInBits(AS) == 160) ||
       DL.getPointerSizeInBits(AS) == 192))

  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:

  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:

  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:

  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
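// getTgtMemIntrinsic: describe the memory behaviour of target intrinsics
// (value type, pointer operand, alignment, read/write flags) so the DAG
// builder can attach MachineMemOperands to them.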
                                          unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

    if (RsrcIntr->IsImage) {

    Info.ptrVal = RsrcArg;

    bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;

      if (RsrcIntr->IsImage) {
        unsigned MaxNumLanes = 4;

            std::numeric_limits<unsigned>::max());

      if (RsrcIntr->IsImage) {

      if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
        Info.memVT = MVT::i32;

  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {

  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {

        std::numeric_limits<unsigned>::max());

  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {

  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;

  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {

  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
    Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
    Info.memVT = MVT::i64;

  case Intrinsic::amdgcn_global_atomic_csub: {

  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
        MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
                       ->getElementType(0));

  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_atomic_cond_sub_u32: {

  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64: {

  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {

  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {

  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.memVT = MVT::i32;
    Info.align = Align(4);
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)

  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {

  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128: {

  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds: {

  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
    Info.memVT = MVT::i32;
    Info.align = Align(4);

  case Intrinsic::amdgcn_s_prefetch_data:
  case Intrinsic::amdgcn_flat_prefetch:
  case Intrinsic::amdgcn_global_prefetch: {

  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
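// getAddrModeArguments: identify the pointer operand and accessed type of the
// memory intrinsics above so addressing-mode analysis (e.g. LSR) can use them.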
                                            Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_global_atomic_csub:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
    Ptr = II->getArgOperand(0);

  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds:
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
    Ptr = II->getArgOperand(1);

  AccessTy = II->getType();
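// Addressing-mode legality: FLAT/global addressing allows a register base plus
// a limited immediate offset, while MUBUF and SMRD accesses have their own
// rules checked below.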
                                               unsigned AddrSpace) const {
  if (!Subtarget->hasFlatInstOffsets()) {

  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
                                  AM.BaseOffs, AddrSpace, FlatVariant));

  if (Subtarget->hasFlatGlobalInsts())

  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {

  return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {

  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))

  if (AM.HasBaseReg) {

    return isLegalMUBUFAddressingMode(AM);

    if (!Subtarget->hasScalarSubwordLoads()) {
      if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)

    return Subtarget->enableFlatScratch()
               : isLegalMUBUFAddressingMode(AM);
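// allowsMisalignedMemoryAccessesImpl: per-address-space check of whether an
// access of the given size and alignment is allowed; *IsFast reports a
// relative speed (the access width when full speed, a small value or 0 when
// the access would be slow or split).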
    unsigned Size, unsigned AddrSpace, Align Alignment,

    if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))

    Align RequiredAlignment(

    if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
        Alignment < RequiredAlignment)

      if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))

      RequiredAlignment = Align(4);

      if (Subtarget->hasUnalignedDSAccessEnabled()) {

        *IsFast = (Alignment >= RequiredAlignment) ? 64
                  : (Alignment < Align(4))         ? 32

      if (!Subtarget->hasDS96AndDS128())

      if (Subtarget->hasUnalignedDSAccessEnabled()) {

        *IsFast = (Alignment >= RequiredAlignment) ? 96
                  : (Alignment < Align(4))         ? 32

      if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())

      RequiredAlignment = Align(8);

      if (Subtarget->hasUnalignedDSAccessEnabled()) {

        *IsFast = (Alignment >= RequiredAlignment) ? 128
                  : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||
           Subtarget->hasUnalignedDSAccessEnabled();

    bool AlignedBy4 = Alignment >= Align(4);
    if (Subtarget->hasUnalignedScratchAccessEnabled()) {
        *IsFast = AlignedBy4 ? Size : 1;

      *IsFast = AlignedBy4;

    return Alignment >= Align(4) ||
           Subtarget->hasUnalignedBufferAccessEnabled();

  if (!Subtarget->hasRelaxedBufferOOBMode() &&

  return Size >= 32 && Alignment >= Align(4);

                                                   unsigned *IsFast) const {
                                          Alignment, Flags, IsFast);

                                          const AttributeList &FuncAttributes) const {
  if (Op.size() >= 16 &&

  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

                                               unsigned DestAS) const {
      Subtarget->hasGloballyAddressableScratch()) {

                                               unsigned Index) const {
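// Kernel-argument lowering helpers: form a pointer into the kernarg segment,
// load arguments from constant address space, and convert between the
// in-memory ABI type and the value type expected by the IR argument.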
  if (Subtarget->has16BitInsts() && VT == MVT::i16) {

  auto [InputPtrReg, RC, ArgTy] =

      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                           const SDLoc &SL) const {

                                           const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(

  int64_t OffsetDiff = Offset - AlignDownOffset;

    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);

    ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);

                                           const SDLoc &SL) const {

    return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);

      ExtType, SL, VA.getLocVT(), Chain, FIN,

  SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
  if (ConvertedVal == ArgValue)
    return ConvertedVal;
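// lowerWorkGroupId / getPreloadedValue: produce workgroup and cluster IDs
// either from the preloaded SGPR arguments, from S_GETREG on subtargets with
// architected SGPRs, or as constants when the cluster dimensions are fixed.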
SDValue SITargetLowering::lowerWorkGroupId(

  if (!Subtarget->hasClusters())
    return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);

  SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
  SDLoc SL(ClusterIdXYZ);
  SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);

  SDValue ClusterWorkGroupIdXYZ =
      getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);

    return ClusterIdXYZ;

  using namespace AMDGPU::Hwreg;

      DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);

SDValue SITargetLowering::getPreloadedValue(

  const ArgDescriptor *Reg = nullptr;
  const TargetRegisterClass *RC;

  const ArgDescriptor WorkGroupIDX =
  const ArgDescriptor WorkGroupIDZ =
  const ArgDescriptor ClusterWorkGroupIDX =
  const ArgDescriptor ClusterWorkGroupIDY =
  const ArgDescriptor ClusterWorkGroupIDZ =
  const ArgDescriptor ClusterWorkGroupMaxIDX =
  const ArgDescriptor ClusterWorkGroupMaxIDY =
  const ArgDescriptor ClusterWorkGroupMaxIDZ =
  const ArgDescriptor ClusterWorkGroupMaxFlatID =

  auto LoadConstant = [&](unsigned N) {

  if (Subtarget->hasArchitectedSGPRs() &&

    Reg = &WorkGroupIDX;
    RC = &AMDGPU::SReg_32RegClass;

    Reg = &WorkGroupIDY;
    RC = &AMDGPU::SReg_32RegClass;

    Reg = &WorkGroupIDZ;
    RC = &AMDGPU::SReg_32RegClass;

    if (HasFixedDims && ClusterDims.getDims()[0] == 1)
      return LoadConstant(0);
    Reg = &ClusterWorkGroupIDX;
    RC = &AMDGPU::SReg_32RegClass;

    if (HasFixedDims && ClusterDims.getDims()[1] == 1)
      return LoadConstant(0);
    Reg = &ClusterWorkGroupIDY;
    RC = &AMDGPU::SReg_32RegClass;

    if (HasFixedDims && ClusterDims.getDims()[2] == 1)
      return LoadConstant(0);
    Reg = &ClusterWorkGroupIDZ;
    RC = &AMDGPU::SReg_32RegClass;

      return LoadConstant(ClusterDims.getDims()[0] - 1);
    Reg = &ClusterWorkGroupMaxIDX;
    RC = &AMDGPU::SReg_32RegClass;

      return LoadConstant(ClusterDims.getDims()[1] - 1);
    Reg = &ClusterWorkGroupMaxIDY;
    RC = &AMDGPU::SReg_32RegClass;

      return LoadConstant(ClusterDims.getDims()[2] - 1);
    Reg = &ClusterWorkGroupMaxIDZ;
    RC = &AMDGPU::SReg_32RegClass;

    Reg = &ClusterWorkGroupMaxFlatID;
    RC = &AMDGPU::SReg_32RegClass;
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {

           "vector type argument should have been split");

    bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);

             "unexpected vector split in ps argument type");

      Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);

  if (Info.hasWorkItemIDX()) {
        (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;

  if (Info.hasWorkItemIDY()) {
    assert(Info.hasWorkItemIDX());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDY(
      unsigned Reg = AMDGPU::VGPR1;

  if (Info.hasWorkItemIDZ()) {
    assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDZ(
      unsigned Reg = AMDGPU::VGPR2;

  if (RegIdx == ArgVGPRs.size()) {

  unsigned Reg = ArgVGPRs[RegIdx];

                             unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())

  unsigned Reg = ArgSGPRs[RegIdx];

  const unsigned Mask = 0x3ff;
  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);

  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);

  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;

  auto &ArgInfo = Info.getArgInfo();

  if (Info.hasImplicitArgPtr())

  if (Info.hasWorkGroupIDX())

  if (Info.hasWorkGroupIDY())

  if (Info.hasWorkGroupIDZ())

  if (Info.hasLDSKernelId())

    Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);

    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);

    Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
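// allocatePreloadKernArgSGPRs: bind 'inreg' kernel arguments to consecutive
// user SGPRs, inserting padding SGPRs so register offsets keep matching the
// kernarg-segment offsets; preloading stops at the first argument that no
// longer fits the sequence.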
  unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();

  bool InPreloadSequence = true;
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    unsigned ArgIdx = Arg.getArgNo();

    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];

      unsigned ArgOffset = ArgLoc.getLocMemOffset();

      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
                      Subtarget->getAlignmentForImplicitArgPtr()) -
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        ArgOffset += ImplicitArgOffset;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;

        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);

          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;

  if (Info.hasLDSKernelId()) {
    Register Reg = Info.addLDSKernelId();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
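// allocateSystemSGPRs: reserve the system SGPRs that follow the user SGPRs
// (workgroup IDs, workgroup info, private segment wave byte offset), padding
// the count on subtargets with the 16-user-SGPR init bug.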
                                           bool IsShader) const {
  bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
  if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();

    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      Register Reg = Info.addReservedUserSGPR();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      Register Reg = Info.addWorkGroupIDX();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDY()) {
      Register Reg = Info.addWorkGroupIDY();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDZ()) {
      Register Reg = Info.addWorkGroupIDZ();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {
    Register Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;

      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);

      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

  assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
         Info.getNumPreloadedSGPRs() >= 16);
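// reservePrivateMemoryRegs: once frame usage is known, pick the scratch
// resource descriptor, stack pointer, and frame pointer registers for the
// function.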
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {

      Info.setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);

      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);

      if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);

  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);

    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)
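// LowerFormalArguments: lower incoming arguments for kernels (kernarg-segment
// loads and preloaded SGPRs), shaders (PS input allocation), and callable
// functions (register and stack passing), and wire up the special input
// registers.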
  bool IsError = false;

        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));

          !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
          !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());

    if (!Subtarget->enableFlatScratch())

      !Subtarget->hasArchitectedSGPRs())
    assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
           !Info->hasWorkGroupIDZ());

  bool IsWholeWaveFunc = Info->isWholeWaveFunction();

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    if (Subtarget->isAmdPalOS()) {

      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))

  } else if (IsKernel) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());

    Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),

    if (IsKernel && Subtarget->hasKernargPreload())

  } else if (!IsGraphics) {

    if (!Subtarget->enableFlatScratch())

    Info->setNumWaveDispatchSGPRs(
    Info->setNumWaveDispatchVGPRs(
  } else if (Info->getNumKernargPreloadedSGPRs()) {
    Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());

  if (IsWholeWaveFunc) {
                                  {MVT::i1, MVT::Other}, Chain);

  for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;

    if (IsEntryFunc && VA.isMemLoc()) {

      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {

        int64_t OffsetDiff = Offset - AlignDownOffset;

              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];

          ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;

          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);

                                    TRI->getRegSizeInBits(*RC)));

            for (auto Reg : PreloadRegs) {

                                       PreloadRegs.size()),

          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

            "hidden argument in kernel signature was not preloaded",

            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);

    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

      if (AMDGPU::VGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::VGPR_32RegClass;
      else if (AMDGPU::SGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::SGPR_32RegClass;

      Val = convertABITypeToValueType(DAG, Val, VA, DL);

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain
                                 const Type *RetTy) const {

  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);

    unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
    unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
    for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
      if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  Info->setIfReturnsVoid(Outs.empty());
  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {

    SDValue Arg = OutVals[RealRVLocIdx];

                        ReadFirstLane, Arg);

  if (!Info->isEntryFunction()) {

      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))
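// passSpecialInputs: forward the callee's required implicit inputs (workgroup
// IDs, dispatch/queue/implicit-arg pointers, workitem IDs) from the caller,
// packing workitem IDs X/Y/Z into one VGPR when only some are needed.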
  auto &ArgUsageInfo =
  CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

    const auto [OutgoingArg, ArgRC, ArgTy] =

    const auto [IncomingArg, IncomingArgRC, Ty] =
    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;

      InputReg = getImplicitArgPtr(DAG, DL);

      std::optional<uint32_t> Id =
      if (Id.has_value()) {

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))

      unsigned SpecialArgOffset =

  auto [OutgoingArg, ArgRC, Ty] =
    std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

    if (Subtarget->getMaxWorkitemID(F, 0) != 0) {

      NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {

      NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {

                     : IncomingArgY ? *IncomingArgY

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
    if (!CCVA.isRegLoc())

    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
          dbgs() << "Cannot tail call due to divergent outgoing argument in "
enum ChainCallArgIdx {

  bool UsesDynamicVGPRs = false;
  if (IsChainCallConv) {

    auto RequestedExecIt =
          return Arg.OrigArgIndex == 2;

    assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");

    size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();

    CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());

           "Haven't popped all the special args");

        CLI.Args[ChainCallArgIdx::Exec];
    if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))

            ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));

        ChainCallSpecialArgs.push_back(Arg.Node);

    PushNodeOrTargetConstant(RequestedExecArg);

      if (FlagsValue.isZero()) {
        if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
                             "no additional args allowed if flags == 0");

        if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {

        if (!Subtarget->isWave32()) {
              CLI, InVals, "dynamic VGPR mode is only supported for wave32");

        UsesDynamicVGPRs = true;
        std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
                      CLI.Args.end(), PushNodeOrTargetConstant);

  bool IsSibCall = false;

                             "unsupported call to variadic function ");

                               "unsupported required tail call to function ");

        Outs, OutVals, Ins, DAG);

               "site marked musttail or on llvm.amdgcn.cs.chain");

    if (!TailCallOpt && IsTailCall)

  auto *TRI = Subtarget->getRegisterInfo();

  if (!IsSibCall || IsChainCallConv) {
    if (!Subtarget->enableFlatScratch()) {

      RegsToPass.emplace_back(IsChainCallConv
                                  ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                  : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  const unsigned NumSpecialInputs = RegsToPass.size();

  MVT PtrVT = MVT::i32;

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {

      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));

      int32_t Offset = LocMemOffset;

      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()

                              ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
                              Outs[i].Flags.getNonZeroByValAlign(),
                              nullptr, std::nullopt, DstInfo,

            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

    TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops({Chain});

    Ops.push_back(Callee);

    Ops.push_back(Callee);

  if (IsChainCallConv)

  for (auto &[Reg, Val] : RegsToPass)

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

                                       MVT::Glue, GlueOps),

    Ops.push_back(InGlue);

  if (Info->isWholeWaveFunction())

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
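// lowerDYNAMIC_STACKALLOC: the private stack grows upward and the stack
// pointer is swizzled per lane, so allocation sizes and alignments are scaled
// by the wavefront size.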
  EVT VT = Op.getValueType();

         "Stack grows upwards for AMDGPU");

  Chain = BaseAddr.getValue(1);

  if (Alignment > StackAlign) {
        << Subtarget->getWavefrontSizeLog2();
    uint64_t StackAlignMask = ScaledAlignment - 1;

  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");

      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));

      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));

  if (Op.getValueType() != MVT::i32)

  assert(Op.getValueType() == MVT::i32);

      Op.getOperand(0), IntrinID, GetRoundBothImm);

  SDValue RoundModeTimesNumBits =

                              TableEntry, EnumOffset);

        static_cast<uint32_t>(ConstMode->getZExtValue()),

  if (UseReducedTable) {
    SDValue RoundModeTimesNumBits =

    SDValue RoundModeTimesNumBits =

    NewMode = TruncTable;

                        ReadFirstLaneID, NewMode);

                              IntrinID, RoundBothImm, NewMode);

  if (Op->isDivergent() &&
      (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))

  if (Subtarget->hasSafeSmemPrefetch())

  if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();

  EVT DstVT = Op.getValueType();

    return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);

  if (Op.getValueType() != MVT::i64)

                                   Op.getOperand(0), IntrinID, ModeHwRegImm);
                                   Op.getOperand(0), IntrinID, TrapHwRegImm);

  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);

  if (Op.getOperand(1).getValueType() != MVT::i64)

                        ReadFirstLaneID, NewModeReg);
                        ReadFirstLaneID, NewTrapReg);

  unsigned ModeHwReg =
  unsigned TrapHwReg =

                     IntrinID, ModeHwRegImm, NewModeReg);
                     IntrinID, TrapHwRegImm, NewTrapReg);
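// getRegisterByName: resolve named registers used by read_register /
// write_register; flat_scratch aliases are rejected on subtargets without a
// flat scratch register.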
          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (!Subtarget->hasFlatScrRegister() &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
                                     "\" for subtarget."));

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:

  case AMDGPU::FLAT_SCR:

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));

static std::pair<MachineBasicBlock *, MachineBasicBlock *>

  auto Next = std::next(I);

  MBB.addSuccessor(LoopBB);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

  Src->setIsKill(false);

  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
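// Waterfall loop used for indirect register indexing with a divergent index:
// read the first active lane's index, compare it against every lane, mask EXEC
// down to the matching lanes, run the body, and repeat until EXEC is exhausted.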
                                 unsigned InitReg, unsigned ResultReg,
                                 unsigned PhiReg, unsigned InitSaveExecReg,
                                 int Offset, bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
      MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                 : AMDGPU::S_AND_SAVEEXEC_B64),

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;
      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)

    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
          TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                 : AMDGPU::S_XOR_B64_term),

                          unsigned InitResultReg, unsigned PhiReg, int Offset,
                          bool UseGPRIdxMode, Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);

  LoopBB->removeSuccessor(RemainderBB);
  LoopBB->addSuccessor(LandingPad);
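// emitIndirectSrc / emitIndirectDst: expand the SI_INDIRECT_SRC/DST pseudos.
// A uniform (SGPR) index goes through M0 or GPR-index mode directly; a
// divergent index uses the waterfall loop above around V_MOVRELS/V_MOVRELD or
// the GPR-idx pseudos.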
static std::pair<unsigned, int>

  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

    return std::pair(AMDGPU::sub0, Offset);

  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();

  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {

    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(VecRC);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)

  MI.eraseFromParent();

  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
  if (ST.hasScalarAddSub64()) {
    unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

    MI.eraseFromParent();
  case AMDGPU::S_MIN_U32:
    return std::numeric_limits<uint32_t>::max();
  case AMDGPU::S_MIN_I32:
    return std::numeric_limits<int32_t>::max();
  case AMDGPU::S_MAX_U32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_MAX_I32:
    return std::numeric_limits<int32_t>::min();
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_AND_B32:
    return std::numeric_limits<uint32_t>::max();

      "Unexpected opcode in getIdentityValueFor32BitWaveReduction");

  case AMDGPU::V_CMP_LT_U64_e64:
    return std::numeric_limits<uint64_t>::max();
  case AMDGPU::V_CMP_LT_I64_e64:
    return std::numeric_limits<int64_t>::max();
  case AMDGPU::V_CMP_GT_U64_e64:
    return std::numeric_limits<uint64_t>::min();
  case AMDGPU::V_CMP_GT_I64_e64:
    return std::numeric_limits<int64_t>::min();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
    return std::numeric_limits<uint64_t>::min();
  case AMDGPU::S_AND_B64:
    return std::numeric_limits<uint64_t>::max();

      "Unexpected opcode in getIdentityValueFor64BitWaveReduction");

  return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
         Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
         Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
         Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
         Opc == AMDGPU::S_XOR_B32;
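// Expansion of the WAVE_REDUCE_* pseudos. For a uniform (SGPR) source,
// min/max/and/or reduce to the value itself and add/sub/xor can be derived
// from the active-lane count; otherwise a loop reads one active lane at a
// time, accumulates it, and clears that lane's bit until EXEC is exhausted.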
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));

    case AMDGPU::S_MIN_U32:
    case AMDGPU::S_MIN_I32:
    case AMDGPU::S_MAX_U32:
    case AMDGPU::S_MAX_I32:
    case AMDGPU::S_AND_B32:
    case AMDGPU::S_OR_B32: {

    case AMDGPU::V_CMP_LT_U64_e64:
    case AMDGPU::V_CMP_LT_I64_e64:
    case AMDGPU::V_CMP_GT_U64_e64:
    case AMDGPU::V_CMP_GT_I64_e64:
    case AMDGPU::S_AND_B64:
    case AMDGPU::S_OR_B64: {

    case AMDGPU::S_XOR_B32:
    case AMDGPU::S_XOR_B64:
    case AMDGPU::S_ADD_I32:
    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::S_SUB_I32:
    case AMDGPU::S_SUB_U64_PSEUDO: {

      Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

      bool IsWave32 = ST.isWave32();
      unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
      unsigned BitCountOpc =
          IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;

      auto NewAccumulator =

      case AMDGPU::S_XOR_B32:
      case AMDGPU::S_XOR_B64: {
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            .addReg(NewAccumulator->getOperand(0).getReg())

        if (Opc == AMDGPU::S_XOR_B32) {
              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

              TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);

              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);

          BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)

      case AMDGPU::S_SUB_I32: {
        Register NegatedVal = MRI.createVirtualRegister(DstRegClass);

            .addReg(NewAccumulator->getOperand(0).getReg());

      case AMDGPU::S_ADD_I32: {
            .addReg(NewAccumulator->getOperand(0).getReg());

      case AMDGPU::S_ADD_U64_PSEUDO:
      case AMDGPU::S_SUB_U64_PSEUDO: {
        Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

            TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);

            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);

        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
              .addReg(NewAccumulator->getOperand(0).getReg())

        Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
                                 : NewAccumulator->getOperand(0).getReg();

        Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;

        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {

        BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)

  Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
  Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
  Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
  Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
  Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
  Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

  bool IsWave32 = ST.isWave32();
  unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)

    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)

  I = ComputeLoop->begin();

      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)

      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)

  I = ComputeLoop->end();

  unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;

        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),

        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

        TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);

        MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
        MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);

    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),

    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),

    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                             TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)

    case AMDGPU::S_OR_B64:
    case AMDGPU::S_AND_B64:
    case AMDGPU::S_XOR_B64: {
              .addReg(LaneValue->getOperand(0).getReg())

    case AMDGPU::V_CMP_GT_I64_e64:
    case AMDGPU::V_CMP_GT_U64_e64:
    case AMDGPU::V_CMP_LT_I64_e64:
    case AMDGPU::V_CMP_LT_U64_e64: {
      Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
          MRI.createVirtualRegister(WaveMaskRegClass);

          TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
      Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);

                         VregClass, AMDGPU::sub0, VSubRegClass);
                         VregClass, AMDGPU::sub1, VSubRegClass);

      BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),

          .addReg(LaneValue->getOperand(0).getReg())
          .addReg(AccumulatorVReg);

      unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
      BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)

      NewAccumulator = BuildMI(*ComputeLoop, I, DL,
                               TII->get(AMDGPU::S_CSELECT_B64), DstReg)
                           .addReg(LaneValue->getOperand(0).getReg())

    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::S_SUB_U64_PSEUDO: {
          .addReg(LaneValue->getOperand(0).getReg());

  unsigned BITSETOpc =
      IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
  BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)

  ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);

  unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
      .addReg(NewActiveBitsReg)
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))

  MI.eraseFromParent();
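// EmitInstrWithCustomInserter: MachineInstr-level expansions for the remaining
// pseudos: wave reductions, 64-bit scalar/vector add/sub with carry-out,
// add/sub with carry-in, M0 initialization, S_GETREG-based counters, and the
// SI_INDIRECT_SRC/DST family.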
5914 switch (
MI.getOpcode()) {
5915 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5917 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5919 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5921 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5923 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5925 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5927 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5929 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
5931 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5933 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5935 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5937 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5939 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5941 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
5943 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5945 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
5947 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5949 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
5951 case AMDGPU::S_UADDO_PSEUDO:
5952 case AMDGPU::S_USUBO_PSEUDO: {
5959 unsigned Opc = (
MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5961 : AMDGPU::S_SUB_I32;
5972 MI.eraseFromParent();
5975 case AMDGPU::S_ADD_U64_PSEUDO:
5976 case AMDGPU::S_SUB_U64_PSEUDO: {
5979 case AMDGPU::V_ADD_U64_PSEUDO:
5980 case AMDGPU::V_SUB_U64_PSEUDO: {
5986 bool IsAdd = (
MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5992 if (ST.hasAddSubU64Insts()) {
5994 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
5995 : AMDGPU::V_SUB_U64_e64),
6000 TII->legalizeOperands(*I);
6001 MI.eraseFromParent();
6005 if (IsAdd && ST.hasLshlAddU64Inst()) {
6011 TII->legalizeOperands(*Add);
6012 MI.eraseFromParent();
6016 const auto *CarryRC = TRI->getWaveMaskRegClass();
6018 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6019 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6021 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6022 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6026 : &AMDGPU::VReg_64RegClass;
6029 : &AMDGPU::VReg_64RegClass;
6032 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6034 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6037 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6039 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6042 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6044 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6047 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6054 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6068 TII->legalizeOperands(*LoHalf);
6069 TII->legalizeOperands(*HiHalf);
6070 MI.eraseFromParent();
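// S_ADD_CO_PSEUDO / S_SUB_CO_PSEUDO: operands that may still live in VGPRs
// are moved to SGPRs with V_READFIRSTLANE_B32, the incoming carry is
// materialized into SCC via a wave-size dependent compare, and the scalar
// carry ops (S_ADDC_U32 / S_SUBB_U32) produce the result.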
6073 case AMDGPU::S_ADD_CO_PSEUDO:
6074 case AMDGPU::S_SUB_CO_PSEUDO: {
6088 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
6089 ? AMDGPU::S_ADDC_U32
6090 : AMDGPU::S_SUBB_U32;
6092 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6093 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6098 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6099 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6103 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6105 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6111 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
6112 assert(WaveSize == 64 || WaveSize == 32);
6114 if (WaveSize == 64) {
6115 if (ST.hasScalarCompareEq64()) {
6121 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6123 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6125 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6126 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6128 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6149 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6155 MI.eraseFromParent();
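// SI_INIT_M0: the init value is simply moved into M0; a register operand
// becomes a COPY, an immediate becomes S_MOV_B32.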
6158 case AMDGPU::SI_INIT_M0: {
6161 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6164 MI.eraseFromParent();
6167 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6170 TII->get(AMDGPU::S_CMP_EQ_U32))
6175 case AMDGPU::GET_GROUPSTATICSIZE: {
6180 .add(MI.getOperand(0))
6182 MI.eraseFromParent();
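// GET_SHADERCYCLESHILO reads SHADER_CYCLES_HI, SHADER_CYCLES, then
// SHADER_CYCLES_HI again; comparing the two HI reads lets the expansion pick
// a consistent 64-bit cycle count even if the low half wrapped between the
// reads (this is an interpretation of the three getreg reads below).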
6185 case AMDGPU::GET_SHADERCYCLESHILO: {
6200 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6202 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6203 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6205 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6206 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6208 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6212 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6217 .add(MI.getOperand(0))
6222 MI.eraseFromParent();
6225 case AMDGPU::SI_INDIRECT_SRC_V1:
6226 case AMDGPU::SI_INDIRECT_SRC_V2:
6227 case AMDGPU::SI_INDIRECT_SRC_V4:
6228 case AMDGPU::SI_INDIRECT_SRC_V8:
6229 case AMDGPU::SI_INDIRECT_SRC_V9:
6230 case AMDGPU::SI_INDIRECT_SRC_V10:
6231 case AMDGPU::SI_INDIRECT_SRC_V11:
6232 case AMDGPU::SI_INDIRECT_SRC_V12:
6233 case AMDGPU::SI_INDIRECT_SRC_V16:
6234 case AMDGPU::SI_INDIRECT_SRC_V32:
6236 case AMDGPU::SI_INDIRECT_DST_V1:
6237 case AMDGPU::SI_INDIRECT_DST_V2:
6238 case AMDGPU::SI_INDIRECT_DST_V4:
6239 case AMDGPU::SI_INDIRECT_DST_V8:
6240 case AMDGPU::SI_INDIRECT_DST_V9:
6241 case AMDGPU::SI_INDIRECT_DST_V10:
6242 case AMDGPU::SI_INDIRECT_DST_V11:
6243 case AMDGPU::SI_INDIRECT_DST_V12:
6244 case AMDGPU::SI_INDIRECT_DST_V16:
6245 case AMDGPU::SI_INDIRECT_DST_V32:
6247 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6248 case AMDGPU::SI_KILL_I1_PSEUDO:
6250 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
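// V_CNDMASK_B64_PSEUDO has no 64-bit hardware form: the low and high 32-bit
// halves are selected separately with the same condition mask (DstLo/DstHi
// below) and recombined into the 64-bit destination.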
6259 Register SrcCond = MI.getOperand(3).getReg();
6261 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6262 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6263 const auto *CondRC = TRI->getWaveMaskRegClass();
6264 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6268 : &AMDGPU::VReg_64RegClass;
6271 : &AMDGPU::VReg_64RegClass;
6274 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6276 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6279 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6281 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6284 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6286 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6307 MI.eraseFromParent();
6310 case AMDGPU::SI_BR_UNDEF: {
6314 .add(MI.getOperand(0));
6316 MI.eraseFromParent();
6319 case AMDGPU::ADJCALLSTACKUP:
6320 case AMDGPU::ADJCALLSTACKDOWN: {
6327 case AMDGPU::SI_CALL_ISEL: {
6331 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6334 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6340 MI.eraseFromParent();
6343 case AMDGPU::V_ADD_CO_U32_e32:
6344 case AMDGPU::V_SUB_CO_U32_e32:
6345 case AMDGPU::V_SUBREV_CO_U32_e32: {
6348 unsigned Opc = MI.getOpcode();
6350 bool NeedClampOperand = false;
6351 if (TII->pseudoToMCOpcode(Opc) == -1) {
6353 NeedClampOperand = true;
6357 if (TII->isVOP3(*I)) {
6362 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6363 if (NeedClampOperand)
6366 TII->legalizeOperands(*I);
6368 MI.eraseFromParent();
6371 case AMDGPU::V_ADDC_U32_e32:
6372 case AMDGPU::V_SUBB_U32_e32:
6373 case AMDGPU::V_SUBBREV_U32_e32:
6376 TII->legalizeOperands(MI);
6378 case AMDGPU::DS_GWS_INIT:
6379 case AMDGPU::DS_GWS_SEMA_BR:
6380 case AMDGPU::DS_GWS_BARRIER:
6381 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
6383 case AMDGPU::DS_GWS_SEMA_V:
6384 case AMDGPU::DS_GWS_SEMA_P:
6385 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6393 case AMDGPU::S_SETREG_B32: {
6409 const unsigned SetMask = WidthMask << Offset;
6412 unsigned SetDenormOp = 0;
6413 unsigned SetRoundOp = 0;
6421 SetRoundOp = AMDGPU::S_ROUND_MODE;
6422 SetDenormOp = AMDGPU::S_DENORM_MODE;
6424 SetRoundOp = AMDGPU::S_ROUND_MODE;
6426 SetDenormOp = AMDGPU::S_DENORM_MODE;
6429 if (SetRoundOp || SetDenormOp) {
6432 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6433 unsigned ImmVal = Def->getOperand(1).getImm();
6447 MI.eraseFromParent();
6456 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6460 case AMDGPU::S_INVERSE_BALLOT_U32:
6461 case AMDGPU::S_INVERSE_BALLOT_U64:
6464 MI.setDesc(TII->get(AMDGPU::COPY));
6466 case AMDGPU::ENDPGM_TRAP: {
6469 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6489 MI.eraseFromParent();
6492 case AMDGPU::SIMULATED_TRAP: {
6493 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6496 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6497 MI.eraseFromParent();
6500 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6501 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6507 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6508 Register OriginalExec = Setup->getOperand(0).getReg();
6510 MI.getOperand(0).setReg(OriginalExec);
6547 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6551 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6578 if (!Subtarget->hasMadMacF32Insts())
6579 return Subtarget->hasFastFMAF32();
6585 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6588 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6604 switch (Ty.getScalarSizeInBits()) {
6622 if (Ty.getScalarSizeInBits() == 16)
6624 if (Ty.getScalarSizeInBits() == 32)
6625 return Subtarget->hasMadMacF32Insts() &&
6635 EVT VT = N->getValueType(0);
6637 return Subtarget->hasMadMacF32Insts() &&
6639 if (VT == MVT::f16) {
6640 return Subtarget->hasMadF16() &&
6655 unsigned Opc = Op.getOpcode();
6656 EVT VT = Op.getValueType();
6657 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
6658 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
6659 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6660 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
6676 [[maybe_unused]] EVT VT = Op.getValueType();
6678 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6679 VT == MVT::v16i32) &&
6680 "Unexpected ValueType.");
6689 unsigned Opc = Op.getOpcode();
6690 EVT VT = Op.getValueType();
6691 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6692 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6693 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6694 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6695 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6696 VT == MVT::v32bf16);
6704 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6706 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6713 unsigned Opc = Op.getOpcode();
6714 EVT VT = Op.getValueType();
6715 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6716 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6717 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6718 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6719 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6720 VT == MVT::v32bf16);
6725 : std::pair(Op0, Op0);
6734 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6736 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6742 switch (Op.getOpcode()) {
6746 return LowerBRCOND(Op, DAG);
6748 return LowerRETURNADDR(Op, DAG);
6751 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6752 "Load should return a value and a chain");
6756 EVT VT = Op.getValueType();
6758 return lowerFSQRTF32(Op, DAG);
6760 return lowerFSQRTF64(Op, DAG);
6765 return LowerTrig(Op, DAG);
6767 return LowerSELECT(Op, DAG);
6769 return LowerFDIV(Op, DAG);
6771 return LowerFFREXP(Op, DAG);
6772 case ISD::ATOMIC_CMP_SWAP:
6773 return LowerATOMIC_CMP_SWAP(Op, DAG);
6775 return LowerSTORE(Op, DAG);
6779 return LowerGlobalAddress(MFI, Op, DAG);
6782 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6784 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6786 return LowerINTRINSIC_VOID(Op, DAG);
6787 case ISD::ADDRSPACECAST:
6788 return lowerADDRSPACECAST(Op, DAG);
6790 return lowerINSERT_SUBVECTOR(Op, DAG);
6792 return lowerINSERT_VECTOR_ELT(Op, DAG);
6794 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6796 return lowerVECTOR_SHUFFLE(Op, DAG);
6798 return lowerSCALAR_TO_VECTOR(Op, DAG);
6800 return lowerBUILD_VECTOR(Op, DAG);
6803 return lowerFP_ROUND(Op, DAG);
6805 return lowerTRAP(Op, DAG);
6806 case ISD::DEBUGTRAP:
6807 return lowerDEBUGTRAP(Op, DAG);
6816 return lowerFMINNUM_FMAXNUM(Op, DAG);
6817 case ISD::FMINIMUMNUM:
6818 case ISD::FMAXIMUMNUM:
6819 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6822 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6825 return lowerFLDEXP(Op, DAG);
6842 case ISD::FMINNUM_IEEE:
6843 case ISD::FMAXNUM_IEEE:
6850 return lowerFCOPYSIGN(Op, DAG);
6852 return lowerMUL(Op, DAG);
6855 return lowerXMULO(Op, DAG);
6858 return lowerXMUL_LOHI(Op, DAG);
6859 case ISD::DYNAMIC_STACKALLOC:
6861 case ISD::STACKSAVE:
6865 case ISD::SET_ROUNDING:
6869 case ISD::FP_EXTEND:
6872 case ISD::GET_FPENV:
6874 case ISD::SET_FPENV:
6893 EVT FittingLoadVT = LoadVT;
6918 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6922 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
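// adjustLoadValueType (below) widens a d16 memory intrinsic to the type the
// underlying instruction really returns (on unpacked-d16 targets each 16-bit
// element occupies a full 32-bit register) and converts the result back to
// the requested type, as summarized by the EquivLoadVT handling.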
6925 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6928 bool IsIntrinsic) const {
6931 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6932 EVT LoadVT = M->getValueType(0);
6934 EVT EquivLoadVT = LoadVT;
6948 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6952 M->getMemoryVT(), M->getMemOperand());
6963 EVT LoadVT = M->getValueType(0);
6969 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6970 bool IsTFE = M->getNumValues() == 3;
6983 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6987 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6988 M->getMemOperand(), DAG);
6992 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6994 M->getMemOperand(), DAG);
7002 EVT VT = N->getValueType(0);
7003 unsigned CondCode = N->getConstantOperandVal(3);
7014 EVT CmpVT = LHS.getValueType();
7015 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
7016 unsigned PromoteOp =
7036 EVT VT = N->getValueType(0);
7038 unsigned CondCode = N->getConstantOperandVal(3);
7047 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7048 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7049 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7065 EVT VT =
N->getValueType(0);
7072 Src.getOperand(1), Src.getOperand(2));
7083 Exec = AMDGPU::EXEC_LO;
7085 Exec = AMDGPU::EXEC;
7102 EVT VT = N->getValueType(0);
7104 unsigned IID = N->getConstantOperandVal(0);
7105 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7106 IID == Intrinsic::amdgcn_permlanex16;
7107 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7108 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7112 unsigned SplitSize = 32;
7113 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7114 ST->hasDPALU_DPP() &&
7122 case Intrinsic::amdgcn_permlane16:
7123 case Intrinsic::amdgcn_permlanex16:
7124 case Intrinsic::amdgcn_update_dpp:
7129 case Intrinsic::amdgcn_writelane:
7132 case Intrinsic::amdgcn_readlane:
7133 case Intrinsic::amdgcn_set_inactive:
7134 case Intrinsic::amdgcn_set_inactive_chain_arg:
7135 case Intrinsic::amdgcn_mov_dpp8:
7138 case Intrinsic::amdgcn_readfirstlane:
7139 case Intrinsic::amdgcn_permlane64:
7149 if (SDNode *GL = N->getGluedNode()) {
7150 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7151 GL = GL->getOperand(0).getNode();
7152 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7161 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7162 IID == Intrinsic::amdgcn_mov_dpp8 ||
7163 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7164 Src1 = N->getOperand(2);
7165 if (IID == Intrinsic::amdgcn_writelane ||
7166 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7167 Src2 = N->getOperand(3);
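// Lane intrinsics operate on SplitSize-bit pieces (32 bits, or 64 bits when
// DPALU DPP is available): a value that already matches SplitSize is bitcast
// to i32 and lowered directly; anything wider is chopped into SplitSize
// sub-vectors, each piece gets its own lane op, and the pieces are
// concatenated back together (see the EltIdx loop below).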
7170 if (ValSize == SplitSize) {
7180 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7185 if (IID == Intrinsic::amdgcn_writelane) {
7190 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7192 return IsFloat ? DAG.
getBitcast(VT, Trunc) : Trunc;
7195 if (ValSize % SplitSize != 0)
7199 EVT VT = N->getValueType(0);
7203 unsigned NumOperands = N->getNumOperands();
7205 SDNode *GL = N->getGluedNode();
7210 for (unsigned i = 0; i != NE; ++i) {
7211 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7213 SDValue Operand = N->getOperand(j);
7228 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7243 if (SplitSize == 32) {
7245 return unrollLaneOp(LaneOp.getNode());
7251 unsigned SubVecNumElt =
7255 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7256 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7260 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7265 if (IID == Intrinsic::amdgcn_writelane)
7270 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7271 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7272 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7273 EltIdx += SubVecNumElt;
7287 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7290 if (IID == Intrinsic::amdgcn_writelane)
7293 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7301 switch (N->getOpcode()) {
7313 unsigned IID = N->getConstantOperandVal(0);
7315 case Intrinsic::amdgcn_make_buffer_rsrc:
7316 Results.push_back(lowerPointerAsRsrcIntrin(
N, DAG));
7318 case Intrinsic::amdgcn_cvt_pkrtz: {
7324 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7327 case Intrinsic::amdgcn_cvt_pknorm_i16:
7328 case Intrinsic::amdgcn_cvt_pknorm_u16:
7329 case Intrinsic::amdgcn_cvt_pk_i16:
7330 case Intrinsic::amdgcn_cvt_pk_u16: {
7336 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7338 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7340 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7345 EVT VT = N->getValueType(0);
7350 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7354 case Intrinsic::amdgcn_s_buffer_load: {
7360 if (!Subtarget->hasScalarSubwordLoads())
7366 EVT VT = Op.getValueType();
7367 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7379 if (!Offset->isDivergent()) {
7398 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7403 case Intrinsic::amdgcn_dead: {
7404 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7415 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7416 Results.push_back(Res.getOperand(I));
7420 Results.push_back(Res.getValue(1));
7429 EVT VT = N->getValueType(0);
7434 EVT SelectVT = NewVT;
7435 if (NewVT.bitsLT(MVT::i32)) {
7438 SelectVT = MVT::i32;
7444 if (NewVT != SelectVT)
7450 if (N->getValueType(0) != MVT::v2f16)
7454 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7462 if (N->getValueType(0) != MVT::v2f16)
7466 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7474 if (N->getValueType(0) != MVT::f16)
7489 if (U.get() != Value)
7492 if (U.getUser()->getOpcode() == Opcode)
7498 unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7501 case Intrinsic::amdgcn_if:
7503 case Intrinsic::amdgcn_else:
7505 case Intrinsic::amdgcn_loop:
7507 case Intrinsic::amdgcn_end_cf:
7527 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7554 SDNode *Intr = BRCOND.getOperand(1).getNode();
7567 assert(BR && "brcond missing unconditional branch user");
7571 unsigned CFNode = isCFIntrinsic(Intr);
7591 Ops.push_back(Target);
7614 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7633 MVT VT = Op.getSimpleValueType();
7636 if (Op.getConstantOperandVal(0) != 0)
7640 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7642 if (Info->isEntryFunction())
7659 return Op.getValueType().bitsLE(VT)
7667 EVT DstVT = Op.getValueType();
7674 unsigned Opc = Op.getOpcode();
7686 EVT SrcVT = Src.getValueType();
7687 EVT DstVT = Op.getValueType();
7690 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7693 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7700 if (DstVT == MVT::f16) {
7705 if (!Subtarget->has16BitInsts()) {
7708 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7710 if (Op->getFlags().hasApproximateFuncs()) {
7717 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7721 "custom lower FP_ROUND for f16 or bf16");
7722 assert(Subtarget->hasBF16ConversionInsts() &&
"f32 -> bf16 is legal");
7735 EVT VT = Op.getValueType();
7737 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7738 bool IsIEEEMode = Info->getMode().IEEE;
7747 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7754 SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7756 EVT VT = Op.getValueType();
7758 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7759 bool IsIEEEMode = Info->getMode().IEEE;
7764 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7772 EVT VT = Op.getValueType();
7776 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7777 !Subtarget->hasMinimum3Maximum3F16() &&
7778 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7779 "should not need to widen f16 minimum/maximum to v2f16");
7793 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7801 EVT VT = Op.getValueType();
7805 EVT ExpVT = Exp.getValueType();
7806 if (ExpVT == MVT::i16)
7827 {
Op.getOperand(0),
Op.getOperand(1), TruncExp});
7830 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7834 switch (Op->getOpcode()) {
7864 DAGCombinerInfo &DCI) const {
7865 const unsigned Opc = Op.getOpcode();
7873 : Op->getOperand(0).getValueType();
7876 if (DCI.isBeforeLegalizeOps() ||
7880 auto &DAG = DCI.DAG;
7886 LHS = Op->getOperand(1);
7887 RHS = Op->getOperand(2);
7889 LHS = Op->getOperand(0);
7890 RHS = Op->getOperand(1);
7929 if (MagVT == SignVT)
7936 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7939 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7946 EVT VT = Op.getValueType();
7952 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
7979 if (
Op->isDivergent())
7992 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7994 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7997 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7999 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
8005 EVT VT = Op.getValueType();
8012 const APInt &C = RHSC->getAPIntValue();
8014 if (C.isPowerOf2()) {
8016 bool UseArithShift = isSigned && !C.isMinSignedValue();
8043 if (
Op->isDivergent()) {
8047 if (Subtarget->hasSMulHi()) {
8058 if (!Subtarget->isTrapHandlerEnabled() ||
8060 return lowerTrapEndpgm(Op, DAG);
8062 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8063 : lowerTrapHsaQueuePtr(Op, DAG);
8073 SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8075 ImplicitParameter Param) const {
8095 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
8098 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8101 if (UserSGPR == AMDGPU::NoRegister) {
8127 if (Subtarget->hasPrivEnabledTrap2NopBug())
8140 if (!Subtarget->isTrapHandlerEnabled() ||
8144 "debugtrap handler not supported",
8155 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
8157 if (Subtarget->hasApertureRegs()) {
8159 ? AMDGPU::SRC_SHARED_BASE
8160 : AMDGPU::SRC_PRIVATE_BASE;
8161 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8162 !Subtarget->hasGloballyAddressableScratch()) &&
8163 "Cannot use src_private_base with globally addressable scratch!");
8186 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
8195 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
8199 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8201 if (UserSGPR == AMDGPU::NoRegister) {
8235 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8246 const AMDGPUTargetMachine &TM =
8249 unsigned DestAS, SrcAS;
8251 bool IsNonNull = false;
8253 SrcAS = ASC->getSrcAddressSpace();
8254 Src = ASC->getOperand(0);
8255 DestAS = ASC->getDestAddressSpace();
8258 Op.getConstantOperandVal(0) ==
8259 Intrinsic::amdgcn_addrspacecast_nonnull);
8260 Src = Op->getOperand(1);
8261 SrcAS = Op->getConstantOperandVal(2);
8262 DestAS = Op->getConstantOperandVal(3);
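// Casting between flat and a segment (local/private) address space combines
// the 32-bit segment offset with the segment aperture (or with the flat
// scratch base when globally addressable scratch is enabled) and, unless the
// cast is known non-null, selects the target's null value for a null source
// pointer.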
8275 Subtarget->hasGloballyAddressableScratch()) {
8280 AMDGPU::S_MOV_B32, SL, MVT::i32,
8281 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8289 unsigned NullVal = TM.getNullPointerValue(DestAS);
8304 Subtarget->hasGloballyAddressableScratch()) {
8313 if (Subtarget->isWave64())
8319 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8322 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8327 AMDGPU::S_MOV_B64, SL, MVT::i64,
8328 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8330 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8332 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8334 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8340 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8352 Op.getValueType() == MVT::i64) {
8353 const SIMachineFunctionInfo *Info =
8357 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8361 Src.getValueType() == MVT::i64)
8381 EVT InsVT = Ins.getValueType();
8389 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8394 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8396 MVT::i32, InsNumElts / 2);
8398 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8399 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8401 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8403 if (InsNumElts == 2) {
8413 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8416 for (unsigned I = 0; I != InsNumElts; ++I) {
8439 if (NumElts == 4 && EltSize == 16 && KIdx) {
8447 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8448 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8450 unsigned Idx = KIdx->getZExtValue();
8451 bool InsertLo = Idx < 2;
8454 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8455 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8457 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8461 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8474 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8502 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8509 EVT ResultVT = Op.getValueType();
8522 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8525 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8529 if (VecSize == 128) {
8537 }
else if (VecSize == 256) {
8540 for (
unsigned P = 0;
P < 4; ++
P) {
8546 Parts[0], Parts[1]));
8548 Parts[2], Parts[3]));
8554 for (unsigned P = 0; P < 8; ++P) {
8561 Parts[0], Parts[1], Parts[2], Parts[3]));
8564 Parts[4], Parts[5], Parts[6], Parts[7]));
8584 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8599 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8601 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8609 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8614 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8615 !(Mask[Elt + 1] & 1);
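// The two mask helpers above appear to recognize pairs of shuffle indices
// that can be fetched as one aligned 2-element sub-shuffle; lowerVECTOR_SHUFFLE
// below uses them to choose between consecutive extraction and per-element
// extract/insert (a reading of the helpers, not documented behaviour).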
8621 EVT ResultVT = Op.getValueType();
8624 const int NewSrcNumElts = 2;
8626 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8642 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8664 if (ShouldUseConsecutiveExtract &&
8667 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8668 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8680 if (Idx0 >= SrcNumElts) {
8685 if (Idx1 >= SrcNumElts) {
8690 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8691 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8699 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8700 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8705 if (SubVec0 != SubVec1) {
8706 NewMaskIdx1 += NewSrcNumElts;
8713 {NewMaskIdx0, NewMaskIdx1});
8718 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8719 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8720 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8721 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8740 EVT ResultVT = Op.getValueType();
8756 EVT VT = Op.getValueType();
8758 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8759 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8768 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8777 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8784 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8793 for (unsigned P = 0; P < NumParts; ++P) {
8795 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8801 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
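// BUILD_VECTOR of 2 x 16-bit elements without packed-math support is done in
// scalar registers: the low element is extended, the high element is shifted
// left by 16, the two are OR'd, and the i32 is bitcast back to the vector
// type (the ExtLo / ShlHi / Or values above). Wider 16-bit vectors are built
// from v2 parts the same way.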
8814 if (!Subtarget->isAmdHsaOS())
8874 EVT PtrVT = Op.getValueType();
8876 const GlobalValue *GV = GSD->getGlobal();
8890 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8908 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8909 if (Subtarget->has64BitLiterals()) {
8940 MachinePointerInfo PtrInfo =
8968 SDValue Param = lowerKernargMemParameter(
8979 "non-hsa intrinsic with hsa target",
DL.getDebugLoc()));
8987 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
8995 unsigned NumElts = Elts.
size();
8997 if (NumElts <= 12) {
9006 for (unsigned i = 0; i < Elts.size(); ++i) {
9012 for (unsigned i = Elts.size(); i < NumElts; ++i)
9022 EVT SrcVT = Src.getValueType();
9043 bool Unpacked, bool IsD16, int DMaskPop,
9044 int NumVDataDwords, bool IsAtomicPacked16Bit,
9048 EVT ReqRetVT = ResultTypes[0];
9050 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9051 ? (ReqRetNumElts + 1) / 2
9054 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9065 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9076 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9078 NumDataDwords - MaskPopDwords);
9083 EVT LegalReqRetVT = ReqRetVT;
9085 if (!Data.getValueType().isInteger())
9087 Data.getValueType().changeTypeToInteger(), Data);
9108 if (Result->getNumValues() == 1)
9115 SDValue *LWE, bool &IsTexFail) {
9135 unsigned DimIdx, unsigned EndIdx,
9136 unsigned NumGradients) {
9138 for (unsigned I = DimIdx; I < EndIdx; I++) {
9146 if (((I + 1) >= EndIdx) ||
9147 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9148 I == DimIdx + NumGradients - 1))) {
9167 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9181 int NumVDataDwords = 0;
9182 bool AdjustRetType =
false;
9183 bool IsAtomicPacked16Bit =
false;
9186 const unsigned ArgOffset = WithChain ? 2 : 1;
9189 unsigned DMaskLanes = 0;
9191 if (BaseOpcode->Atomic) {
9192 VData = Op.getOperand(2);
9194 IsAtomicPacked16Bit =
9195 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9196 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
9199 if (BaseOpcode->AtomicX2) {
9206 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9207 DMask = Is64Bit ? 0xf : 0x3;
9208 NumVDataDwords = Is64Bit ? 4 : 2;
9210 DMask = Is64Bit ? 0x3 : 0x1;
9211 NumVDataDwords = Is64Bit ? 2 : 1;
9214 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
9217 if (BaseOpcode->Store) {
9218 VData = Op.getOperand(2);
9222 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9226 VData = handleD16VData(VData, DAG, true);
9229 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9230 } else if (!BaseOpcode->NoReturn) {
9235 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9243 (!LoadVT.isVector() && DMaskLanes > 1))
9249 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9250 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9251 NumVDataDwords = (DMaskLanes + 1) / 2;
9253 NumVDataDwords = DMaskLanes;
9255 AdjustRetType = true;
9259 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9266 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9267 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9269 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9271 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9272 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9276 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9282 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9286 "Bias needs to be converted to 16 bit in A16 mode");
9291 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9295 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9296 "require 16 bit args for both gradients and addresses");
9301 if (!ST->hasA16()) {
9302 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9303 "support 16 bit addresses\n");
9313 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9315 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9317 IntrOpcode = G16MappingInfo->G16;
9340 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9358 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9359 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9360 const bool UseNSA = ST->hasNSAEncoding() &&
9361 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9362 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9363 const bool UsePartialNSA =
9364 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9367 if (UsePartialNSA) {
9369 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9370 } else if (!UseNSA) {
9377 if (!BaseOpcode->Sampler) {
9380 uint64_t UnormConst =
9381 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9383 Unorm = UnormConst ? True : False;
9389 bool IsTexFail = false;
9390 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9401 NumVDataDwords += 1;
9402 AdjustRetType = true;
9407 if (AdjustRetType) {
9410 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9419 MVT::i32, NumVDataDwords)
9422 ResultTypes[0] = NewVT;
9423 if (ResultTypes.size() == 3) {
9427 ResultTypes.erase(&ResultTypes[1]);
9432 if (BaseOpcode->Atomic)
9439 if (BaseOpcode->Store || BaseOpcode->Atomic)
9440 Ops.push_back(VData);
9441 if (UsePartialNSA) {
9443 Ops.push_back(VAddr);
9447 Ops.push_back(VAddr);
9450 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9452 Ops.push_back(Rsrc);
9453 if (BaseOpcode->Sampler) {
9457 Ops.push_back(Samp);
9462 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9463 Ops.push_back(Unorm);
9465 Ops.push_back(IsA16 &&
9466 ST->hasFeature(AMDGPU::FeatureR128A16)
9470 Ops.push_back(IsA16 ? True : False);
9472 if (!Subtarget->hasGFX90AInsts())
9477 "TFE is not supported on this GPU",
DL.getDebugLoc()));
9480 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9483 Ops.push_back(DimInfo->DA ? True : False);
9484 if (BaseOpcode->HasD16)
9485 Ops.push_back(IsD16 ? True : False);
9487 Ops.push_back(Op.getOperand(0));
9489 int NumVAddrDwords =
9495 NumVDataDwords, NumVAddrDwords);
9496 } else if (IsGFX11Plus) {
9498 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9499 : AMDGPU::MIMGEncGfx11Default,
9500 NumVDataDwords, NumVAddrDwords);
9501 } else if (IsGFX10Plus) {
9503 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9504 : AMDGPU::MIMGEncGfx10Default,
9505 NumVDataDwords, NumVAddrDwords);
9507 if (Subtarget->hasGFX90AInsts()) {
9509 NumVDataDwords, NumVAddrDwords);
9513 "requested image instruction is not supported on this GPU",
9518 for (EVT VT : OrigResultTypes) {
9519 if (VT == MVT::Other)
9520 RetValues[Idx++] = Op.getOperand(0);
9531 NumVDataDwords, NumVAddrDwords);
9534 NumVDataDwords, NumVAddrDwords);
9541 MachineMemOperand *MemRef = MemOp->getMemOperand();
9545 if (BaseOpcode->AtomicX2) {
9550 if (BaseOpcode->NoReturn)
9553 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9554 NumVDataDwords, IsAtomicPacked16Bit, DL);
9567 MachinePointerInfo(),
9572 if (!Offset->isDivergent()) {
9579 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9588 !Subtarget->hasScalarDwordx3Loads()) {
9615 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9617 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9621 unsigned NumLoads = 1;
9627 if (NumElts == 8 || NumElts == 16) {
9628 NumLoads = NumElts / 4;
9632 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9637 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9639 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9640 for (unsigned i = 0; i < NumLoads; ++i) {
9646 if (NumElts == 8 || NumElts == 16)
9654 if (!Subtarget->hasArchitectedSGPRs())
9666 unsigned Width)
const {
9668 using namespace AMDGPU::Hwreg;
9670 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9709 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9711 EVT VT = Op.getValueType();
9713 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9717 switch (IntrinsicID) {
9718 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9721 return getPreloadedValue(DAG, *MFI, VT,
9724 case Intrinsic::amdgcn_dispatch_ptr:
9725 case Intrinsic::amdgcn_queue_ptr: {
9726 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9728 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9733 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9736 return getPreloadedValue(DAG, *MFI, VT, RegID);
9738 case Intrinsic::amdgcn_implicitarg_ptr: {
9740 return getImplicitArgPtr(DAG,
DL);
9741 return getPreloadedValue(DAG, *MFI, VT,
9744 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9750 return getPreloadedValue(DAG, *MFI, VT,
9753 case Intrinsic::amdgcn_dispatch_id: {
9756 case Intrinsic::amdgcn_rcp:
9758 case Intrinsic::amdgcn_rsq:
9760 case Intrinsic::amdgcn_rsq_legacy:
9764 case Intrinsic::amdgcn_rcp_legacy:
9768 case Intrinsic::amdgcn_rsq_clamp: {
9779 return DAG.
getNode(ISD::FMAXNUM,
DL, VT, Tmp,
9782 case Intrinsic::r600_read_ngroups_x:
9783 if (Subtarget->isAmdHsaOS())
9786 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9789 case Intrinsic::r600_read_ngroups_y:
9790 if (Subtarget->isAmdHsaOS())
9793 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9796 case Intrinsic::r600_read_ngroups_z:
9797 if (Subtarget->isAmdHsaOS())
9800 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9803 case Intrinsic::r600_read_local_size_x:
9804 if (Subtarget->isAmdHsaOS())
9807 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9809 case Intrinsic::r600_read_local_size_y:
9810 if (Subtarget->isAmdHsaOS())
9813 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9815 case Intrinsic::r600_read_local_size_z:
9816 if (Subtarget->isAmdHsaOS())
9819 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9821 case Intrinsic::amdgcn_workgroup_id_x:
9822 return lowerWorkGroupId(DAG, *MFI, VT,
9826 case Intrinsic::amdgcn_workgroup_id_y:
9827 return lowerWorkGroupId(DAG, *MFI, VT,
9831 case Intrinsic::amdgcn_workgroup_id_z:
9832 return lowerWorkGroupId(DAG, *MFI, VT,
9836 case Intrinsic::amdgcn_cluster_id_x:
9837 return Subtarget->hasClusters()
9838 ? getPreloadedValue(DAG, *MFI, VT,
9840 : DAG.getPOISON(VT);
9841 case Intrinsic::amdgcn_cluster_id_y:
9842 return Subtarget->hasClusters()
9843 ? getPreloadedValue(DAG, *MFI, VT,
9846 case Intrinsic::amdgcn_cluster_id_z:
9847 return Subtarget->hasClusters()
9848 ? getPreloadedValue(DAG, *MFI, VT,
9851 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9852 return Subtarget->hasClusters()
9853 ? getPreloadedValue(
9857 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9858 return Subtarget->hasClusters()
9859 ? getPreloadedValue(
9863 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9864 return Subtarget->hasClusters()
9865 ? getPreloadedValue(
9869 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9870 return Subtarget->hasClusters()
9873 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9874 return Subtarget->hasClusters()
9875 ? getPreloadedValue(
9879 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9880 return Subtarget->hasClusters()
9881 ? getPreloadedValue(
9885 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9886 return Subtarget->hasClusters()
9887 ? getPreloadedValue(
9891 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
9892 return Subtarget->hasClusters()
9893 ? getPreloadedValue(
9897 case Intrinsic::amdgcn_wave_id:
9898 return lowerWaveID(DAG,
Op);
9899 case Intrinsic::amdgcn_lds_kernel_id: {
9901 return getLDSKernelId(DAG,
DL);
9902 return getPreloadedValue(DAG, *MFI, VT,
9905 case Intrinsic::amdgcn_workitem_id_x:
9906 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
9907 case Intrinsic::amdgcn_workitem_id_y:
9908 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
9909 case Intrinsic::amdgcn_workitem_id_z:
9910 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
9911 case Intrinsic::amdgcn_wavefrontsize:
9913 SDLoc(
Op), MVT::i32);
9914 case Intrinsic::amdgcn_s_buffer_load: {
9915 unsigned CPol =
Op.getConstantOperandVal(3);
9922 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
9923 Op.getOperand(3), DAG);
9925 case Intrinsic::amdgcn_fdiv_fast:
9926 return lowerFDIV_FAST(
Op, DAG);
9927 case Intrinsic::amdgcn_sin:
9930 case Intrinsic::amdgcn_cos:
9933 case Intrinsic::amdgcn_mul_u24:
9936 case Intrinsic::amdgcn_mul_i24:
9940 case Intrinsic::amdgcn_log_clamp: {
9946 case Intrinsic::amdgcn_fract:
9949 case Intrinsic::amdgcn_class:
9952 case Intrinsic::amdgcn_div_fmas:
9954 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
9956 case Intrinsic::amdgcn_div_fixup:
9958 Op.getOperand(2),
Op.getOperand(3));
9960 case Intrinsic::amdgcn_div_scale: {
9973 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
9976 Denominator, Numerator);
9978 case Intrinsic::amdgcn_icmp: {
9980 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
9981 Op.getConstantOperandVal(2) == 0 &&
9986 case Intrinsic::amdgcn_fcmp: {
9989 case Intrinsic::amdgcn_ballot:
9991 case Intrinsic::amdgcn_fmed3:
9993 Op.getOperand(2),
Op.getOperand(3));
9994 case Intrinsic::amdgcn_fdot2:
9996 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
9997 case Intrinsic::amdgcn_fmul_legacy:
10000 case Intrinsic::amdgcn_sffbh:
10002 case Intrinsic::amdgcn_sbfe:
10004 Op.getOperand(2),
Op.getOperand(3));
10005 case Intrinsic::amdgcn_ubfe:
10007 Op.getOperand(2),
Op.getOperand(3));
10008 case Intrinsic::amdgcn_cvt_pkrtz:
10009 case Intrinsic::amdgcn_cvt_pknorm_i16:
10010 case Intrinsic::amdgcn_cvt_pknorm_u16:
10011 case Intrinsic::amdgcn_cvt_pk_i16:
10012 case Intrinsic::amdgcn_cvt_pk_u16: {
10014 EVT VT =
Op.getValueType();
10017 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10019 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10021 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10023 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10029 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
10032 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
10033 return DAG.
getNode(ISD::BITCAST,
DL, VT, Node);
10035 case Intrinsic::amdgcn_fmad_ftz:
10037 Op.getOperand(2),
Op.getOperand(3));
10039 case Intrinsic::amdgcn_if_break:
10041 Op->getOperand(1),
Op->getOperand(2)),
10044 case Intrinsic::amdgcn_groupstaticsize: {
10050 const GlobalValue *GV =
10056 case Intrinsic::amdgcn_is_shared:
10057 case Intrinsic::amdgcn_is_private: {
10060 DAG.
getNode(ISD::BITCAST,
DL, MVT::v2i32,
Op.getOperand(1));
10064 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10068 Subtarget->hasGloballyAddressableScratch()) {
10071 AMDGPU::S_MOV_B32, DL, MVT::i32,
10072 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10081 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10084 case Intrinsic::amdgcn_perm:
10086 Op.getOperand(2),
Op.getOperand(3));
10087 case Intrinsic::amdgcn_reloc_constant: {
10097 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10098 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10099 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10100 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10101 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10102 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10103 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10104 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10105 if (
Op.getOperand(4).getValueType() == MVT::i32)
10111 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10112 Op.getOperand(3), IndexKeyi32);
10114 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10115 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10116 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10117 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10118 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10119 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10120 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10121 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10122 if (
Op.getOperand(4).getValueType() == MVT::i64)
10128 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10129 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10130 Op.getOperand(6)});
10132 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10133 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10134 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10135 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10136 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10137 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10138 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10141 if (
Op.getOperand(6).getValueType() == IndexKeyTy)
10147 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10148 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10149 IndexKey, Op.getOperand(7),
10150 Op.getOperand(8)});
10152 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10153 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10154 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10155 if (
Op.getOperand(6).getValueType() == MVT::i32)
10161 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10162 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10163 IndexKeyi32, Op.getOperand(7)});
10165 case Intrinsic::amdgcn_addrspacecast_nonnull:
10166 return lowerADDRSPACECAST(
Op, DAG);
10167 case Intrinsic::amdgcn_readlane:
10168 case Intrinsic::amdgcn_readfirstlane:
10169 case Intrinsic::amdgcn_writelane:
10170 case Intrinsic::amdgcn_permlane16:
10171 case Intrinsic::amdgcn_permlanex16:
10172 case Intrinsic::amdgcn_permlane64:
10173 case Intrinsic::amdgcn_set_inactive:
10174 case Intrinsic::amdgcn_set_inactive_chain_arg:
10175 case Intrinsic::amdgcn_mov_dpp8:
10176 case Intrinsic::amdgcn_update_dpp:
10178 case Intrinsic::amdgcn_dead: {
10180 for (
const EVT ValTy :
Op.getNode()->values())
10185 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10187 return lowerImage(Op, ImageDimIntr, DAG, false);
10198 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10204 unsigned NewOpcode) const {
10208 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10209 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10227 M->getMemOperand());
10232 unsigned NewOpcode) const {
10236 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10237 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10255 M->getMemOperand());
10260 unsigned IntrID = Op.getConstantOperandVal(1);
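// amdgcn.ds.ordered.add/swap: the index operand packs the ordered-count index
// and, on some targets, a dword count; the lowering below validates those
// fields and re-encodes them, together with wave_release/wave_done and the
// shader type, into the offset fields of the DS ordered-count instruction.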
10264 case Intrinsic::amdgcn_ds_ordered_add:
10265 case Intrinsic::amdgcn_ds_ordered_swap: {
10270 unsigned IndexOperand = M->getConstantOperandVal(7);
10271 unsigned WaveRelease = M->getConstantOperandVal(8);
10272 unsigned WaveDone = M->getConstantOperandVal(9);
10274 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10275 IndexOperand &= ~0x3f;
10276 unsigned CountDw = 0;
10279 CountDw = (IndexOperand >> 24) & 0xf;
10280 IndexOperand &= ~(0xf << 24);
10282 if (CountDw < 1 || CountDw > 4) {
10285 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10286 DL.getDebugLoc()));
10291 if (IndexOperand) {
10294 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10297 if (WaveDone && !WaveRelease) {
10301 Fn, "ds_ordered_count: wave_done requires wave_release",
10302 DL.getDebugLoc()));
10305 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10306 unsigned ShaderType =
10308 unsigned Offset0 = OrderedCountIndex << 2;
10309 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10312 Offset1 |= (CountDw - 1) << 6;
10315 Offset1 |= ShaderType << 2;
10317 unsigned Offset = Offset0 | (Offset1 << 8);
10324 M->getVTList(), Ops, M->getMemoryVT(),
10325 M->getMemOperand());
10327 case Intrinsic::amdgcn_raw_buffer_load:
10328 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10329 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10330 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10331 case Intrinsic::amdgcn_raw_buffer_load_format:
10332 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10333 const bool IsFormat =
10334 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10335 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10337 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10338 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10352 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10354 case Intrinsic::amdgcn_struct_buffer_load:
10355 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10356 case Intrinsic::amdgcn_struct_buffer_load_format:
10357 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10358 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10359 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10360 const bool IsFormat =
10361 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10362 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10364 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10365 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10380 case Intrinsic::amdgcn_raw_tbuffer_load:
10381 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10383 EVT LoadVT = Op.getValueType();
10384 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10385 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10404 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10407 case Intrinsic::amdgcn_struct_tbuffer_load:
10408 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10410 EVT LoadVT = Op.getValueType();
10411 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10412 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10431 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10434 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10435 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10437 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10438 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10439 return lowerStructBufferAtomicIntrin(
Op, DAG,
10441 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10442 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10444 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10445 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10446 return lowerStructBufferAtomicIntrin(
Op, DAG,
10448 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10449 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10451 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10452 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10453 return lowerStructBufferAtomicIntrin(
Op, DAG,
10455 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10456 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10458 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10459 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10461 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10462 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10464 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10465 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10467 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10468 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10470 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10471 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10473 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10474 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10476 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10477 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10479 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10480 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10482 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10483 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10485 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10486 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10488 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10489 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10491 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10492 return lowerRawBufferAtomicIntrin(
Op, DAG,
10494 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10495 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10496 return lowerStructBufferAtomicIntrin(
Op, DAG,
10498 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10499 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10501 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10502 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10504 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10505 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10506 return lowerStructBufferAtomicIntrin(
Op, DAG,
10508 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10509 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10510 return lowerStructBufferAtomicIntrin(
Op, DAG,
10512 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10513 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10514 return lowerStructBufferAtomicIntrin(
Op, DAG,
10516 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10517 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10518 return lowerStructBufferAtomicIntrin(
Op, DAG,
10520 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10521 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10523 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10524 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10526 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10527 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10529 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10530 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10532 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10533 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10535 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10536 return lowerStructBufferAtomicIntrin(
Op, DAG,
10539 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10540 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10541 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10542 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10556 EVT VT = Op.getValueType();
10560 Op->getVTList(), Ops, VT,
10561 M->getMemOperand());
10563 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10564 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10565 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10566 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10580 EVT VT = Op.getValueType();
10584 Op->getVTList(), Ops, VT,
10585 M->getMemOperand());
10587 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10588 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
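// BVH dual / BVH8 intersect-ray is lowered to a MIMG node whose address
// operands pack the node pointer, the ray extent together with the instance
// mask (merged into one 32-bit value below), the ray origin and direction,
// the offsets and the texture descriptor; subtargets without these
// instructions are rejected first.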
10590 SDValue NodePtr = M->getOperand(2);
10591 SDValue RayExtent = M->getOperand(3);
10592 SDValue InstanceMask = M->getOperand(4);
10593 SDValue RayOrigin = M->getOperand(5);
10594 SDValue RayDir = M->getOperand(6);
10596 SDValue TDescr = M->getOperand(8);
10601 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10606 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10607 const unsigned NumVDataDwords = 10;
10608 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10610 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10611 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10612 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10616 Ops.push_back(NodePtr);
10619 {DAG.getBitcast(MVT::i32, RayExtent),
10620 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10621 Ops.push_back(RayOrigin);
10622 Ops.push_back(RayDir);
10623 Ops.push_back(Offsets);
10624 Ops.push_back(TDescr);
10625 Ops.push_back(
M->getChain());
10628 MachineMemOperand *MemRef =
M->getMemOperand();
10632 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10634 SDValue NodePtr = M->getOperand(2);
10635 SDValue RayExtent = M->getOperand(3);
10636 SDValue RayOrigin = M->getOperand(4);
10637 SDValue RayDir = M->getOperand(5);
10638 SDValue RayInvDir = M->getOperand(6);
10639 SDValue TDescr = M->getOperand(7);
10646 if (!Subtarget->hasGFX10_AEncoding()) {
10656 const unsigned NumVDataDwords = 4;
10657 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10658 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10659 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10662 const unsigned BaseOpcodes[2][2] = {
10663 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10664 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10665 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10669 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10670 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10671 : AMDGPU::MIMGEncGfx10NSA,
10672 NumVDataDwords, NumVAddrDwords);
10676 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10677 : AMDGPU::MIMGEncGfx10Default,
10678 NumVDataDwords, NumVAddrDwords);
10684 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10687 if (Lanes[0].getValueSizeInBits() == 32) {
10688 for (
unsigned I = 0;
I < 3; ++
I)
10695 Ops.push_back(Lanes[2]);
10707 if (UseNSA && IsGFX11Plus) {
10708 Ops.push_back(NodePtr);
10710 Ops.push_back(RayOrigin);
10715 for (
unsigned I = 0;
I < 3; ++
I) {
10718 {DirLanes[I], InvDirLanes[I]})));
10722 Ops.push_back(RayDir);
10723 Ops.push_back(RayInvDir);
10730 Ops.push_back(NodePtr);
10733 packLanes(RayOrigin, true);
10734 packLanes(RayDir, true);
10735 packLanes(RayInvDir, false);
10740 if (NumVAddrDwords > 12) {
10742 Ops.append(16 -
Ops.size(), Undef);
10748 Ops.push_back(MergedOps);
10751 Ops.push_back(TDescr);
10753 Ops.push_back(
M->getChain());
10756 MachineMemOperand *MemRef =
M->getMemOperand();
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
    unsigned Opcode = 0;
    case Intrinsic::amdgcn_global_atomic_fmin_num:
    case Intrinsic::amdgcn_flat_atomic_fmin_num: {
      Opcode = ISD::ATOMIC_LOAD_FMIN;
    case Intrinsic::amdgcn_global_atomic_fmax_num:
    case Intrinsic::amdgcn_flat_atomic_fmax_num: {
      Opcode = ISD::ATOMIC_LOAD_FMAX;
    return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
                         Ops, M->getMemOperand());
  case Intrinsic::amdgcn_s_get_barrier_state:
  case Intrinsic::amdgcn_s_get_named_barrier_state: {
      if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
        BarID = (BarID >> 4) & 0x3F;
      Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
      Ops.push_back(Chain);
      Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
      if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
      Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
    EVT VT = Op->getValueType(0);
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
      return lowerImage(Op, ImageDimIntr, DAG, true);
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
  EVT VT = VTList.VTs[0];
  bool IsTFE = VTList.NumVTs == 3;
    unsigned NumOpDWords = NumValueDWords + 1;
    SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
    MachineMemOperand *OpDWordsMMO =
    SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
                                     OpDWordsVT, OpDWordsMMO, DAG);
        NumValueDWords == 1
  if (!Subtarget->hasDwordx3LoadStores() &&
      (VT == MVT::v3i32 || VT == MVT::v3f32)) {
    SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
                                 WidenedMemVT, WidenedMMO);
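// When the VT list carries a third type (IsTFE), the hardware returns an
// extra status dword, so the intrinsic is re-emitted with the result widened
// by one dword and the value/status halves are split back out afterwards.
// Likewise, v3 results are widened to v4 on targets without dwordx3
// load/store support and then narrowed again.  Illustrative sketch only:
//   EVT WidenedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
//                                    NumValueDWords + 1);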
                                             bool ImageStore) const {
  if (Subtarget->hasUnpackedD16VMem()) {
  if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
      for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
      if ((NumElements % 2) == 1) {
        unsigned I = Elts.size() / 2;
      if (NumElements == 3) {
      return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
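// handleD16VData in short: on targets with unpacked D16 memory ops each f16
// element is widened to occupy a full dword before the store, and on parts
// with the image-store D16 bug pairs of 16-bit elements are repacked into
// i32 lanes (with a lone trailing element zero-extended) so the data can be
// stored as a wider integer vector.  This is a summary of the elided lines,
// not additional behaviour.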
  unsigned IntrinsicID = Op.getConstantOperandVal(1);
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp_compr: {
    if (!Subtarget->hasCompressedExport()) {
          "intrinsic not supported on subtarget", DL.getDebugLoc()));
        DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
        DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
    unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
                                   M->getMemoryVT(), M->getMemOperand());
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
                                   M->getMemoryVT(), M->getMemOperand());
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
      return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
                                   M->getMemoryVT(), M->getMemOperand());
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
      VData = handleD16VData(VData, DAG);
    auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
    EVT VDataType = VData.getValueType().getScalarType();
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
                                   M->getMemoryVT(), M->getMemOperand());
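    // For i8/i16 payloads both store paths above bail out to
    // handleByteShortBufferStores, which swaps in the BUFFER_STORE_BYTE /
    // BUFFER_STORE_SHORT style node; everything else goes out as a
    // (possibly format) dword store through getMemIntrinsicNode.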
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    if (!Subtarget->hasVMemToLDSLoad())
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
    unsigned OpOffset = HasVIndex ? 1 : 0;
    SDValue VOffset = Op.getOperand(5 + OpOffset);
    unsigned Size = Op->getConstantOperandVal(4);
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
    if (HasVIndex && HasVOffset)
    else if (HasVIndex)
      Ops.push_back(Op.getOperand(5));
    else if (HasVOffset)
      Ops.push_back(VOffset);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    Ops.push_back(Rsrc);
    Ops.push_back(Op.getOperand(6 + OpOffset));
    Ops.push_back(Op.getOperand(7 + OpOffset));
    unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
    MachineMemOperand *LoadMMO = M->getMemOperand();
    MachinePointerInfo StorePtrI = LoadPtrI;
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds: {
    if (!Subtarget->hasVMemToLDSLoad())
    unsigned Size = Op->getConstantOperandVal(4);
      Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
      Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
      if (LHS->isDivergent())
          RHS.getOperand(0).getValueType() == MVT::i32) {
        VOffset = RHS.getOperand(0);
      Ops.push_back(Addr);
      Ops.push_back(VOffset);
    Ops.push_back(Op.getOperand(5));
    Ops.push_back(Op.getOperand(6));
    MachineMemOperand *LoadMMO = M->getMemOperand();
    LoadPtrI.Offset = Op->getConstantOperandVal(5);
    MachinePointerInfo StorePtrI = LoadPtrI;
  case Intrinsic::amdgcn_end_cf:
                       Op->getOperand(2), Chain),
  case Intrinsic::amdgcn_s_barrier_init:
  case Intrinsic::amdgcn_s_barrier_signal_var: {
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
                       ? AMDGPU::S_BARRIER_INIT_M0
                       : AMDGPU::S_BARRIER_SIGNAL_M0;
    constexpr unsigned ShAmt = 16;
    Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
  case Intrinsic::amdgcn_s_barrier_join: {
      Opc = AMDGPU::S_BARRIER_JOIN_IMM;
      unsigned BarID = (BarVal >> 4) & 0x3F;
      Ops.push_back(Chain);
      Opc = AMDGPU::S_BARRIER_JOIN_M0;
      Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
  case Intrinsic::amdgcn_s_prefetch_data: {
      return Op.getOperand(0);
  case Intrinsic::amdgcn_s_buffer_prefetch_data: {
        Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
                                   Op->getVTList(), Ops, M->getMemoryVT(),
                                   M->getMemOperand());
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
      return lowerImage(Op, ImageDimIntr, DAG, true);
std::pair<SDValue, SDValue>
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
    auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
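// splitBufferOffsets in short: a combined byte offset is split into the part
// that fits the MUBUF immediate field and an overflow that has to travel in
// a register.  A minimal worked example, assuming MaxImm = 4095:
//   combined  = 5000
//   Overflow  = 5000 & ~4095 = 4096
//   ImmOffset = 5000 - 4096  =  904   (encoded as the immediate)
//   Overflow (4096) becomes the register offset operand.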
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        Align Alignment) const {
  SDLoc DL(CombinedOffset);
    uint32_t Imm = C->getZExtValue();
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
    uint32_t SOffset, ImmOffset;
        TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
  SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
    return MaybePointer;
  SDValue NumRecords = Op->getOperand(3);
  auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
  std::optional<uint32_t> ConstStride = std::nullopt;
    ConstStride = ConstNode->getZExtValue();
  if (!ConstStride || *ConstStride != 0) {
      ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
                                  NewHighHalf, NumRecords, Flags);
  SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
                                             bool IsTFE) const {
    SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
    LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
  if (VDataType == MVT::f16 || VDataType == MVT::bf16)
  Ops[1] = BufferStoreExt;
                          M->getMemOperand());
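// handleByteShortBufferLoads/Stores above are the sub-dword escape hatch:
// loads come back as an i32 (plus a status dword when IsTFE) and are then
// truncated or bitcast to the requested i8/i16/f16 type, while stores first
// extend f16/bf16 payloads to i32 before emitting the byte/short buffer
// store node.  Summary of the surrounding elided lines, not new behaviour.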
11719 DAGCombinerInfo &DCI)
const {
11720 SelectionDAG &DAG = DCI.DAG;
11735 if ((MemVT.
isSimple() && !DCI.isAfterLegalizeDAG()) ||
11742 "unexpected vector extload");
11755 "unexpected fp extload");
11773 DCI.AddToWorklist(Cvt.
getNode());
11778 DCI.AddToWorklist(Cvt.
getNode());
11781 Cvt = DAG.
getNode(ISD::BITCAST, SL, VT, Cvt);
11789 if (
Info.isEntryFunction())
11790 return Info.getUserSGPRInfo().hasFlatScratchInit();
11798 EVT MemVT =
Load->getMemoryVT();
11799 MachineMemOperand *MMO =
Load->getMemOperand();
11811 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11839 assert(
Op.getValueType().getVectorElementType() == MVT::i32 &&
11840 "Custom lowering for non-i32 vectors hasn't been implemented.");
11843 unsigned AS =
Load->getAddressSpace();
11850 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
11854 !Subtarget->hasMultiDwordFlatScratchAddressing())
11864 Subtarget->getScalarizeGlobalBehavior() &&
Load->isSimple() &&
11867 Alignment >=
Align(4) && NumElements < 32) {
11869 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11881 if (NumElements > 4)
11884 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11894 switch (Subtarget->getMaxPrivateElementSize()) {
11900 if (NumElements > 2)
11905 if (NumElements > 4)
11908 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11917 auto Flags =
Load->getMemOperand()->getFlags();
11919 Load->getAlign(), Flags, &
Fast) &&
11928 MemVT, *
Load->getMemOperand())) {
11937 EVT VT =
Op.getValueType();
11964 return DAG.
getNode(ISD::BITCAST,
DL, VT, Res);
11974 EVT VT =
Op.getValueType();
11975 const SDNodeFlags
Flags =
Op->getFlags();
11977 bool AllowInaccurateRcp =
Flags.hasApproximateFuncs();
11983 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11986 if (CLHS->isExactlyValue(1.0)) {
12003 if (CLHS->isExactlyValue(-1.0)) {
12012 if (!AllowInaccurateRcp &&
12013 ((VT != MVT::f16 && VT != MVT::bf16) || !
Flags.hasAllowReciprocal()))
12027 EVT VT =
Op.getValueType();
12028 const SDNodeFlags
Flags =
Op->getFlags();
12030 bool AllowInaccurateDiv =
Flags.hasApproximateFuncs();
12031 if (!AllowInaccurateDiv)
12052 return DAG.
getNode(Opcode, SL, VT,
A,
B, Flags);
12066 return DAG.
getNode(Opcode, SL, VTList,
12075 return DAG.
getNode(Opcode, SL, VT, {
A,
B,
C}, Flags);
12089 return DAG.
getNode(Opcode, SL, VTList,
12095 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
12096 return FastLowered;
12099 EVT VT =
Op.getValueType();
12106 if (VT == MVT::bf16) {
12129 unsigned FMADOpCode =
12131 SDValue NegRHSExt = DAG.
getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12136 SDValue Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12138 Quot = DAG.
getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot,
Op->getFlags());
12139 Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12145 Tmp = DAG.
getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12155 SDNodeFlags
Flags =
Op->getFlags();
12162 const APFloat K0Val(0x1p+96f);
12165 const APFloat K1Val(0x1p-32f);
12192 assert(ST->hasDenormModeInst() &&
"Requires S_DENORM_MODE");
12193 uint32_t DPDenormModeDefault =
Info->getMode().fpDenormModeDPValue();
12194 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12199 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
12200 return FastLowered;
12206 SDNodeFlags
Flags =
Op->getFlags();
12207 Flags.setNoFPExcept(
true);
12215 SDVTList ScaleVT = DAG.
getVTList(MVT::f32, MVT::i1);
12226 DAG.
getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12228 using namespace AMDGPU::Hwreg;
12229 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12233 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
12234 const DenormalMode DenormMode =
Info->getMode().FP32Denormals;
12237 const bool HasDynamicDenormals =
12243 if (!PreservesDenormals) {
12248 SDVTList BindParamVTs = DAG.
getVTList(MVT::Other, MVT::Glue);
12251 if (HasDynamicDenormals) {
12255 SavedDenormMode =
SDValue(GetReg, 0);
12261 SDNode *EnableDenorm;
12262 if (Subtarget->hasDenormModeInst()) {
12263 const SDValue EnableDenormValue =
12270 const SDValue EnableDenormValue =
12272 EnableDenorm = DAG.
getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12273 {EnableDenormValue,
BitField, Glue});
12283 ApproxRcp, One, NegDivScale0, Flags);
12286 ApproxRcp, Fma0, Flags);
12292 NumeratorScaled,
Mul, Flags);
12298 NumeratorScaled, Fma3, Flags);
  if (!PreservesDenormals) {
    SDNode *DisableDenorm;
    if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
      SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      assert(HasDynamicDenormals == (bool)SavedDenormMode);
      const SDValue DisableDenormValue =
          HasDynamicDenormals
          AMDGPU::S_SETREG_B32, SL, MVT::Other,
                        {Fma4, Fma1, Fma3, Scale}, Flags);
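  // LowerFDIV32 outline: when FP32 denormals are flushed, denormal support is
  // temporarily forced on around the DIV_SCALE/DIV_FMAS core (via
  // S_DENORM_MODE or S_SETREG of the MODE register, restoring a dynamically
  // saved mode when needed), because the intermediate refinement terms rely
  // on denormal results.  The visible Fma0..Fma4 values are the usual
  // Newton-Raphson steps, r = 1/b approx; e = fma(-b, r, 1); r = fma(e, r, r);
  // finished off by DIV_FMAS and DIV_FIXUP.  This is a summary comment, not
  // additional lowering.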
12336 if (
SDValue FastLowered = lowerFastUnsafeFDIV64(
Op, DAG))
12337 return FastLowered;
12345 SDVTList ScaleVT = DAG.
getVTList(MVT::f64, MVT::i1);
12349 SDValue NegDivScale0 = DAG.
getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12369 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12378 SDValue Scale0BC = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12379 SDValue Scale1BC = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12405 EVT VT =
Op.getValueType();
12407 if (VT == MVT::f32)
12408 return LowerFDIV32(
Op, DAG);
12410 if (VT == MVT::f64)
12411 return LowerFDIV64(
Op, DAG);
12413 if (VT == MVT::f16 || VT == MVT::bf16)
12414 return LowerFDIV16(
Op, DAG);
12423 EVT ResultExpVT =
Op->getValueType(1);
12424 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12434 if (Subtarget->hasFractBug()) {
12452 EVT VT =
Store->getMemoryVT();
12454 if (VT == MVT::i1) {
12458 Store->getBasePtr(), MVT::i1,
Store->getMemOperand());
12462 Store->getValue().getValueType().getScalarType() == MVT::i32);
12464 unsigned AS =
Store->getAddressSpace();
12472 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
12476 !Subtarget->hasMultiDwordFlatScratchAddressing())
12483 if (NumElements > 4)
12486 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12490 VT, *
Store->getMemOperand()))
12496 switch (Subtarget->getMaxPrivateElementSize()) {
12500 if (NumElements > 2)
12504 if (NumElements > 4 ||
12505 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12513 auto Flags =
Store->getMemOperand()->getFlags();
12532 assert(!Subtarget->has16BitInsts());
12533 SDNodeFlags
Flags =
Op->getFlags();
12535 DAG.
getNode(ISD::FP_EXTEND, SL, MVT::f32,
Op.getOperand(0), Flags);
12547 SDNodeFlags
Flags =
Op->getFlags();
12548 MVT VT =
Op.getValueType().getSimpleVT();
12578 SDValue SqrtSNextDown = DAG.
getNode(ISD::BITCAST,
DL, VT, SqrtSNextDownInt);
12581 DAG.
getNode(ISD::FNEG,
DL, VT, SqrtSNextDown, Flags);
12590 SDValue NegSqrtSNextUp = DAG.
getNode(ISD::FNEG,
DL, VT, SqrtSNextUp, Flags);
12656 SDNodeFlags
Flags =
Op->getFlags();
12702 SqrtRet = DAG.
getNode(ISD::FLDEXP,
DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12719 EVT VT =
Op.getValueType();
12729 if (Subtarget->hasTrigReducedRange()) {
12736 switch (
Op.getOpcode()) {
12763 EVT VT =
Op.getValueType();
12771 Op->getVTList(),
Ops, VT,
12780SITargetLowering::performUCharToFloatCombine(
SDNode *
N,
12781 DAGCombinerInfo &DCI)
const {
12782 EVT VT =
N->getValueType(0);
12784 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12787 SelectionDAG &DAG = DCI.DAG;
12791 EVT SrcVT = Src.getValueType();
12797 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12800 DCI.AddToWorklist(Cvt.
getNode());
12803 if (ScalarVT != MVT::f32) {
12815 DAGCombinerInfo &DCI)
const {
12822 if (SignOp.
getOpcode() == ISD::FP_EXTEND ||
12826 SelectionDAG &DAG = DCI.DAG;
12845 for (
unsigned I = 0;
I != NumElts; ++
I) {
12869 if (NewElts.
size() == 1)
12891 for (
unsigned I = 0;
I != NumElts; ++
I) {
12926SDValue SITargetLowering::performSHLPtrCombine(
SDNode *
N,
unsigned AddrSpace,
12928 DAGCombinerInfo &DCI)
const {
12946 SelectionDAG &DAG = DCI.DAG;
12959 AM.BaseOffs =
Offset.getSExtValue();
12964 EVT VT =
N->getValueType(0);
12970 Flags.setNoUnsignedWrap(
12971 N->getFlags().hasNoUnsignedWrap() &&
12981 switch (
N->getOpcode()) {
12992 DAGCombinerInfo &DCI)
const {
12993 SelectionDAG &DAG = DCI.DAG;
13000 SDValue NewPtr = performSHLPtrCombine(
Ptr.getNode(),
N->getAddressSpace(),
13001 N->getMemoryVT(), DCI);
13005 NewOps[PtrIdx] = NewPtr;
13014 return (
Opc ==
ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13015 (
Opc ==
ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13024SDValue SITargetLowering::splitBinaryBitConstantOp(
13028 uint32_t ValLo =
Lo_32(Val);
13029 uint32_t ValHi =
Hi_32(Val);
13036 if (Subtarget->has64BitLiterals() && CRHS->
hasOneUse() &&
13050 if (V.getValueType() != MVT::i1)
13052 switch (V.getOpcode()) {
13069 return V.getResNo() == 1;
13071 unsigned IntrinsicID = V.getConstantOperandVal(0);
13072 switch (IntrinsicID) {
13073 case Intrinsic::amdgcn_is_shared:
13074 case Intrinsic::amdgcn_is_private:
  if (!(C & 0x000000ff))
    ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00))
    ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000))
    ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000))
    ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask;
  if ((NonZeroByteMask & C) != NonZeroByteMask)
  assert(V.getValueSizeInBits() == 32);
  if (V.getNumOperands() != 2)
  switch (V.getOpcode()) {
    return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    return (0x03020100 & ~ConstMask) | ConstMask;
    return uint32_t((0x030201000c0c0c0cull << C) >> 32);
    return uint32_t(0x0c0c0c0c03020100ull >> C);
13154 DAGCombinerInfo &DCI)
const {
13155 if (DCI.isBeforeLegalize())
13158 SelectionDAG &DAG = DCI.DAG;
13159 EVT VT =
N->getValueType(0);
13164 if (VT == MVT::i64 && CRHS) {
13166 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::AND,
LHS, CRHS))
13170 if (CRHS && VT == MVT::i32) {
13180 unsigned Shift = CShift->getZExtValue();
13182 unsigned Offset = NB + Shift;
13183 if ((
Offset & (Bits - 1)) == 0) {
13207 Sel = (
LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13222 if (
Y.getOpcode() != ISD::FABS ||
Y.getOperand(0) !=
X ||
13227 if (
X !=
LHS.getOperand(1))
13231 const ConstantFPSDNode *C1 =
13265 (
RHS.getOperand(0) ==
LHS.getOperand(0) &&
13266 LHS.getOperand(0) ==
LHS.getOperand(1))) {
13268 unsigned NewMask = LCC ==
ISD::SETO ?
Mask->getZExtValue() & ~OrdMask
13269 :
Mask->getZExtValue() & OrdMask;
13290 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13293 if (LHSMask != ~0u && RHSMask != ~0u) {
13296 if (LHSMask > RHSMask) {
13303 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13304 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13307 if (!(LHSUsedLanes & RHSUsedLanes) &&
13310 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13316 uint32_t
Mask = LHSMask & RHSMask;
13317 for (
unsigned I = 0;
I < 32;
I += 8) {
13318 uint32_t ByteSel = 0xff <<
I;
13319 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13320 Mask &= (0x0c <<
I) & 0xffffffff;
13325 uint32_t Sel =
Mask | (LHSUsedLanes & 0x04040404);
13378static const std::optional<ByteProvider<SDValue>>
13380 unsigned Depth = 0) {
13383 return std::nullopt;
13385 if (
Op.getValueSizeInBits() < 8)
13386 return std::nullopt;
13388 if (
Op.getValueType().isVector())
13391 switch (
Op->getOpcode()) {
13403 NarrowVT = VTSign->getVT();
13406 return std::nullopt;
13409 if (SrcIndex >= NarrowByteWidth)
13410 return std::nullopt;
13418 return std::nullopt;
13420 uint64_t BitShift = ShiftOp->getZExtValue();
13422 if (BitShift % 8 != 0)
13423 return std::nullopt;
13425 SrcIndex += BitShift / 8;
13443static const std::optional<ByteProvider<SDValue>>
13445 unsigned StartingIndex = 0) {
13449 return std::nullopt;
13451 unsigned BitWidth =
Op.getScalarValueSizeInBits();
13453 return std::nullopt;
13455 return std::nullopt;
13457 bool IsVec =
Op.getValueType().isVector();
13458 switch (
Op.getOpcode()) {
13461 return std::nullopt;
13466 return std::nullopt;
13470 return std::nullopt;
13473 if (!
LHS->isConstantZero() && !
RHS->isConstantZero())
13474 return std::nullopt;
13475 if (!
LHS ||
LHS->isConstantZero())
13477 if (!
RHS ||
RHS->isConstantZero())
13479 return std::nullopt;
13484 return std::nullopt;
13488 return std::nullopt;
13490 uint32_t BitMask = BitMaskOp->getZExtValue();
13492 uint32_t IndexMask = 0xFF << (Index * 8);
13494 if ((IndexMask & BitMask) != IndexMask) {
13497 if (IndexMask & BitMask)
13498 return std::nullopt;
13507 return std::nullopt;
13511 if (!ShiftOp ||
Op.getValueType().isVector())
13512 return std::nullopt;
13514 uint64_t BitsProvided =
Op.getValueSizeInBits();
13515 if (BitsProvided % 8 != 0)
13516 return std::nullopt;
13518 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13520 return std::nullopt;
13522 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13523 uint64_t ByteShift = BitShift / 8;
13525 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13526 uint64_t BytesProvided = BitsProvided / 8;
13527 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13528 NewIndex %= BytesProvided;
13535 return std::nullopt;
13539 return std::nullopt;
13541 uint64_t BitShift = ShiftOp->getZExtValue();
13543 return std::nullopt;
13545 auto BitsProvided =
Op.getScalarValueSizeInBits();
13546 if (BitsProvided % 8 != 0)
13547 return std::nullopt;
13549 uint64_t BytesProvided = BitsProvided / 8;
13550 uint64_t ByteShift = BitShift / 8;
13555 return BytesProvided - ByteShift > Index
13563 return std::nullopt;
13567 return std::nullopt;
13569 uint64_t BitShift = ShiftOp->getZExtValue();
13570 if (BitShift % 8 != 0)
13571 return std::nullopt;
13572 uint64_t ByteShift = BitShift / 8;
13578 return Index < ByteShift
13581 Depth + 1, StartingIndex);
13590 return std::nullopt;
13598 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13600 if (NarrowBitWidth % 8 != 0)
13601 return std::nullopt;
13602 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13604 if (Index >= NarrowByteWidth)
13606 ? std::optional<ByteProvider<SDValue>>(
13614 return std::nullopt;
13618 if (NarrowByteWidth >= Index) {
13623 return std::nullopt;
13630 return std::nullopt;
13636 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13637 if (NarrowBitWidth % 8 != 0)
13638 return std::nullopt;
13639 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13644 if (Index >= NarrowByteWidth) {
13646 ? std::optional<ByteProvider<SDValue>>(
13651 if (NarrowByteWidth > Index) {
13655 return std::nullopt;
13660 return std::nullopt;
13663 Depth + 1, StartingIndex);
13669 return std::nullopt;
13670 auto VecIdx = IdxOp->getZExtValue();
13671 auto ScalarSize =
Op.getScalarValueSizeInBits();
13672 if (ScalarSize < 32)
13673 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13675 StartingIndex, Index);
13680 return std::nullopt;
13684 return std::nullopt;
13687 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13688 if (IdxMask > 0x07 && IdxMask != 0x0c)
13689 return std::nullopt;
13691 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13692 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13694 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
13700 return std::nullopt;
13715 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
13722 auto MemVT = L->getMemoryVT();
13725 return L->getMemoryVT().getSizeInBits() == 16;
13735 int Low8 = Mask & 0xff;
13736 int Hi8 = (Mask & 0xff00) >> 8;
13738 assert(Low8 < 8 && Hi8 < 8);
13740 bool IsConsecutive = (Hi8 - Low8 == 1);
13745 bool Is16Aligned = !(Low8 % 2);
13747 return IsConsecutive && Is16Aligned;
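// The check above answers one question: do the two byte selectors in this
// half of the perm mask address a single aligned 16-bit half of a source?
// That requires consecutive byte indexes (Hi8 == Low8 + 1) with the low one
// at an even position, e.g. 0x0504 or 0x0100 qualifies while 0x0201 does not.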
13755 int Low16 = PermMask & 0xffff;
13756 int Hi16 = (PermMask & 0xffff0000) >> 16;
13766 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13768 if (!OtherOpIs16Bit)
13776 unsigned DWordOffset) {
13781 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13786 if (Src.getValueType().isVector()) {
13787 auto ScalarTySize = Src.getScalarValueSizeInBits();
13788 auto ScalarTy = Src.getValueType().getScalarType();
13789 if (ScalarTySize == 32) {
13793 if (ScalarTySize > 32) {
13796 DAG.
getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13797 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13804 assert(ScalarTySize < 32);
13805 auto NumElements =
TypeSize / ScalarTySize;
13806 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13807 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13808 auto NumElementsIn32 = 32 / ScalarTySize;
13809 auto NumAvailElements = DWordOffset < Trunc32Elements
13811 : NumElements - NormalizedTrunc;
13824 auto ShiftVal = 32 * DWordOffset;
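// The dword-extraction helper whose body ends above ultimately reduces to
// "shift the source right by 32*DWordOffset bits and truncate to i32"; the
// vector cases only normalize how that dword is addressed.  A minimal
// standalone sketch of the scalar case (the name getDWord is illustrative):
//   SDValue getDWord(SelectionDAG &DAG, const SDLoc &SL, SDValue Src,
//                    unsigned DWordOffset) {
//     EVT WideVT = MVT::getIntegerVT(Src.getValueSizeInBits());
//     SDValue WideSrc = DAG.getNode(ISD::BITCAST, SL, WideVT, Src);
//     SDValue Shifted = DAG.getNode(ISD::SRL, SL, WideVT, WideSrc,
//                                   DAG.getConstant(32 * DWordOffset, SL, MVT::i32));
//     return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Shifted);
//   }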
13832 [[maybe_unused]]
EVT VT =
N->getValueType(0);
13837 for (
int i = 0; i < 4; i++) {
13839 std::optional<ByteProvider<SDValue>>
P =
13842 if (!
P ||
P->isConstantZero())
13847 if (PermNodes.
size() != 4)
13850 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13851 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13853 for (
size_t i = 0; i < PermNodes.
size(); i++) {
13854 auto PermOp = PermNodes[i];
13857 int SrcByteAdjust = 4;
13861 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13862 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13864 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13865 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13869 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13870 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13873 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13875 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13878 SDValue Op = *PermNodes[FirstSrc.first].Src;
13880 assert(
Op.getValueSizeInBits() == 32);
13884 int Low16 = PermMask & 0xffff;
13885 int Hi16 = (PermMask & 0xffff0000) >> 16;
13887 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13888 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13891 if (WellFormedLow && WellFormedHi)
13895 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src :
Op;
13904 assert(
Op.getValueType().isByteSized() &&
13922 DAGCombinerInfo &DCI)
const {
13923 SelectionDAG &DAG = DCI.DAG;
13927 EVT VT =
N->getValueType(0);
13928 if (VT == MVT::i1) {
13933 if (Src !=
RHS.getOperand(0))
13938 if (!CLHS || !CRHS)
13942 static const uint32_t MaxMask = 0x3ff;
13962 Sel |=
LHS.getConstantOperandVal(2);
13971 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13975 auto usesCombinedOperand = [](SDNode *OrUse) {
13977 if (OrUse->getOpcode() != ISD::BITCAST ||
13978 !OrUse->getValueType(0).isVector())
13982 for (
auto *VUser : OrUse->users()) {
13983 if (!VUser->getValueType(0).isVector())
13990 if (VUser->getOpcode() == VectorwiseOp)
13996 if (!
any_of(
N->users(), usesCombinedOperand))
14002 if (LHSMask != ~0u && RHSMask != ~0u) {
14005 if (LHSMask > RHSMask) {
14012 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14013 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14016 if (!(LHSUsedLanes & RHSUsedLanes) &&
14019 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14021 LHSMask &= ~RHSUsedLanes;
14022 RHSMask &= ~LHSUsedLanes;
14024 LHSMask |= LHSUsedLanes & 0x04040404;
14026 uint32_t Sel = LHSMask | RHSMask;
14034 if (LHSMask == ~0u || RHSMask == ~0u) {
14075 return IdentitySrc;
14081 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14096 if (SrcVT == MVT::i32) {
14101 DCI.AddToWorklist(LowOr.
getNode());
14102 DCI.AddToWorklist(HiBits.getNode());
14106 return DAG.
getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14113 N->getOperand(0), CRHS))
14121 DAGCombinerInfo &DCI)
const {
14122 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
14129 SelectionDAG &DAG = DCI.DAG;
14131 EVT VT =
N->getValueType(0);
14132 if (CRHS && VT == MVT::i64) {
14134 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::XOR,
LHS, CRHS))
14141 unsigned Opc =
LHS.getOpcode();
14165 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32,
LHS->getOperand(1));
14167 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32,
LHS->getOperand(2));
14171 LHS->getOperand(0), FNegLHS, FNegRHS);
14172 return DAG.
getNode(ISD::BITCAST,
DL, VT, NewSelect);
14180 DAGCombinerInfo &DCI)
const {
14181 if (!Subtarget->has16BitInsts() ||
14185 EVT VT =
N->getValueType(0);
14186 if (VT != MVT::i32)
14190 if (Src.getValueType() != MVT::i16)
14197SITargetLowering::performSignExtendInRegCombine(
SDNode *
N,
14198 DAGCombinerInfo &DCI)
const {
14205 VTSign->getVT() == MVT::i8) ||
14207 VTSign->getVT() == MVT::i16))) {
14208 assert(Subtarget->hasScalarSubwordLoads() &&
14209 "s_buffer_load_{u8, i8} are supported "
14210 "in GFX12 (or newer) architectures.");
14211 EVT VT = Src.getValueType();
14216 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14223 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14224 Opc,
DL, ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
14229 VTSign->getVT() == MVT::i8) ||
14231 VTSign->getVT() == MVT::i16)) &&
14240 Src.getOperand(6), Src.getOperand(7)};
14243 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14247 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14248 Opc, SDLoc(
N), ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
14249 return DCI.DAG.getMergeValues(
14250 {BufferLoadSignExt, BufferLoadSignExt.
getValue(1)}, SDLoc(
N));
14256 DAGCombinerInfo &DCI)
const {
14257 SelectionDAG &DAG = DCI.DAG;
14264 if (
N->getOperand(0).isUndef())
14271 DAGCombinerInfo &DCI)
const {
14272 EVT VT =
N->getValueType(0);
14287 if ((VT == MVT::f16 && N0.
getOpcode() == ISD::FSQRT) &&
14297 unsigned MaxDepth)
const {
14298 unsigned Opcode =
Op.getOpcode();
14303 const auto &
F = CFP->getValueAPF();
14304 if (
F.isNaN() &&
F.isSignaling())
14306 if (!
F.isDenormal())
14332 case ISD::FP_EXTEND:
14333 case ISD::FP16_TO_FP:
14334 case ISD::FP_TO_FP16:
14335 case ISD::BF16_TO_FP:
14336 case ISD::FP_TO_BF16:
14369 if (
Op.getValueType() == MVT::i32) {
14375 if (RHS->getZExtValue() == 0xffff0000) {
14385 return Op.getValueType().getScalarType() != MVT::f16;
14389 case ISD::FMINNUM_IEEE:
14390 case ISD::FMAXNUM_IEEE:
14391 case ISD::FMINIMUM:
14392 case ISD::FMAXIMUM:
14393 case ISD::FMINIMUMNUM:
14394 case ISD::FMAXIMUMNUM:
14406 if (Subtarget->supportsMinMaxDenormModes() ||
14416 for (
unsigned I = 0, E =
Op.getNumOperands();
I != E; ++
I) {
14428 for (
unsigned i = 0, e =
Op.getNumOperands(); i != e; ++i) {
14455 if (
Op.getValueType() == MVT::i16) {
14458 TruncSrc.
getOpcode() == ISD::BITCAST &&
14466 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
14468 switch (IntrinsicID) {
14469 case Intrinsic::amdgcn_cvt_pkrtz:
14470 case Intrinsic::amdgcn_cubeid:
14471 case Intrinsic::amdgcn_frexp_mant:
14472 case Intrinsic::amdgcn_fdot2:
14473 case Intrinsic::amdgcn_rcp:
14474 case Intrinsic::amdgcn_rsq:
14475 case Intrinsic::amdgcn_rsq_clamp:
14476 case Intrinsic::amdgcn_rcp_legacy:
14477 case Intrinsic::amdgcn_rsq_legacy:
14478 case Intrinsic::amdgcn_trig_preop:
14479 case Intrinsic::amdgcn_tanh:
14480 case Intrinsic::amdgcn_log:
14481 case Intrinsic::amdgcn_exp2:
14482 case Intrinsic::amdgcn_sqrt:
14500 unsigned MaxDepth)
const {
14503 unsigned Opcode =
MI->getOpcode();
14505 if (Opcode == AMDGPU::G_FCANONICALIZE)
14508 std::optional<FPValueAndVReg> FCR;
14511 if (FCR->Value.isSignaling())
14513 if (!FCR->Value.isDenormal())
14524 case AMDGPU::G_FADD:
14525 case AMDGPU::G_FSUB:
14526 case AMDGPU::G_FMUL:
14527 case AMDGPU::G_FCEIL:
14528 case AMDGPU::G_FFLOOR:
14529 case AMDGPU::G_FRINT:
14530 case AMDGPU::G_FNEARBYINT:
14531 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14532 case AMDGPU::G_INTRINSIC_TRUNC:
14533 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14534 case AMDGPU::G_FMA:
14535 case AMDGPU::G_FMAD:
14536 case AMDGPU::G_FSQRT:
14537 case AMDGPU::G_FDIV:
14538 case AMDGPU::G_FREM:
14539 case AMDGPU::G_FPOW:
14540 case AMDGPU::G_FPEXT:
14541 case AMDGPU::G_FLOG:
14542 case AMDGPU::G_FLOG2:
14543 case AMDGPU::G_FLOG10:
14544 case AMDGPU::G_FPTRUNC:
14545 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14546 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14547 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14548 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14549 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14551 case AMDGPU::G_FNEG:
14552 case AMDGPU::G_FABS:
14553 case AMDGPU::G_FCOPYSIGN:
14555 case AMDGPU::G_FMINNUM:
14556 case AMDGPU::G_FMAXNUM:
14557 case AMDGPU::G_FMINNUM_IEEE:
14558 case AMDGPU::G_FMAXNUM_IEEE:
14559 case AMDGPU::G_FMINIMUM:
14560 case AMDGPU::G_FMAXIMUM:
14561 case AMDGPU::G_FMINIMUMNUM:
14562 case AMDGPU::G_FMAXIMUMNUM: {
14563 if (Subtarget->supportsMinMaxDenormModes() ||
14570 case AMDGPU::G_BUILD_VECTOR:
14575 case AMDGPU::G_INTRINSIC:
14576 case AMDGPU::G_INTRINSIC_CONVERGENT:
14578 case Intrinsic::amdgcn_fmul_legacy:
14579 case Intrinsic::amdgcn_fmad_ftz:
14580 case Intrinsic::amdgcn_sqrt:
14581 case Intrinsic::amdgcn_fmed3:
14582 case Intrinsic::amdgcn_sin:
14583 case Intrinsic::amdgcn_cos:
14584 case Intrinsic::amdgcn_log:
14585 case Intrinsic::amdgcn_exp2:
14586 case Intrinsic::amdgcn_log_clamp:
14587 case Intrinsic::amdgcn_rcp:
14588 case Intrinsic::amdgcn_rcp_legacy:
14589 case Intrinsic::amdgcn_rsq:
14590 case Intrinsic::amdgcn_rsq_clamp:
14591 case Intrinsic::amdgcn_rsq_legacy:
14592 case Intrinsic::amdgcn_div_scale:
14593 case Intrinsic::amdgcn_div_fmas:
14594 case Intrinsic::amdgcn_div_fixup:
14595 case Intrinsic::amdgcn_fract:
14596 case Intrinsic::amdgcn_cvt_pkrtz:
14597 case Intrinsic::amdgcn_cubeid:
14598 case Intrinsic::amdgcn_cubema:
14599 case Intrinsic::amdgcn_cubesc:
14600 case Intrinsic::amdgcn_cubetc:
14601 case Intrinsic::amdgcn_frexp_mant:
14602 case Intrinsic::amdgcn_fdot2:
14603 case Intrinsic::amdgcn_trig_preop:
14604 case Intrinsic::amdgcn_tanh:
14623 if (
C.isDenormal()) {
14637 if (
C.isSignaling()) {
14660SITargetLowering::performFCanonicalizeCombine(
SDNode *
N,
14661 DAGCombinerInfo &DCI)
const {
14662 SelectionDAG &DAG = DCI.DAG;
14664 EVT VT =
N->getValueType(0);
14673 EVT VT =
N->getValueType(0);
14674 return getCanonicalConstantFP(DAG, SDLoc(
N), VT, CFP->getValueAPF());
14690 EVT EltVT =
Lo.getValueType();
14693 for (
unsigned I = 0;
I != 2; ++
I) {
14697 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14698 }
else if (
Op.isUndef()) {
14732 case ISD::FMAXNUM_IEEE:
14733 case ISD::FMAXIMUMNUM:
14735 case ISD::FMAXIMUM:
14742 case ISD::FMINNUM_IEEE:
14743 case ISD::FMINIMUMNUM:
14745 case ISD::FMINIMUM:
14771 if (!MinK || !MaxK)
14784 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14785 return DAG.
getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14844 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
14850 if (
Info->getMode().DX10Clamp) {
14859 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14887 case ISD::FMINNUM_IEEE:
14888 case ISD::FMAXNUM_IEEE:
14889 case ISD::FMINIMUMNUM:
14890 case ISD::FMAXIMUMNUM:
14893 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.
hasMin3Max3_16()) ||
14895 case ISD::FMINIMUM:
14896 case ISD::FMAXIMUM:
14904 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.
hasMin3Max3_16());
14913 DAGCombinerInfo &DCI)
const {
14914 SelectionDAG &DAG = DCI.DAG;
14946 if (
SDValue Med3 = performIntMed3ImmCombine(
14951 if (
SDValue Med3 = performIntMed3ImmCombine(
14957 if (
SDValue Med3 = performIntMed3ImmCombine(
14962 if (
SDValue Med3 = performIntMed3ImmCombine(
14972 if (((
Opc == ISD::FMINNUM && Op0.
getOpcode() == ISD::FMAXNUM) ||
14973 (
Opc == ISD::FMINNUM_IEEE && Op0.
getOpcode() == ISD::FMAXNUM_IEEE) ||
14974 (
Opc == ISD::FMINIMUMNUM && Op0.
getOpcode() == ISD::FMAXIMUMNUM) ||
14977 (VT == MVT::f32 || VT == MVT::f64 ||
14978 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14979 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14980 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14981 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14983 if (
SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(
N), Op0, Op1))
14990 const SDNodeFlags
Flags =
N->getFlags();
14991 if ((
Opc == ISD::FMINIMUM ||
Opc == ISD::FMAXIMUM) &&
14992 !Subtarget->hasIEEEMinimumMaximumInsts() &&
Flags.hasNoNaNs()) {
14994 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
14995 return DAG.
getNode(NewOpc, SDLoc(
N), VT, Op0, Op1, Flags);
15005 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15006 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15015 DAGCombinerInfo &DCI)
const {
15016 EVT VT =
N->getValueType(0);
15020 SelectionDAG &DAG = DCI.DAG;
15035 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
15039 if (
Info->getMode().DX10Clamp) {
15059 DAGCombinerInfo &DCI)
const {
15063 return DCI.DAG.getUNDEF(
N->getValueType(0));
                                  bool IsDivergentIdx,
  unsigned VecSize = EltSize * NumElem;
  if (VecSize <= 64 && EltSize < 32)
  if (IsDivergentIdx)
  unsigned NumInsts = NumElem +
                      ((EltSize + 31) / 32) * NumElem;
  if (Subtarget->useVGPRIndexMode())
    return NumInsts <= 16;
  if (Subtarget->hasMovrel())
    return NumInsts <= 15;
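// Cost model behind the thresholds above: expanding a dynamic extract/insert
// into a compare-and-select sequence costs roughly one compare per element
// plus one 32-bit move per dword of each element,
//   NumInsts = NumElem + ceil(EltSize / 32) * NumElem,
// and that expansion is only chosen while it stays within about 16
// instructions (VGPR index mode) or 15 (movrel).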
15109 SDValue Idx =
N->getOperand(
N->getNumOperands() - 1);
15124SITargetLowering::performExtractVectorEltCombine(
SDNode *
N,
15125 DAGCombinerInfo &DCI)
const {
15131 EVT ResVT =
N->getValueType(0);
15155 if (!
C ||
C->getZExtValue() != 0x1f)
15171 if (Vec.
hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15190 case ISD::FMAXNUM_IEEE:
15191 case ISD::FMINNUM_IEEE:
15192 case ISD::FMAXIMUM:
15193 case ISD::FMINIMUM: {
15199 DCI.AddToWorklist(Elt0.
getNode());
15200 DCI.AddToWorklist(Elt1.
getNode());
15222 if (!DCI.isBeforeLegalize())
15230 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15233 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15234 unsigned EltIdx = BitIndex / 32;
15235 unsigned LeftoverBitIdx = BitIndex % 32;
15239 DCI.AddToWorklist(Cast.
getNode());
15243 DCI.AddToWorklist(Elt.
getNode());
15246 DCI.AddToWorklist(Srl.
getNode());
15250 DCI.AddToWorklist(Trunc.
getNode());
15252 if (VecEltVT == ResVT) {
15253 return DAG.
getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15264SITargetLowering::performInsertVectorEltCombine(
SDNode *
N,
15265 DAGCombinerInfo &DCI)
const {
15276 SelectionDAG &DAG = DCI.DAG;
15295 if (Src.getOpcode() == ISD::FP_EXTEND &&
15296 Src.getOperand(0).getValueType() == MVT::f16) {
15297 return Src.getOperand(0);
15301 APFloat Val = CFP->getValueAPF();
15302 bool LosesInfo =
true;
15312 DAGCombinerInfo &DCI)
const {
15313 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15314 "combine only useful on gfx8");
15316 SDValue TruncSrc =
N->getOperand(0);
15317 EVT VT =
N->getValueType(0);
15318 if (VT != MVT::f16)
15325 SelectionDAG &DAG = DCI.DAG;
15353 return DAG.
getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15356unsigned SITargetLowering::getFusedOpcode(
const SelectionDAG &DAG,
15358 const SDNode *N1)
const {
15363 if (((VT == MVT::f32 &&
15365 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15385 EVT VT =
N->getValueType(0);
15386 if (VT != MVT::i32 && VT != MVT::i64)
15392 unsigned Opc =
N->getOpcode();
15447 if (!Const ||
Hi_32(Const->getZExtValue()) !=
uint32_t(-1))
15466 DAGCombinerInfo &DCI)
const {
15469 SelectionDAG &DAG = DCI.DAG;
15470 EVT VT =
N->getValueType(0);
15480 if (!
N->isDivergent() && Subtarget->hasSMulHi())
15484 if (NumBits <= 32 || NumBits > 64)
15495 if (!Subtarget->hasFullRate64Ops()) {
15496 unsigned NumUsers = 0;
15497 for (SDNode *User :
LHS->
users()) {
15500 if (!
User->isAnyAdd())
15524 bool MulSignedLo =
false;
15525 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15534 if (VT != MVT::i64) {
15557 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15559 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15560 auto [AccumLo, AccumHi] = DAG.
SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15562 if (!MulLHSUnsigned32) {
15569 if (!MulRHSUnsigned32) {
15580 if (VT != MVT::i64)
15586SITargetLowering::foldAddSub64WithZeroLowBitsTo32(
SDNode *
N,
15587 DAGCombinerInfo &DCI)
const {
15597 SelectionDAG &DAG = DCI.DAG;
15612 unsigned Opcode =
N->getOpcode();
15613 if (Opcode == ISD::PTRADD)
15616 DAG.
getNode(Opcode, SL, MVT::i32,
Hi, ConstHi32,
N->getFlags());
static const std::optional<ByteProvider<SDValue>>
  if (!Byte0 || Byte0->isConstantZero()) {
    return std::nullopt;
  if (Byte1 && !Byte1->isConstantZero()) {
    return std::nullopt;
  unsigned FirstCs = First & 0x0c0c0c0c;
  unsigned SecondCs = Second & 0x0c0c0c0c;
  unsigned FirstNoCs = First & ~0x0c0c0c0c;
  unsigned SecondNoCs = Second & ~0x0c0c0c0c;
  assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
  assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
  assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
  assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
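// Combination rule in one line: a result byte stays at the 0x0c "known
// zero" marker only when both masks have 0x0c there; wherever one mask
// carries a real selector it survives, including its source-select bit.
// For example, 0x0c0c0100 combined with 0x07060c0c yields 0x07060100.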
15675 for (
int BPI = 0; BPI < 2; BPI++) {
15678 BPP = {Src1, Src0};
15680 unsigned ZeroMask = 0x0c0c0c0c;
15681 unsigned FMask = 0xFF << (8 * (3 - Step));
15683 unsigned FirstMask =
15684 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15685 unsigned SecondMask =
15686 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15690 int FirstGroup = -1;
15691 for (
int I = 0;
I < 2;
I++) {
15693 auto MatchesFirst = [&BPP](
DotSrc &IterElt) {
15694 return IterElt.SrcOp == *BPP.first.Src &&
15695 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15699 if (Match != Srcs.
end()) {
15700 Match->PermMask =
addPermMasks(FirstMask, Match->PermMask);
15705 if (FirstGroup != -1) {
15707 auto MatchesSecond = [&BPP](
DotSrc &IterElt) {
15708 return IterElt.SrcOp == *BPP.second.Src &&
15709 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15712 if (Match != Srcs.
end()) {
15713 Match->PermMask =
addPermMasks(SecondMask, Match->PermMask);
15715 Srcs.
push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15723 unsigned ZeroMask = 0x0c0c0c0c;
15724 unsigned FMask = 0xFF << (8 * (3 - Step));
15728 ((Src0.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15732 ((Src1.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15741 if (Srcs.
size() == 1) {
15742 auto *Elt = Srcs.
begin();
15746 if (Elt->PermMask == 0x3020100)
15753 auto *FirstElt = Srcs.
begin();
15754 auto *SecondElt = std::next(FirstElt);
15761 auto FirstMask = FirstElt->PermMask;
15762 auto SecondMask = SecondElt->PermMask;
15764 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15765 unsigned FirstPlusFour = FirstMask | 0x04040404;
15768 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15780 FirstElt = std::next(SecondElt);
15781 if (FirstElt == Srcs.
end())
15784 SecondElt = std::next(FirstElt);
15787 if (SecondElt == Srcs.
end()) {
15793 DAG.
getConstant(FirstElt->PermMask, SL, MVT::i32)));
15799 return Perms.
size() == 2
15805 for (
auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15806 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15807 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15808 EntryMask += ZeroMask;
15813 auto Opcode =
Op.getOpcode();
15819static std::optional<bool>
15830 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15833 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15835 assert(!(S0IsUnsigned && S0IsSigned));
15836 assert(!(S1IsUnsigned && S1IsSigned));
15844 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15850 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15851 return std::nullopt;
15863 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15864 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15869 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15875 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15876 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15877 return std::nullopt;
15883 DAGCombinerInfo &DCI)
const {
15884 SelectionDAG &DAG = DCI.DAG;
15885 EVT VT =
N->getValueType(0);
15891 if (Subtarget->hasMad64_32()) {
15892 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
15897 if (
SDValue V = reassociateScalarOps(
N, DAG)) {
15901 if (VT == MVT::i64) {
15902 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
15907 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
15909 std::optional<bool> IsSigned;
15915 int ChainLength = 0;
15916 for (
int I = 0;
I < 4;
I++) {
15920 auto Src0 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15923 auto Src1 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15928 TempNode->getOperand(MulIdx), *Src0, *Src1,
15929 TempNode->getOperand(MulIdx)->getOperand(0),
15930 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15934 IsSigned = *IterIsSigned;
15935 if (*IterIsSigned != *IsSigned)
15938 auto AddIdx = 1 - MulIdx;
15941 if (
I == 2 &&
isMul(TempNode->getOperand(AddIdx))) {
15942 Src2s.
push_back(TempNode->getOperand(AddIdx));
15952 TempNode->getOperand(AddIdx), *Src0, *Src1,
15953 TempNode->getOperand(AddIdx)->getOperand(0),
15954 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15958 if (*IterIsSigned != *IsSigned)
15962 ChainLength =
I + 2;
15966 TempNode = TempNode->getOperand(AddIdx);
15968 ChainLength =
I + 1;
15969 if (TempNode->getNumOperands() < 2)
15971 LHS = TempNode->getOperand(0);
15972 RHS = TempNode->getOperand(1);
15975 if (ChainLength < 2)
15981 if (ChainLength < 4) {
15991 bool UseOriginalSrc =
false;
15992 if (ChainLength == 4 && Src0s.
size() == 1 && Src1s.
size() == 1 &&
15993 Src0s.
begin()->PermMask == Src1s.
begin()->PermMask &&
15994 Src0s.
begin()->SrcOp.getValueSizeInBits() >= 32 &&
15995 Src1s.
begin()->SrcOp.getValueSizeInBits() >= 32) {
15996 SmallVector<unsigned, 4> SrcBytes;
15997 auto Src0Mask = Src0s.
begin()->PermMask;
15998 SrcBytes.
push_back(Src0Mask & 0xFF000000);
15999 bool UniqueEntries =
true;
16000 for (
auto I = 1;
I < 4;
I++) {
16001 auto NextByte = Src0Mask & (0xFF << ((3 -
I) * 8));
16004 UniqueEntries =
false;
16010 if (UniqueEntries) {
16011 UseOriginalSrc =
true;
16013 auto *FirstElt = Src0s.
begin();
16017 auto *SecondElt = Src1s.
begin();
16019 SecondElt->DWordOffset);
16028 if (!UseOriginalSrc) {
16035 DAG.
getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16038 : Intrinsic::amdgcn_udot4,
16048 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16053 unsigned Opc =
LHS.getOpcode();
16065 auto Cond =
RHS.getOperand(0);
16070 SDVTList VTList = DAG.
getVTList(MVT::i32, MVT::i1);
16087 DAGCombinerInfo &DCI)
const {
16088 SelectionDAG &DAG = DCI.DAG;
16090 EVT VT =
N->getValueType(0);
16103 SDNodeFlags ShlFlags = N1->
getFlags();
16107 SDNodeFlags NewShlFlags =
16112 DCI.AddToWorklist(Inner.
getNode());
16119 if (Subtarget->hasMad64_32()) {
16120 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
16129 if (VT == MVT::i64) {
16130 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16138 if (
const GlobalAddressSDNode *GA =
16143 SDNodeFlags
Flags =
16146 DCI.AddToWorklist(Inner.
getNode());
16174 SDNodeFlags ReassocFlags =
16177 if (ZIsConstant != YIsConstant) {
16181 DCI.AddToWorklist(Inner.
getNode());
16189 assert(!YIsConstant && !ZIsConstant);
16191 if (!
X->isDivergent() &&
Y->isDivergent() !=
Z->isDivergent()) {
16200 if (
Y->isDivergent())
16203 DCI.AddToWorklist(UniformInner.
getNode());
16211 DAGCombinerInfo &DCI)
const {
16212 SelectionDAG &DAG = DCI.DAG;
16213 EVT VT =
N->getValueType(0);
16215 if (VT == MVT::i64) {
16216 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16220 if (VT != MVT::i32)
16229 unsigned Opc =
RHS.getOpcode();
16236 auto Cond =
RHS.getOperand(0);
16241 SDVTList VTList = DAG.
getVTList(MVT::i32, MVT::i1);
16259SITargetLowering::performAddCarrySubCarryCombine(
SDNode *
N,
16260 DAGCombinerInfo &DCI)
const {
16262 if (
N->getValueType(0) != MVT::i32)
16268 SelectionDAG &DAG = DCI.DAG;
16273 unsigned LHSOpc =
LHS.getOpcode();
16274 unsigned Opc =
N->getOpcode();
16278 return DAG.
getNode(
Opc, SDLoc(
N),
N->getVTList(), Args);
16284 DAGCombinerInfo &DCI)
const {
16288 SelectionDAG &DAG = DCI.DAG;
16289 EVT VT =
N->getValueType(0);
16301 if (
A ==
LHS.getOperand(1)) {
16302 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
16303 if (FusedOp != 0) {
16305 return DAG.
getNode(FusedOp, SL, VT,
A, Two,
RHS);
16313 if (
A ==
RHS.getOperand(1)) {
16314 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
16315 if (FusedOp != 0) {
16317 return DAG.
getNode(FusedOp, SL, VT,
A, Two,
LHS);
16326 DAGCombinerInfo &DCI)
const {
16330 SelectionDAG &DAG = DCI.DAG;
16332 EVT VT =
N->getValueType(0);
16345 if (
A ==
LHS.getOperand(1)) {
16346 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
16347 if (FusedOp != 0) {
16351 return DAG.
getNode(FusedOp, SL, VT,
A, Two, NegRHS);
16360 if (
A ==
RHS.getOperand(1)) {
16361 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
16362 if (FusedOp != 0) {
16364 return DAG.
getNode(FusedOp, SL, VT,
A, NegTwo,
LHS);
16373 DAGCombinerInfo &DCI)
const {
16374 SelectionDAG &DAG = DCI.DAG;
16376 EVT VT =
N->getValueType(0);
16377 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
16383 SDNodeFlags
Flags =
N->getFlags();
16384 SDNodeFlags RHSFlags =
RHS->getFlags();
16390 bool IsNegative =
false;
16391 if (CLHS->isExactlyValue(1.0) ||
16392 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16395 if (
RHS.getOpcode() == ISD::FSQRT) {
16399 return IsNegative ? DAG.
getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16408 DAGCombinerInfo &DCI)
const {
16409 SelectionDAG &DAG = DCI.DAG;
16410 EVT VT =
N->getValueType(0);
16414 if (!
N->isDivergent() &&
getSubtarget()->hasSALUFloatInsts() &&
16415 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16430 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16435 const ConstantFPSDNode *FalseNode =
16445 if (ScalarVT == MVT::f32 &&
16451 if (TrueNodeExpVal == INT_MIN)
16454 if (FalseNodeExpVal == INT_MIN)
16467 return DAG.
getNode(ISD::FLDEXP, SL, VT,
LHS, SelectNode,
N->getFlags());
16474 DAGCombinerInfo &DCI)
const {
16475 SelectionDAG &DAG = DCI.DAG;
16476 EVT VT =
N->getValueType(0);
16479 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16497 (
N->getFlags().hasAllowContract() &&
16498 FMA->getFlags().hasAllowContract())) {
16513 if (FMAOp1.
getOpcode() != ISD::FP_EXTEND ||
16532 if (Vec1 == Vec2 || Vec3 == Vec4)
16538 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16547 DAGCombinerInfo &DCI)
const {
16548 SelectionDAG &DAG = DCI.DAG;
16553 EVT VT =
LHS.getValueType();
16582 return LHS.getOperand(0);
16590 LHS.getConstantOperandVal(1) !=
LHS.getConstantOperandVal(2) &&
16597 const APInt &CT =
LHS.getConstantOperandAPInt(1);
16598 const APInt &CF =
LHS.getConstantOperandAPInt(2);
16606 return LHS.getOperand(0);
16610 if (VT != MVT::f32 && VT != MVT::f64 &&
16611 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16619 LHS.getOpcode() == ISD::FABS) {
16626 const unsigned IsInfMask =
16628 const unsigned IsFiniteMask =
16642SITargetLowering::performCvtF32UByteNCombine(
SDNode *
N,
16643 DAGCombinerInfo &DCI)
const {
16644 SelectionDAG &DAG = DCI.DAG;
  unsigned ShiftOffset = 8 * Offset;
    ShiftOffset -= C->getZExtValue();
    ShiftOffset += C->getZExtValue();
    if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
                         MVT::f32, Shifted);
    DCI.AddToWorklist(N);
    return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
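  // What the combine above does: cvt_f32_ubyteN(srl x, 8*k) reads byte N+k of
  // x, so a shift by a whole number of bytes is folded into the byte index as
  // long as it still lands inside the 32-bit source, e.g.
  //   cvt_f32_ubyte0 (srl x, 16)  -->  cvt_f32_ubyte2 x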
16697 DAGCombinerInfo &DCI)
const {
16702 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16706 (
F.isNaN() && MF.
getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16707 return DCI.DAG.getConstantFP(Zero, SDLoc(
N),
N->getValueType(0));
16710 APFloat One(
F.getSemantics(),
"1.0");
16712 return DCI.DAG.getConstantFP(One, SDLoc(
N),
N->getValueType(0));
16718 DAGCombinerInfo &DCI)
const {
16739 bool isFloatingPoint =
LHS.getValueType().isFloatingPoint();
16740 bool isInteger =
LHS.getValueType().isInteger();
16743 if (!isFloatingPoint && !isInteger)
16748 if (!isEquality && !isNonEquality)
16765 if (isFloatingPoint) {
16767 if (!Val.
isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16778 if (!(isEquality && TrueVal == ConstVal) &&
16779 !(isNonEquality && FalseVal == ConstVal))
16786 SelectLHS, SelectRHS);
16791 switch (N->getOpcode()) {
16807 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16817 switch (N->getOpcode()) {
16819 return performAddCombine(N, DCI);
16821 return performPtrAddCombine(N, DCI);
16823 return performSubCombine(N, DCI);
16826 return performAddCarrySubCarryCombine(N, DCI);
16828 return performFAddCombine(N, DCI);
16830 return performFSubCombine(N, DCI);
16832 return performFDivCombine(N, DCI);
16834 return performFMulCombine(N, DCI);
16836 return performSetCCCombine(N, DCI);
16838 if (auto Res = performSelectCombine(N, DCI))
16843 case ISD::FMAXNUM_IEEE:
16844 case ISD::FMINNUM_IEEE:
16845 case ISD::FMAXIMUM:
16846 case ISD::FMINIMUM:
16847 case ISD::FMAXIMUMNUM:
16848 case ISD::FMINIMUMNUM:
16855 return performMinMaxCombine(N, DCI);
16857 return performFMACombine(N, DCI);
16859 return performAndCombine(N, DCI);
16861 return performOrCombine(N, DCI);
16864 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
16865 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16871 return performXorCombine(N, DCI);
16873 return performZeroExtendCombine(N, DCI);
16875 return performSignExtendInRegCombine(N, DCI);
16877 return performClassCombine(N, DCI);
16879 return performFCanonicalizeCombine(N, DCI);
16881 return performRcpCombine(N, DCI);
16896 return performUCharToFloatCombine(N, DCI);
16898 return performFCopySignCombine(N, DCI);
16903 return performCvtF32UByteNCombine(N, DCI);
16905 return performFMed3Combine(N, DCI);
16907 return performCvtPkRTZCombine(N, DCI);
16909 return performClampCombine(N, DCI);
16912 EVT VT = N->getValueType(0);
16915 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16918 EVT EltVT = Src.getValueType();
16919 if (EltVT != MVT::i16)
16920 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
16923 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
16929 return performExtractVectorEltCombine(N, DCI);
16931 return performInsertVectorEltCombine(N, DCI);
16933 return performFPRoundCombine(N, DCI);
16942 return performMemSDNodeCombine(MemNode, DCI);
16973 unsigned Opcode = Node->getMachineOpcode();
16976 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16977 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
16980 SDNode *Users[5] = {nullptr};
16982 unsigned DmaskIdx =
16983 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16984 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
16985 unsigned NewDmask = 0;
16986 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16987 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16988 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
16989 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
16990 unsigned TFCLane = 0;
16991 bool HasChain = Node->getNumValues() > 1;
16993 if (OldDmask == 0) {
17001 TFCLane = OldBitsSet;
17005 for (SDUse &Use : Node->uses()) {
17008 if (Use.getResNo() != 0)
17011 SDNode *User = Use.getUser();
17014 if (!User->isMachineOpcode() ||
17015 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17027 if (UsesTFC && Lane == TFCLane) {
17032 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17034 Dmask &= ~(1 << Comp);
17042 NewDmask |= 1 << Comp;
17047 bool NoChannels = !NewDmask;
17054 if (OldBitsSet == 1)
17060 if (NewDmask == OldDmask)
17069 unsigned NewChannels = BitsSet + UsesTFC;
17073 assert(NewOpcode != -1 &&
17074 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17075 "failed to find equivalent MIMG op");
17083 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17085 MVT ResultVT = NewChannels == 1
17088 : NewChannels == 5 ? 8
17090 SDVTList NewVTList =
17093 MachineSDNode *NewNode =
17102 if (NewChannels == 1) {
17112 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17117 if (i || !NoChannels)
17122 if (NewUser != User) {
17132 Idx = AMDGPU::sub1;
17135 Idx = AMDGPU::sub2;
17138 Idx = AMDGPU::sub3;
17141 Idx = AMDGPU::sub4;
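// Editor's note: a standalone sketch (illustrative only) of the dmask
// bookkeeping adjustWritemask performs above. An image load's dmask has one bit
// per enabled component, and lane L of the returned vector corresponds to the
// L-th set bit of the old dmask; keeping only the lanes actually extracted
// yields the shrunk NewDmask.
#include <bit>      // std::countr_zero (C++20)
#include <cstdint>

// Mirrors the loop around source lines 17032-17042: map a result lane back to
// its original component and record that component in the new dmask.
static unsigned keepLane(unsigned OldDmask, unsigned Lane, unsigned &NewDmask) {
  unsigned Dmask = OldDmask, Comp = 0;
  for (unsigned I = 0; I <= Lane && Dmask != 0; ++I) {
    Comp = std::countr_zero(Dmask); // lowest remaining set bit
    Dmask &= ~(1u << Comp);
  }
  NewDmask |= 1u << Comp;
  return Comp;
}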
17152 Op = Op.getOperand(0);
17173 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17177 Node->getOperand(0), SL, VReg, SrcVal,
17183 return ToResultReg.getNode();
17188 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17190 Ops.push_back(Node->getOperand(i));
17196 Node->getOperand(i).getValueType(),
17197 Node->getOperand(i)),
17209 unsigned Opcode = Node->getMachineOpcode();
17211 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17212 !TII->isGather4(Opcode) &&
17214 return adjustWritemask(Node, DAG);
17217 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17223 case AMDGPU::V_DIV_SCALE_F32_e64:
17224 case AMDGPU::V_DIV_SCALE_F64_e64: {
17234 (Src0 == Src1 || Src0 == Src2))
17290 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17291 unsigned InitIdx = 0;
17293 if (TII->isImage(MI)) {
17301 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17302 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17303 unsigned D16Val = D16 ? D16->getImm() : 0;
17305 if (!TFEVal && !LWEVal)
17316 assert(MO_Dmask && "Expected dmask operand in instruction");
17318 unsigned dmask = MO_Dmask->getImm();
17323 bool Packed = !Subtarget->hasUnpackedD16VMem();
17325 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17331 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17332 if (DstSize < InitIdx)
17335 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17343 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17344 unsigned NewDst = 0;
17349 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17350 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17353 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17354 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17374 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17387 if (TII->isVOP3(MI.getOpcode())) {
17389 TII->legalizeOperandsVOP3(MRI, MI);
17394 if (!MI.getDesc().operands().empty()) {
17395 unsigned Opc = MI.getOpcode();
17396 bool HasAGPRs = Info->mayNeedAGPRs();
17398 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
17400 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
17401 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
17404 if ((I == Src2Idx) && (HasAGPRs))
17407 if (!Op.isReg() || !Op.getReg().isVirtual())
17409 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
17410 if (!TRI->hasAGPRs(RC))
17412 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
17413 if (!Src || !Src->isCopy() ||
17414 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
17416 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
17420 MRI.setRegClass(Op.getReg(), NewRC);
17423 if (TII->isMAI(MI)) {
17428 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17429 AMDGPU::OpName::scale_src0);
17430 if (Src0Idx != -1) {
17431 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17432 AMDGPU::OpName::scale_src1);
17433 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17434 TII->usesConstantBus(MRI, MI, Src1Idx))
17435 TII->legalizeOpWithMove(MI, Src1Idx);
17443 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
17444 if (Src2->isReg() && Src2->getReg().isVirtual()) {
17445 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
17446 if (TRI->isVectorSuperClass(RC)) {
17447 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
17448 MRI.setRegClass(Src2->getReg(), NewRC);
17449 if (Src2->isTied())
17450 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
17459 if (TII->isImage(MI))
17460 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17534 std::pair<unsigned, const TargetRegisterClass *>
17541 if (Constraint.size() == 1) {
17545 if (VT == MVT::Other)
17548 switch (Constraint[0]) {
17555 RC = &AMDGPU::SReg_32RegClass;
17558 RC = &AMDGPU::SGPR_64RegClass;
17563 return std::pair(0U, nullptr);
17570 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17571 : &AMDGPU::VGPR_32_Lo256RegClass;
17574 RC = Subtarget->has1024AddressableVGPRs()
17575 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17578 return std::pair(0U, nullptr);
17583 if (!Subtarget->hasMAIInsts())
17587 RC = &AMDGPU::AGPR_32RegClass;
17592 return std::pair(0U, nullptr);
17597 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17601 RC = &AMDGPU::AV_32RegClass;
17604 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17606 return std::pair(0U, nullptr);
17615 return std::pair(0U, RC);
17618 if (Kind != '\0') {
17620 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17621 } else if (Kind == 's') {
17622 RC = &AMDGPU::SGPR_32RegClass;
17623 } else if (Kind == 'a') {
17624 RC = &AMDGPU::AGPR_32RegClass;
17630 return std::pair(0U, nullptr);
17636 return std::pair(0U, nullptr);
17640 RC = TRI->getVGPRClassForBitWidth(Width);
17642 RC = TRI->getSGPRClassForBitWidth(Width);
17644 RC = TRI->getAGPRClassForBitWidth(Width);
17646 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17651 return std::pair(0U, nullptr);
17653 return std::pair(Reg, RC);
17659 return std::pair(0U, nullptr);
17660 if (Idx < RC->getNumRegs())
17662 return std::pair(0U, nullptr);
17668 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17674 if (Constraint.size() == 1) {
17675 switch (Constraint[0]) {
17685 } else if (Constraint == "DA" || Constraint == "DB") {
17693 if (Constraint.size() == 1) {
17694 switch (Constraint[0]) {
17702 } else if (Constraint.size() == 2) {
17703 if (Constraint == "VA")
17721 std::vector<SDValue> &Ops,
17736 unsigned Size = Op.getScalarValueSizeInBits();
17740 if (Size == 16 && !Subtarget->has16BitInsts())
17744 Val = C->getSExtValue();
17748 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17752 if (Size != 16 || Op.getNumOperands() != 2)
17754 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17757 Val = C->getSExtValue();
17761 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17771 if (Constraint.size() == 1) {
17772 switch (Constraint[0]) {
17787 } else if (Constraint.size() == 2) {
17788 if (Constraint == "DA") {
17789 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17790 int64_t LoBits = static_cast<int32_t>(Val);
17794 if (Constraint == "DB") {
17802 unsigned MaxSize) const {
17803 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17804 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17806 MVT VT = Op.getSimpleValueType();
17831 switch (UnalignedClassID) {
17832 case AMDGPU::VReg_64RegClassID:
17833 return AMDGPU::VReg_64_Align2RegClassID;
17834 case AMDGPU::VReg_96RegClassID:
17835 return AMDGPU::VReg_96_Align2RegClassID;
17836 case AMDGPU::VReg_128RegClassID:
17837 return AMDGPU::VReg_128_Align2RegClassID;
17838 case AMDGPU::VReg_160RegClassID:
17839 return AMDGPU::VReg_160_Align2RegClassID;
17840 case AMDGPU::VReg_192RegClassID:
17841 return AMDGPU::VReg_192_Align2RegClassID;
17842 case AMDGPU::VReg_224RegClassID:
17843 return AMDGPU::VReg_224_Align2RegClassID;
17844 case AMDGPU::VReg_256RegClassID:
17845 return AMDGPU::VReg_256_Align2RegClassID;
17846 case AMDGPU::VReg_288RegClassID:
17847 return AMDGPU::VReg_288_Align2RegClassID;
17848 case AMDGPU::VReg_320RegClassID:
17849 return AMDGPU::VReg_320_Align2RegClassID;
17850 case AMDGPU::VReg_352RegClassID:
17851 return AMDGPU::VReg_352_Align2RegClassID;
17852 case AMDGPU::VReg_384RegClassID:
17853 return AMDGPU::VReg_384_Align2RegClassID;
17854 case AMDGPU::VReg_512RegClassID:
17855 return AMDGPU::VReg_512_Align2RegClassID;
17856 case AMDGPU::VReg_1024RegClassID:
17857 return AMDGPU::VReg_1024_Align2RegClassID;
17858 case AMDGPU::AReg_64RegClassID:
17859 return AMDGPU::AReg_64_Align2RegClassID;
17860 case AMDGPU::AReg_96RegClassID:
17861 return AMDGPU::AReg_96_Align2RegClassID;
17862 case AMDGPU::AReg_128RegClassID:
17863 return AMDGPU::AReg_128_Align2RegClassID;
17864 case AMDGPU::AReg_160RegClassID:
17865 return AMDGPU::AReg_160_Align2RegClassID;
17866 case AMDGPU::AReg_192RegClassID:
17867 return AMDGPU::AReg_192_Align2RegClassID;
17868 case AMDGPU::AReg_256RegClassID:
17869 return AMDGPU::AReg_256_Align2RegClassID;
17870 case AMDGPU::AReg_512RegClassID:
17871 return AMDGPU::AReg_512_Align2RegClassID;
17872 case AMDGPU::AReg_1024RegClassID:
17873 return AMDGPU::AReg_1024_Align2RegClassID;
17889 if (Info->isEntryFunction()) {
17896 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17898 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17899 : TRI->getAlignedHighSGPRForRC(MF, 2,
17900 &AMDGPU::SGPR_64RegClass);
17901 Info->setSGPRForEXECCopy(SReg);
17903 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17904 Info->getStackPtrOffsetReg()));
17905 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17906 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17910 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17911 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17913 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17914 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17916 Info->limitOccupancy(MF);
17918 if (ST.isWave32() && !MF.empty()) {
17919 for (auto &MBB : MF) {
17920 for (auto &MI : MBB) {
17921 TII->fixImplicitOperands(MI);
17931 if (ST.needsAlignedVGPRs()) {
17932 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17938 if (NewClassID != -1)
17939 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17948 const APInt &DemandedElts,
17950 unsigned Depth) const {
17952 unsigned Opc = Op.getOpcode();
17955 unsigned IID = Op.getConstantOperandVal(0);
17957 case Intrinsic::amdgcn_mbcnt_lo:
17958 case Intrinsic::amdgcn_mbcnt_hi: {
17964 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17974 Op, Known, DemandedElts, DAG, Depth);
17990 unsigned MaxValue =
17997 unsigned BFEWidth, bool SExt, unsigned Depth) {
18001 unsigned Src1Cst = 0;
18002 if (Src1.isImm()) {
18003 Src1Cst = Src1.getImm();
18004 } else if (Src1.isReg()) {
18008 Src1Cst = Cst->Value.getZExtValue();
18019 if (Width >= BFEWidth)
18028 Known = Known.sext(BFEWidth);
18030 Known = Known.zext(BFEWidth);
18036 unsigned Depth) const {
18039 switch (MI->getOpcode()) {
18040 case AMDGPU::S_BFE_I32:
18043 case AMDGPU::S_BFE_U32:
18046 case AMDGPU::S_BFE_I64:
18049 case AMDGPU::S_BFE_U64:
18052 case AMDGPU::G_INTRINSIC:
18053 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18056 case Intrinsic::amdgcn_workitem_id_x:
18059 case Intrinsic::amdgcn_workitem_id_y:
18062 case Intrinsic::amdgcn_workitem_id_z:
18065 case Intrinsic::amdgcn_mbcnt_lo:
18066 case Intrinsic::amdgcn_mbcnt_hi: {
18078 case Intrinsic::amdgcn_groupstaticsize: {
18089 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18092 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18095 case AMDGPU::G_AMDGPU_SMED3:
18096 case AMDGPU::G_AMDGPU_UMED3: {
18097 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
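// Editor's note: a standalone sketch (illustrative, not from the file) of the
// bitfield-extract semantics the S_BFE known-bits handling above reasons about:
// extract Width bits starting at Offset, then zero- or sign-extend back to the
// full register width, mirroring the Known.zext/Known.sext split.
#include <cstdint>

static uint32_t bfeU32(uint32_t Src, unsigned Offset, unsigned Width) {
  if (Width == 0)
    return 0;
  uint32_t Mask = (Width < 32) ? ((1u << Width) - 1) : ~0u;
  return (Src >> Offset) & Mask; // unsigned variant: high bits become zero
}

static int32_t bfeI32(uint32_t Src, unsigned Offset, unsigned Width) {
  if (Width == 0)
    return 0;
  uint32_t Field = bfeU32(Src, Offset, Width);
  uint32_t SignBit = 1u << (Width - 1);
  return static_cast<int32_t>((Field ^ SignBit) - SignBit); // sign-extend
}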
18124 unsigned Depth) const {
18131 AttributeList Attrs =
18133 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18160 if (Header->getAlignment() != PrefAlign)
18161 return Header->getAlignment();
18163 unsigned LoopSize = 0;
18168 LoopSize += MBB->getAlignment().value() / 2;
18171 LoopSize += TII->getInstSizeInBytes(MI);
18172 if (LoopSize > 192)
18177 if (LoopSize <= 64)
18180 if (LoopSize <= 128)
18181 return CacheLineAlign;
18187 auto I = Exit->getFirstNonDebugInstr();
18188 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18189 return CacheLineAlign;
18198 if (PreTerm == Pre->begin() ||
18199 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18203 auto ExitHead = Exit->getFirstNonDebugInstr();
18204 if (ExitHead == Exit->end() ||
18205 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18210 return CacheLineAlign;
18218 N = N->getOperand(0).getNode();
18219 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18228 switch (N->getOpcode()) {
18236 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18237 return !TRI->isSGPRReg(MRI, Reg);
18243 return !TRI->isSGPRReg(MRI, Reg);
18247 unsigned AS = L->getAddressSpace();
18251 case ISD::CALLSEQ_END:
18280 return A->readMem() && A->writeMem();
18301 switch (Ty.getScalarSizeInBits()) {
18313 const APInt &DemandedElts,
18316 unsigned Depth) const {
18321 if (Info->getMode().DX10Clamp)
18333 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18353 << "Hardware instruction generated for atomic "
18355 << " operation at memory scope " << MemScope;
18360 Type *EltTy = VT->getElementType();
18361 return VT->getNumElements() == 2 &&
18381 unsigned BW = IT->getBitWidth();
18382 return BW == 32 || BW == 64;
18396 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18397 return BW == 32 || BW == 64;
18400 if (Ty->isFloatTy() || Ty->isDoubleTy())
18404 return VT->getNumElements() == 2 &&
18405 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18415 bool HasSystemScope) {
18422 if (HasSystemScope) {
18431 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18444 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18470 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18483 bool HasSystemScope =
18509 if (Subtarget->hasEmulatedSystemScopeAtomics())
18525 if (!HasSystemScope &&
18526 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18538 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18546 ConstVal && ConstVal->isNullValue())
18584 if (Ty->isFloatTy()) {
18589 if (Ty->isDoubleTy()) {
18610 if (Ty->isFloatTy() &&
18611 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18624 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18628 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18632 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18637 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18642 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18646 if (Ty->isFloatTy()) {
18649 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18652 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18657 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18665 if (Subtarget->hasFlatAtomicFaddF32Inst())
18674 if (Subtarget->hasLDSFPAtomicAddF32()) {
18675 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18677 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18705 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18707 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18711 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18713 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18766 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18767 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18768 : &AMDGPU::SReg_32RegClass;
18769 if (!TRI->isSGPRClass(RC) && !isDivergent)
18770 return TRI->getEquivalentSGPRClass(RC);
18771 if (TRI->isSGPRClass(RC) && isDivergent)
18772 return TRI->getEquivalentVGPRClass(RC);
18784 unsigned WaveSize) {
18789 if (!IT || IT->getBitWidth() != WaveSize)
18794 if (!Visited.insert(V).second)
18796 bool Result = false;
18797 for (const auto *U : V->users()) {
18799 if (V == U->getOperand(1)) {
18804 case Intrinsic::amdgcn_if_break:
18805 case Intrinsic::amdgcn_if:
18806 case Intrinsic::amdgcn_else:
18811 if (V == U->getOperand(0)) {
18816 case Intrinsic::amdgcn_end_cf:
18817 case Intrinsic::amdgcn_loop:
18823 Result = hasCFUser(U, Visited, WaveSize);
18832 const Value *V) const {
18834 if (CI->isInlineAsm()) {
18843 for (auto &TC : TargetConstraints) {
18857 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18885 return MRI.hasOneNonDBGUse(N0);
18892 if (I.getMetadata("amdgpu.noclobber"))
18894 if (I.getMetadata("amdgpu.last.use"))
18904 if (!Def->isMachineOpcode())
18914 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18915 PhysReg = AMDGPU::SCC;
18917 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18983 Alignment = RMW->getAlign();
18996 bool FullFlatEmulation =
18998 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18999 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19000 RMW->getType()->isDoubleTy()));
19003 bool ReturnValueIsUsed = !AI->use_empty();
19012 if (FullFlatEmulation) {
19023 std::prev(BB->end())->eraseFromParent();
19024 Builder.SetInsertPoint(BB);
19026 Value *LoadedShared = nullptr;
19027 if (FullFlatEmulation) {
19028 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19029 {Addr}, nullptr, "is.shared");
19030 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19031 Builder.SetInsertPoint(SharedBB);
19032 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19038 LoadedShared = Clone;
19040 Builder.CreateBr(PhiBB);
19041 Builder.SetInsertPoint(CheckPrivateBB);
19044 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19045 {Addr}, nullptr, "is.private");
19046 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19048 Builder.SetInsertPoint(PrivateBB);
19050 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19053 Value *LoadedPrivate;
19055 LoadedPrivate = Builder.CreateAlignedLoad(
19056 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19059 LoadedPrivate, RMW->getValOperand());
19061 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19063 auto [ResultLoad, Equal] =
19069 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19072 Builder.CreateBr(PhiBB);
19074 Builder.SetInsertPoint(GlobalBB);
19078 if (FullFlatEmulation) {
19079 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19088 if (!FullFlatEmulation) {
19093 MDNode *RangeNotPrivate =
19096 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19100 Builder.CreateBr(PhiBB);
19102 Builder.SetInsertPoint(PhiBB);
19104 if (ReturnValueIsUsed) {
19107 if (FullFlatEmulation)
19114 Builder.CreateBr(ExitBB);
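// Editor's note: a schematic sketch (illustrative only, not the actual IR
// produced) of the control flow the expansion above builds around a flat
// atomicrmw whose pointer may live in any address space:
//
//   if (amdgcn.is.shared(p))        -> clone of the atomic on an LDS pointer
//   else if (amdgcn.is.private(p))  -> plain load, apply the operation, store
//   else                            -> atomic on a global pointer
//   a phi of the three results feeds the original users.
//
// The private path need not be atomic because scratch memory is per-lane; the
// helper below mirrors that load/modify/store shape for an fadd, with the
// pointer type purely illustrative.
static float privateAtomicFAdd(float *ScratchPtr, float Operand) {
  float Loaded = *ScratchPtr;     // aligned load of the old value
  *ScratchPtr = Loaded + Operand; // apply the RMW operation and store back
  return Loaded;                  // atomicrmw yields the old value
}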
19118 unsigned PtrOpIdx) {
19119 Value *PtrOp = I->getOperand(PtrOpIdx);
19126 I->setOperand(PtrOpIdx, ASCast);
19138 ConstVal && ConstVal->isNullValue()) {
19168 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19176 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19191 LoadInst *LI = Builder.CreateAlignedLoad(
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
iv Induction Variable Users
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static cl::opt< bool > UseSelectionDAGPTRADD("amdgpu-use-sdag-ptradd", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " "SelectionDAG ISel"), cl::init(false))
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
const Function * getParent() const
Return the enclosing method, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
A parsed version of the target data layout string in and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
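For context, a hedged sketch of the common custom-inserter pattern these MachineBasicBlock helpers support; BB, MI, and their MachineFunction are assumed to be in scope, and a real implementation would also fix up live-ins.
// Sketch: split the flow after MI into a fresh block (names assumed, not from this file).
MachineFunction *MF = BB->getParent();
MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock(BB->getBasicBlock());
MF->insert(std::next(BB->getIterator()), NewBB);
// Move everything after MI into the new block and rewire the CFG.
NewBB->splice(NewBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)),
              BB->end());
NewBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(NewBB);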
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
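The MachineInstrBuilder helpers above are normally chained off BuildMI; a hedged sketch follows, where the opcode, registers, and immediate are placeholders and the operand list must match what the chosen opcode actually expects.
// Sketch only: MBB, InsertPt, DL, TII, Opcode, DstReg, SrcReg, Imm are assumed.
MachineInstrBuilder MIB =
    BuildMI(MBB, InsertPt, DL, TII->get(Opcode), DstReg)
        .addReg(SrcReg)   // register use
        .addImm(Imm);     // immediate operand
MIB.cloneMemRefs(OtherMI); // reuse the memory operands of an existing MachineInstr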
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
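A sketch of allocating one of these memory operands through the MachineFunction overload listed earlier; MF and PtrInfo are assumed inputs, and the 32-bit size and 4-byte alignment are placeholders.
// Sketch: describe a 4-byte, naturally aligned, dereferenceable load.
MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable,
    LLT::scalar(32), Align(4));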
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
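A brief sketch of how these SDValue/SDNode accessors are typically used when inspecting a node during combining; Op is an assumed input value.
// Sketch: peek at an ADD with a single user before deciding to rewrite it.
if (Op.getOpcode() == ISD::ADD && Op.hasOneUse()) {
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  // ... decide whether LHS/RHS have the shape the combine wants ...
}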
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if N can be combined with another node to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
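To show how these SelectionDAG builders fit together, a hedged lowering-style fragment that clamps a value at zero; DAG and Op are assumed inputs, and a real lowering would query getSetCCResultType instead of hard-coding MVT::i1.
// Sketch only, not a lowering taken from this file.
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue IsNeg = DAG.getSetCC(DL, MVT::i1, Op, Zero, ISD::SETLT);
SDValue Clamped = DAG.getSelect(DL, VT, IsNeg, Zero, Op);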
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
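A short sketch of the SmallVector operations above; Chain and ExtraOps are assumed values from the surrounding context.
// Sketch: gather operands into a stack-allocated vector before building a node.
SmallVector<SDValue, 8> Ops;
Ops.push_back(Chain);                          // single element
Ops.append(ExtraOps.begin(), ExtraOps.end());  // whole range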
An instruction for storing to memory.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
StringRef - Represent a constant reference to a string, i.e.
constexpr size_t size() const
size - Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
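A sketch of the usual StringSwitch pattern; the constraint strings and result values are placeholders, and Default (not listed above) is the customary terminator.
// Sketch: map an inline-asm constraint string to an arbitrary tag.
unsigned Tag = StringSwitch<unsigned>(Constraint)
                   .Case("s", 1)   // placeholder cases
                   .Case("v", 2)
                   .Default(0);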
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a type is legal for a target and, if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
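The TargetLoweringBase hooks above form the configuration surface a target constructor drives; a hedged sketch of that pattern, with placeholder register class, types, and actions rather than this target's actual settings.
// Sketch of constructor-time setup; the concrete choices here are placeholders.
addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
setOperationAction(ISD::ROTR, MVT::i64, Expand);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
setBooleanContents(ZeroOrOneBooleanContent);
setSchedulingPreference(Sched::RegPressure);
computeRegisterProperties(Subtarget->getRegisterInfo());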
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isVoidTy() const
Return true if this is 'void'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
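A small sketch of the typical IR rewrite these Value helpers support; OldV and NewV are assumed to be interchangeable values of the same type.
// Sketch: retire OldV in favour of NewV while keeping its name for readability.
NewV->takeName(OldV);
OldV->replaceAllUsesWith(NewV);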
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
constexpr bool isZero() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ TC_RETURN_GFX_WholeWave
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ SSUBO
Same for subtraction.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ SMULO
Same for multiplication.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
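A minimal sketch of the Intrinsic-namespace helpers indexed above, assuming an LLVM version that provides getDeclarationIfExists; the wrapper name and the choice of llvm.amdgcn.workitem.id.x are illustrative only:

#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"

// Returns true if the module already declares and uses llvm.amdgcn.workitem.id.x.
static bool moduleUsesWorkitemIdX(const llvm::Module &M) {
  // Non-null only if the declaration already exists in the module.
  llvm::Function *F = llvm::Intrinsic::getDeclarationIfExists(
      &M, llvm::Intrinsic::amdgcn_workitem_id_x);
  return F && !F->use_empty();
}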
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
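A minimal sketch of the GlobalISel matchers listed above (mi_match, m_GFCstOrSplat); the wrapper name matchSplatFPConstant is hypothetical:

#include "llvm/ADT/APFloat.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include <optional>

// Succeeds if Reg is a G_FCONSTANT (or a splat of one), returning its value.
static bool matchSplatFPConstant(llvm::Register Reg,
                                 const llvm::MachineRegisterInfo &MRI,
                                 llvm::APFloat &Out) {
  using namespace llvm::MIPatternMatch;
  std::optional<llvm::FPValueAndVReg> FPVal;
  if (!mi_match(Reg, MRI, m_GFCstOrSplat(FPVal)))
    return false;
  Out = FPVal->Value;
  return true;
}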
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
@ System
Synchronized with respect to all concurrently executing threads.
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
NodeAddr< NodeBase * > Node
friend class Instruction
Iterator for Instructions in a BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
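A minimal sketch combining the two SelectionDAG helpers above; the wrapper name is hypothetical:

#include "llvm/CodeGen/SelectionDAGNodes.h"

// True if V is the constant integer zero, ignoring any wrapping bitcasts.
static bool isZeroThroughBitcasts(llvm::SDValue V) {
  return llvm::isNullConstant(llvm::peekThroughBitcasts(V));
}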
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is Skew mod Align.
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
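A minimal sketch of how isShiftedMask_64, countr_zero, and popcount (all listed in this index) combine to recover the (offset, width) pair of a contiguous mask; decomposeShiftedMask is a hypothetical helper name:

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>
#include <optional>
#include <utility>

// 0x0FF0 -> (offset 4, width 8); returns std::nullopt for non-contiguous masks.
static std::optional<std::pair<unsigned, unsigned>>
decomposeShiftedMask(uint64_t Mask) {
  if (!llvm::isShiftedMask_64(Mask))
    return std::nullopt;
  unsigned Offset = llvm::countr_zero(Mask); // index of the lowest set bit
  unsigned Width = llvm::popcount(Mask);     // number of contiguous ones
  return std::make_pair(Offset, Width);
}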
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant bit, stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
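A minimal sketch of the isInt/isUInt width checks above; the 13- and 12-bit widths are illustrative, not a claim about any particular addressing mode:

#include "llvm/Support/MathExtras.h"
#include <cstdint>

static bool fitsSignedImm13(int64_t Offset) {
  return llvm::isInt<13>(Offset);  // e.g. a signed 13-bit immediate field
}

static bool fitsUnsignedImm12(uint64_t Offset) {
  return llvm::isUInt<12>(Offset); // e.g. an unsigned 12-bit immediate field
}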
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
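A minimal sketch of divideCeil together with the nearby PowerOf2Ceil and Log2_32 entries, on concrete numbers:

#include "llvm/Support/MathExtras.h"
#include <cstdint>

static void roundingHelpersExample() {
  unsigned NumDWords = llvm::divideCeil(37u, 4u); // 10 dwords cover 37 bytes
  uint64_t Pow2 = llvm::PowerOf2Ceil(37);         // 64
  unsigned Log = llvm::Log2_32(64);               // 6
  (void)NumDWords; (void)Pow2; (void)Log;
}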
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
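A minimal sketch of the alignment helpers indexed here (alignTo, alignDown, commonAlignment); the numbers are illustrative only:

#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>

static void alignmentExamples() {
  llvm::Align A(16);
  uint64_t Padded = llvm::alignTo(100, A);             // 112: next multiple of 16
  uint64_t Floor = llvm::alignDown(uint64_t(100), 16); // 96: previous multiple of 16
  llvm::Align AtOffset = llvm::commonAlignment(A, 20); // Align(4) holds at base+20
  (void)Padded; (void)Floor; (void)AtOffset;
}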
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
int popcount(T Value) noexcept
Count the number of set bits in a value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
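A minimal sketch exercising a few of the EVT queries listed above on v4f32; purely illustrative, with no target-specific behavior implied:

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cstdint>

static void evtExample(llvm::LLVMContext &Ctx) {
  llvm::EVT V4F32 =
      llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, /*NumElements=*/4);
  uint64_t Bits = V4F32.getSizeInBits().getFixedValue(); // 128
  llvm::EVT V4I32 = V4F32.changeTypeToInteger();         // v4i32
  bool SameSize = V4I32.bitsEq(V4F32);                   // true
  (void)Bits; (void)SameSize;
}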
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
void resetAll()
Resets the known state of all bits.
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
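A minimal sketch of the KnownBits members listed above, showing how known-zero high bits survive a zero-extension and an addition; the values are illustrative:

#include "llvm/Support/KnownBits.h"

static void knownBitsExample() {
  llvm::KnownBits K(16);
  K.Zero.setHighBits(8);                                // value known to be <= 0xFF
  llvm::KnownBits K32 = K.zext(32);                     // high 24 bits known zero
  llvm::KnownBits Sum = llvm::KnownBits::add(K32, K32); // sum < 512
  (void)Sum.countMinLeadingZeros();                     // at least 23
}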
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const