40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
52#define DEBUG_TYPE "si-lower"
58 cl::desc(
"Do not align and prefetch loops"),
62 "amdgpu-use-divergent-register-indexing",
cl::Hidden,
63 cl::desc(
"Use indirect register addressing for divergent indexes"),
70 cl::desc(
"Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
85 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
86 for (
unsigned Reg = 0;
Reg < NumSGPRs; ++
Reg) {
88 return AMDGPU::SGPR0 +
Reg;
160 if (Subtarget->has16BitInsts()) {
161 if (Subtarget->useRealTrue16Insts()) {
203 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
204 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
205 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
206 MVT::i1, MVT::v32i32},
210 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
211 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
212 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
213 MVT::i1, MVT::v32i32},
220 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
221 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
222 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
223 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
224 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
282 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1},
Expand);
289 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
290 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
291 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
294 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
295 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
296 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
300 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
301 MVT::v3i16, MVT::v4i16, MVT::Other},
306 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64},
Expand);
322 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
323 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
324 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
325 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
326 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
327 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
328 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
329 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
361 for (
MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
375 for (
MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
389 for (
MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
403 for (
MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
417 for (
MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
432 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
433 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
436 if (Subtarget->hasPkMovB32()) {
457 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
458 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
463 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32},
Custom);
467 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
468 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
469 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
470 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
494 if (Subtarget->hasSMemRealTime() ||
499 if (Subtarget->has16BitInsts()) {
506 if (Subtarget->hasMadMacF32Insts())
509 if (!Subtarget->hasBFI())
513 if (!Subtarget->hasBCNT(32))
516 if (!Subtarget->hasBCNT(64))
519 if (Subtarget->hasFFBH())
522 if (Subtarget->hasFFBL())
533 if (Subtarget->hasBFE())
537 if (Subtarget->hasIntClamp())
540 if (Subtarget->hasAddNoCarry())
545 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
546 {MVT::f32, MVT::f64},
Custom);
552 {MVT::f32, MVT::f64},
Legal);
554 if (Subtarget->haveRoundOpsF64())
577 if (Subtarget->has16BitInsts()) {
626 ISD::FSIN, ISD::FROUND},
630 if (Subtarget->hasBF16TransInsts())
649 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
650 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
651 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
784 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
785 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
786 MVT::v32f16, MVT::v32bf16},
790 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
796 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
800 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
804 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
805 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
813 if (Subtarget->hasVOP3PInsts()) {
824 {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
Custom);
827 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
828 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
829 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
832 for (
MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
840 for (
MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
846 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
847 {MVT::v2f16, MVT::v4f16},
Custom);
853 if (Subtarget->hasPackedFP32Ops()) {
857 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
864 if (Subtarget->has16BitInsts()) {
877 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
878 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
879 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
880 MVT::v32f16, MVT::v32bf16},
885 if (Subtarget->hasVectorMulU64())
887 else if (Subtarget->hasScalarSMulU64())
890 if (Subtarget->hasMad64_32())
893 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
896 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
898 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16},
Legal);
901 if (Subtarget->hasMinimum3Maximum3F32())
904 if (Subtarget->hasMinimum3Maximum3PKF16()) {
908 if (!Subtarget->hasMinimum3Maximum3F16())
913 if (Subtarget->hasVOP3PInsts()) {
916 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
920 if (Subtarget->hasIntMinMax64())
925 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
926 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
931 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
932 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
933 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
934 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
938 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
939 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
940 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
941 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
956 if (Subtarget->hasBF16ConversionInsts()) {
961 if (Subtarget->hasBF16PackedInsts()) {
967 if (Subtarget->hasBF16TransInsts()) {
971 if (Subtarget->hasCvtPkF16F32Inst()) {
973 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
1023 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1032 ISD::ATOMIC_CMP_SWAP,
1033 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1035 ISD::ATOMIC_LOAD_ADD,
1036 ISD::ATOMIC_LOAD_SUB,
1037 ISD::ATOMIC_LOAD_AND,
1038 ISD::ATOMIC_LOAD_OR,
1039 ISD::ATOMIC_LOAD_XOR,
1040 ISD::ATOMIC_LOAD_NAND,
1041 ISD::ATOMIC_LOAD_MIN,
1042 ISD::ATOMIC_LOAD_MAX,
1043 ISD::ATOMIC_LOAD_UMIN,
1044 ISD::ATOMIC_LOAD_UMAX,
1045 ISD::ATOMIC_LOAD_FADD,
1046 ISD::ATOMIC_LOAD_FMIN,
1047 ISD::ATOMIC_LOAD_FMAX,
1048 ISD::ATOMIC_LOAD_UINC_WRAP,
1049 ISD::ATOMIC_LOAD_UDEC_WRAP,
1062 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1075 EVT DestVT,
EVT SrcVT)
const {
1077 ((((Opcode ==
ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1078 (Opcode ==
ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1080 (Opcode ==
ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1087 LLT DestTy,
LLT SrcTy)
const {
1088 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1089 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1091 SrcTy.getScalarSizeInBits() == 16 &&
1112 if (Subtarget->has16BitInsts()) {
1115 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1117 return VT.
isInteger() ? MVT::i32 : MVT::f32;
1121 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1143 if (
Size == 16 && Subtarget->has16BitInsts())
1144 return (NumElts + 1) / 2;
1150 return NumElts * ((
Size + 31) / 32);
1159 unsigned &NumIntermediates,
MVT &RegisterVT)
const {
1167 if (
Size == 16 && Subtarget->has16BitInsts()) {
1168 if (ScalarVT == MVT::bf16) {
1169 RegisterVT = MVT::i32;
1170 IntermediateVT = MVT::v2bf16;
1172 RegisterVT = VT.
isInteger() ? MVT::v2i16 : MVT::v2f16;
1173 IntermediateVT = RegisterVT;
1175 NumIntermediates = (NumElts + 1) / 2;
1176 return NumIntermediates;
1181 IntermediateVT = RegisterVT;
1182 NumIntermediates = NumElts;
1183 return NumIntermediates;
1188 RegisterVT = MVT::i16;
1189 IntermediateVT = ScalarVT;
1190 NumIntermediates = NumElts;
1191 return NumIntermediates;
1195 RegisterVT = MVT::i32;
1196 IntermediateVT = ScalarVT;
1197 NumIntermediates = NumElts;
1198 return NumIntermediates;
1202 RegisterVT = MVT::i32;
1203 IntermediateVT = RegisterVT;
1204 NumIntermediates = NumElts * ((
Size + 31) / 32);
1205 return NumIntermediates;
1210 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1215 unsigned MaxNumLanes) {
1216 assert(MaxNumLanes != 0);
1220 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1231 unsigned MaxNumLanes) {
1237 assert(ST->getNumContainedTypes() == 2 &&
1238 ST->getContainedType(1)->isIntegerTy(32));
1252 return MVT::amdgpuBufferFatPointer;
1254 DL.getPointerSizeInBits(AS) == 192)
1255 return MVT::amdgpuBufferStridedPointer;
1264 DL.getPointerSizeInBits(AS) == 160) ||
1266 DL.getPointerSizeInBits(AS) == 192))
1273 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1274 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1275 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1277 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1278 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1279 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1280 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1281 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1283 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1284 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1285 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1286 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1287 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1289 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1290 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1291 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1292 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1293 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1332 unsigned IntrID)
const {
1334 if (CI.
hasMetadata(LLVMContext::MD_invariant_load))
1352 if (RsrcIntr->IsImage) {
1367 Info.ptrVal = RsrcArg;
1370 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1379 if (RsrcIntr->IsImage) {
1380 unsigned MaxNumLanes = 4;
1395 std::numeric_limits<unsigned>::max());
1405 if (RsrcIntr->IsImage) {
1426 if ((RsrcIntr->IsImage && BaseOpcode->
NoReturn) || IsSPrefetch) {
1428 Info.memVT = MVT::i32;
1435 case Intrinsic::amdgcn_raw_buffer_load_lds:
1436 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1437 case Intrinsic::amdgcn_struct_buffer_load_lds:
1438 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1444 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1445 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1446 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1447 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1450 std::numeric_limits<unsigned>::max());
1460 case Intrinsic::amdgcn_ds_ordered_add:
1461 case Intrinsic::amdgcn_ds_ordered_swap: {
1474 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1475 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1478 Info.ptrVal =
nullptr;
1483 case Intrinsic::amdgcn_ds_append:
1484 case Intrinsic::amdgcn_ds_consume: {
1497 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1498 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1499 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1504 Info.memVT = MVT::i64;
1510 case Intrinsic::amdgcn_global_atomic_csub: {
1519 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1520 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1521 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1524 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1527 ->getElementType(0));
1535 case Intrinsic::amdgcn_global_atomic_fmin_num:
1536 case Intrinsic::amdgcn_global_atomic_fmax_num:
1537 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1538 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1539 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1540 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1550 case Intrinsic::amdgcn_flat_load_monitor_b32:
1551 case Intrinsic::amdgcn_flat_load_monitor_b64:
1552 case Intrinsic::amdgcn_flat_load_monitor_b128:
1553 case Intrinsic::amdgcn_global_load_monitor_b32:
1554 case Intrinsic::amdgcn_global_load_monitor_b64:
1555 case Intrinsic::amdgcn_global_load_monitor_b128:
1556 case Intrinsic::amdgcn_cluster_load_b32:
1557 case Intrinsic::amdgcn_cluster_load_b64:
1558 case Intrinsic::amdgcn_cluster_load_b128:
1559 case Intrinsic::amdgcn_ds_load_tr6_b96:
1560 case Intrinsic::amdgcn_ds_load_tr4_b64:
1561 case Intrinsic::amdgcn_ds_load_tr8_b64:
1562 case Intrinsic::amdgcn_ds_load_tr16_b128:
1563 case Intrinsic::amdgcn_global_load_tr6_b96:
1564 case Intrinsic::amdgcn_global_load_tr4_b64:
1565 case Intrinsic::amdgcn_global_load_tr_b64:
1566 case Intrinsic::amdgcn_global_load_tr_b128:
1567 case Intrinsic::amdgcn_ds_read_tr4_b64:
1568 case Intrinsic::amdgcn_ds_read_tr6_b96:
1569 case Intrinsic::amdgcn_ds_read_tr8_b64:
1570 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1578 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1579 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1580 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1588 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1589 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1590 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1598 case Intrinsic::amdgcn_ds_gws_init:
1599 case Intrinsic::amdgcn_ds_gws_barrier:
1600 case Intrinsic::amdgcn_ds_gws_sema_v:
1601 case Intrinsic::amdgcn_ds_gws_sema_br:
1602 case Intrinsic::amdgcn_ds_gws_sema_p:
1603 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1613 Info.memVT = MVT::i32;
1615 Info.align =
Align(4);
1617 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1623 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1624 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1625 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1626 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1627 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1628 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1629 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1630 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1637 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1638 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1639 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1640 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1647 case Intrinsic::amdgcn_load_to_lds:
1648 case Intrinsic::amdgcn_global_load_lds: {
1656 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1657 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1658 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1659 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1669 Info.memVT = MVT::i32;
1671 Info.align =
Align(4);
1676 case Intrinsic::amdgcn_s_prefetch_data:
1677 case Intrinsic::amdgcn_flat_prefetch:
1678 case Intrinsic::amdgcn_global_prefetch: {
1693 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1696 unsigned SrcAS =
I.getOperand(0)->getType()->getPointerAddressSpace();
1697 unsigned DstAS =
I.getType()->getPointerAddressSpace();
1709 Type *&AccessTy)
const {
1711 switch (
II->getIntrinsicID()) {
1712 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1713 case Intrinsic::amdgcn_cluster_load_b128:
1714 case Intrinsic::amdgcn_cluster_load_b64:
1715 case Intrinsic::amdgcn_cluster_load_b32:
1716 case Intrinsic::amdgcn_ds_append:
1717 case Intrinsic::amdgcn_ds_consume:
1718 case Intrinsic::amdgcn_ds_load_tr8_b64:
1719 case Intrinsic::amdgcn_ds_load_tr16_b128:
1720 case Intrinsic::amdgcn_ds_load_tr4_b64:
1721 case Intrinsic::amdgcn_ds_load_tr6_b96:
1722 case Intrinsic::amdgcn_ds_read_tr4_b64:
1723 case Intrinsic::amdgcn_ds_read_tr6_b96:
1724 case Intrinsic::amdgcn_ds_read_tr8_b64:
1725 case Intrinsic::amdgcn_ds_read_tr16_b64:
1726 case Intrinsic::amdgcn_ds_ordered_add:
1727 case Intrinsic::amdgcn_ds_ordered_swap:
1728 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1729 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1730 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1731 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1732 case Intrinsic::amdgcn_flat_load_monitor_b128:
1733 case Intrinsic::amdgcn_flat_load_monitor_b32:
1734 case Intrinsic::amdgcn_flat_load_monitor_b64:
1735 case Intrinsic::amdgcn_global_atomic_csub:
1736 case Intrinsic::amdgcn_global_atomic_fmax_num:
1737 case Intrinsic::amdgcn_global_atomic_fmin_num:
1738 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1739 case Intrinsic::amdgcn_global_load_monitor_b128:
1740 case Intrinsic::amdgcn_global_load_monitor_b32:
1741 case Intrinsic::amdgcn_global_load_monitor_b64:
1742 case Intrinsic::amdgcn_global_load_tr_b64:
1743 case Intrinsic::amdgcn_global_load_tr_b128:
1744 case Intrinsic::amdgcn_global_load_tr4_b64:
1745 case Intrinsic::amdgcn_global_load_tr6_b96:
1746 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1747 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1748 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1749 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1750 Ptr =
II->getArgOperand(0);
1752 case Intrinsic::amdgcn_load_to_lds:
1753 case Intrinsic::amdgcn_global_load_lds:
1754 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1755 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1756 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1757 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1758 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1759 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1760 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1761 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1762 Ptr =
II->getArgOperand(1);
1767 AccessTy =
II->getType();
1773 unsigned AddrSpace)
const {
1774 if (!Subtarget->hasFlatInstOffsets()) {
1785 return AM.
Scale == 0 &&
1786 (AM.
BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1787 AM.
BaseOffs, AddrSpace, FlatVariant));
1791 if (Subtarget->hasFlatGlobalInsts())
1794 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1807 return isLegalMUBUFAddressingMode(AM);
1810bool SITargetLowering::isLegalMUBUFAddressingMode(
const AddrMode &AM)
const {
1821 if (!
TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1833 if (AM.HasBaseReg) {
1865 return isLegalMUBUFAddressingMode(AM);
1867 if (!Subtarget->hasScalarSubwordLoads()) {
1872 if (Ty->isSized() &&
DL.getTypeStoreSize(Ty) < 4)
1920 return Subtarget->enableFlatScratch()
1922 : isLegalMUBUFAddressingMode(AM);
1969 unsigned Size,
unsigned AddrSpace,
Align Alignment,
1978 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment <
Align(4))
1981 Align RequiredAlignment(
1983 if (Subtarget->hasLDSMisalignedBug() &&
Size > 32 &&
1984 Alignment < RequiredAlignment)
1999 if (!Subtarget->hasUsableDSOffset() && Alignment <
Align(8))
2005 RequiredAlignment =
Align(4);
2007 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2023 *IsFast = (Alignment >= RequiredAlignment) ? 64
2024 : (Alignment <
Align(4)) ? 32
2031 if (!Subtarget->hasDS96AndDS128())
2037 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2046 *IsFast = (Alignment >= RequiredAlignment) ? 96
2047 : (Alignment <
Align(4)) ? 32
2054 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2060 RequiredAlignment =
Align(8);
2062 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2071 *IsFast = (Alignment >= RequiredAlignment) ? 128
2072 : (Alignment <
Align(4)) ? 32
2089 *IsFast = (Alignment >= RequiredAlignment) ?
Size : 0;
2091 return Alignment >= RequiredAlignment ||
2092 Subtarget->hasUnalignedDSAccessEnabled();
2100 bool AlignedBy4 = Alignment >=
Align(4);
2102 *IsFast = AlignedBy4;
2104 return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
2113 return Alignment >=
Align(4) ||
2114 Subtarget->hasUnalignedBufferAccessEnabled();
2126 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2141 return Size >= 32 && Alignment >=
Align(4);
2146 unsigned *IsFast)
const {
2148 Alignment, Flags, IsFast);
2153 const AttributeList &FuncAttributes)
const {
2159 if (
Op.size() >= 16 &&
2163 if (
Op.size() >= 8 &&
Op.isDstAligned(
Align(4)))
2181 unsigned DestAS)
const {
2184 Subtarget->hasGloballyAddressableScratch()) {
2214 unsigned Index)
const {
2230 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2257 auto [InputPtrReg, RC, ArgTy] =
2267 Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2273 const SDLoc &SL)
const {
2280 const SDLoc &SL)
const {
2283 std::optional<uint32_t> KnownSize =
2285 if (KnownSize.has_value())
2311 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2320SDValue SITargetLowering::lowerKernargMemParameter(
2332 int64_t OffsetDiff =
Offset - AlignDownOffset;
2338 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2347 ArgVal = DAG.
getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2348 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal,
Signed, Arg);
2358 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load,
Signed, Arg);
2367 const SDLoc &SL)
const {
2377 return DAG.
getNode(ISD::BITCAST, SL, ValVT, Val);
2436 ExtType, SL, VA.
getLocVT(), Chain, FIN,
2439 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2440 if (ConvertedVal == ArgValue)
2441 return ConvertedVal;
2446SDValue SITargetLowering::lowerWorkGroupId(
2451 if (!Subtarget->hasClusters())
2452 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2460 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2461 SDLoc SL(ClusterIdXYZ);
2462 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2465 SDValue ClusterWorkGroupIdXYZ =
2466 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2476 return ClusterIdXYZ;
2478 using namespace AMDGPU::Hwreg;
2482 DAG.
getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2493SDValue SITargetLowering::getPreloadedValue(
2496 const ArgDescriptor *
Reg =
nullptr;
2497 const TargetRegisterClass *RC;
2501 const ArgDescriptor WorkGroupIDX =
2509 const ArgDescriptor WorkGroupIDZ =
2511 const ArgDescriptor ClusterWorkGroupIDX =
2513 const ArgDescriptor ClusterWorkGroupIDY =
2515 const ArgDescriptor ClusterWorkGroupIDZ =
2517 const ArgDescriptor ClusterWorkGroupMaxIDX =
2519 const ArgDescriptor ClusterWorkGroupMaxIDY =
2521 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2523 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2526 auto LoadConstant = [&](
unsigned N) {
2530 if (Subtarget->hasArchitectedSGPRs() &&
2537 Reg = &WorkGroupIDX;
2538 RC = &AMDGPU::SReg_32RegClass;
2542 Reg = &WorkGroupIDY;
2543 RC = &AMDGPU::SReg_32RegClass;
2547 Reg = &WorkGroupIDZ;
2548 RC = &AMDGPU::SReg_32RegClass;
2552 if (HasFixedDims && ClusterDims.
getDims()[0] == 1)
2553 return LoadConstant(0);
2554 Reg = &ClusterWorkGroupIDX;
2555 RC = &AMDGPU::SReg_32RegClass;
2559 if (HasFixedDims && ClusterDims.
getDims()[1] == 1)
2560 return LoadConstant(0);
2561 Reg = &ClusterWorkGroupIDY;
2562 RC = &AMDGPU::SReg_32RegClass;
2566 if (HasFixedDims && ClusterDims.
getDims()[2] == 1)
2567 return LoadConstant(0);
2568 Reg = &ClusterWorkGroupIDZ;
2569 RC = &AMDGPU::SReg_32RegClass;
2574 return LoadConstant(ClusterDims.
getDims()[0] - 1);
2575 Reg = &ClusterWorkGroupMaxIDX;
2576 RC = &AMDGPU::SReg_32RegClass;
2581 return LoadConstant(ClusterDims.
getDims()[1] - 1);
2582 Reg = &ClusterWorkGroupMaxIDY;
2583 RC = &AMDGPU::SReg_32RegClass;
2588 return LoadConstant(ClusterDims.
getDims()[2] - 1);
2589 Reg = &ClusterWorkGroupMaxIDZ;
2590 RC = &AMDGPU::SReg_32RegClass;
2594 Reg = &ClusterWorkGroupMaxFlatID;
2595 RC = &AMDGPU::SReg_32RegClass;
2626 for (
unsigned I = 0,
E = Ins.size(), PSInputNum = 0;
I !=
E; ++
I) {
2630 "vector type argument should have been split");
2635 bool SkipArg = !Arg->
Used && !
Info->isPSInputAllocated(PSInputNum);
2643 "unexpected vector split in ps argument type");
2657 Info->markPSInputAllocated(PSInputNum);
2659 Info->markPSInputEnabled(PSInputNum);
2675 if (Info.hasWorkItemIDX()) {
2681 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2685 if (Info.hasWorkItemIDY()) {
2686 assert(Info.hasWorkItemIDX());
2687 if (Subtarget->hasPackedTID()) {
2688 Info.setWorkItemIDY(
2691 unsigned Reg = AMDGPU::VGPR1;
2699 if (Info.hasWorkItemIDZ()) {
2700 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2701 if (Subtarget->hasPackedTID()) {
2702 Info.setWorkItemIDZ(
2705 unsigned Reg = AMDGPU::VGPR2;
2725 if (RegIdx == ArgVGPRs.
size()) {
2732 unsigned Reg = ArgVGPRs[RegIdx];
2744 unsigned NumArgRegs) {
2747 if (RegIdx == ArgSGPRs.
size())
2750 unsigned Reg = ArgSGPRs[RegIdx];
2792 const unsigned Mask = 0x3ff;
2795 if (Info.hasWorkItemIDX()) {
2797 Info.setWorkItemIDX(Arg);
2800 if (Info.hasWorkItemIDY()) {
2802 Info.setWorkItemIDY(Arg);
2805 if (Info.hasWorkItemIDZ())
2817 const unsigned Mask = 0x3ff;
2826 auto &
ArgInfo = Info.getArgInfo();
2838 if (Info.hasImplicitArgPtr())
2846 if (Info.hasWorkGroupIDX())
2849 if (Info.hasWorkGroupIDY())
2852 if (Info.hasWorkGroupIDZ())
2855 if (Info.hasLDSKernelId())
2866 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(
TRI);
2867 MF.
addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2873 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(
TRI);
2874 MF.
addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2879 Register DispatchPtrReg = Info.addDispatchPtr(
TRI);
2880 MF.
addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2886 MF.
addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2892 Register InputPtrReg = Info.addKernargSegmentPtr(
TRI);
2901 MF.
addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2906 Register FlatScratchInitReg = Info.addFlatScratchInit(
TRI);
2907 MF.
addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2912 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(
TRI);
2913 MF.
addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2928 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2930 bool InPreloadSequence =
true;
2932 bool AlignedForImplictArgs =
false;
2933 unsigned ImplicitArgOffset = 0;
2934 for (
auto &Arg :
F.args()) {
2935 if (!InPreloadSequence || !Arg.hasInRegAttr())
2938 unsigned ArgIdx = Arg.getArgNo();
2941 if (InIdx < Ins.size() &&
2942 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2945 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2946 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2948 assert(ArgLocs[ArgIdx].isMemLoc());
2949 auto &ArgLoc = ArgLocs[InIdx];
2951 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2953 unsigned NumAllocSGPRs =
2954 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2957 if (Arg.hasAttribute(
"amdgpu-hidden-argument")) {
2958 if (!AlignedForImplictArgs) {
2960 alignTo(LastExplicitArgOffset,
2961 Subtarget->getAlignmentForImplicitArgPtr()) -
2962 LastExplicitArgOffset;
2963 AlignedForImplictArgs =
true;
2965 ArgOffset += ImplicitArgOffset;
2969 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2970 assert(InIdx >= 1 &&
"No previous SGPR");
2971 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2972 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2976 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2977 unsigned PaddingSGPRs =
alignTo(Padding, 4) / 4;
2980 InPreloadSequence =
false;
2986 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2988 Info.addPreloadedKernArg(
TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2990 if (PreloadRegs->
size() > 1)
2991 RC = &AMDGPU::SGPR_32RegClass;
2992 for (
auto &Reg : *PreloadRegs) {
2998 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3007 if (Info.hasLDSKernelId()) {
3008 Register Reg = Info.addLDSKernelId();
3009 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3018 bool IsShader)
const {
3019 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3020 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
3026 assert(!HasArchitectedSGPRs &&
"Unhandled feature for the subtarget");
3028 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3032 unsigned NumRequiredSystemSGPRs =
3033 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3034 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3035 for (
unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3036 Register Reg = Info.addReservedUserSGPR();
3037 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3042 if (!HasArchitectedSGPRs) {
3043 if (Info.hasWorkGroupIDX()) {
3044 Register Reg = Info.addWorkGroupIDX();
3045 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3049 if (Info.hasWorkGroupIDY()) {
3050 Register Reg = Info.addWorkGroupIDY();
3051 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3055 if (Info.hasWorkGroupIDZ()) {
3056 Register Reg = Info.addWorkGroupIDZ();
3057 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3062 if (Info.hasWorkGroupInfo()) {
3063 Register Reg = Info.addWorkGroupInfo();
3064 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3068 if (Info.hasPrivateSegmentWaveByteOffset()) {
3070 unsigned PrivateSegmentWaveByteOffsetReg;
3073 PrivateSegmentWaveByteOffsetReg =
3074 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3078 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3080 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3083 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3085 MF.
addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3086 CCInfo.
AllocateReg(PrivateSegmentWaveByteOffsetReg);
3089 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
3090 Info.getNumPreloadedSGPRs() >= 16);
3105 if (HasStackObjects)
3106 Info.setHasNonSpillStackObjects(
true);
3111 HasStackObjects =
true;
3115 bool RequiresStackAccess = HasStackObjects || MFI.
hasCalls();
3117 if (!ST.enableFlatScratch()) {
3118 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.
getFunction())) {
3125 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3127 unsigned ReservedBufferReg =
TRI.reservedPrivateSegmentBufferReg(MF);
3137 Info.setScratchRSrcReg(ReservedBufferReg);
3156 if (!
MRI.isLiveIn(AMDGPU::SGPR32)) {
3157 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3164 for (
unsigned Reg : AMDGPU::SGPR_32RegClass) {
3165 if (!
MRI.isLiveIn(
Reg)) {
3166 Info.setStackPtrOffsetReg(
Reg);
3171 if (
Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3178 if (ST.getFrameLowering()->hasFP(MF)) {
3179 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3195 const MCPhysReg *IStart =
TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3204 if (AMDGPU::SReg_64RegClass.
contains(*
I))
3205 RC = &AMDGPU::SGPR_64RegClass;
3206 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
3207 RC = &AMDGPU::SGPR_32RegClass;
3213 Entry->addLiveIn(*
I);
3218 for (
auto *Exit : Exits)
3220 TII->get(TargetOpcode::COPY), *
I)
3235 bool IsError =
false;
3239 Fn,
"unsupported non-compute shaders with HSA",
DL.getDebugLoc()));
3257 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3258 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3260 if (!Subtarget->enableFlatScratch())
3265 !Subtarget->hasArchitectedSGPRs())
3266 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3267 !Info->hasWorkGroupIDZ());
3270 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3288 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3289 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3292 Info->markPSInputAllocated(0);
3293 Info->markPSInputEnabled(0);
3295 if (Subtarget->isAmdPalOS()) {
3304 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3305 if ((PsInputBits & 0x7F) == 0 ||
3306 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3309 }
else if (IsKernel) {
3310 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3312 Splits.
append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3322 if (IsKernel && Subtarget->hasKernargPreload())
3326 }
else if (!IsGraphics) {
3331 if (!Subtarget->enableFlatScratch())
3343 Info->setNumWaveDispatchSGPRs(
3345 Info->setNumWaveDispatchVGPRs(
3347 }
else if (Info->getNumKernargPreloadedSGPRs()) {
3348 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3353 if (IsWholeWaveFunc) {
3355 {MVT::i1, MVT::Other}, Chain);
3367 for (
unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3378 if (IsEntryFunc && VA.
isMemLoc()) {
3401 if (Arg.
isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3405 int64_t OffsetDiff =
Offset - AlignDownOffset;
3412 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3422 ArgVal = DAG.
getNode(ISD::BITCAST,
DL, MemVT, ArgVal);
3423 NewArg = convertArgType(DAG, VT, MemVT,
DL, ArgVal,
3424 Ins[i].Flags.isSExt(), &Ins[i]);
3432 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3435 if (PreloadRegs.
size() == 1) {
3436 Register VReg =
MRI.getLiveInVirtReg(PreloadRegs[0]);
3441 TRI->getRegSizeInBits(*RC)));
3449 for (
auto Reg : PreloadRegs) {
3456 PreloadRegs.size()),
3473 NewArg = convertArgType(DAG, VT, MemVT,
DL, NewArg,
3474 Ins[i].Flags.isSExt(), &Ins[i]);
3486 "hidden argument in kernel signature was not preloaded",
3492 lowerKernargMemParameter(DAG, VT, MemVT,
DL, Chain,
Offset,
3493 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3513 if (!IsEntryFunc && VA.
isMemLoc()) {
3514 SDValue Val = lowerStackParameter(DAG, VA,
DL, Chain, Arg);
3525 if (AMDGPU::VGPR_32RegClass.
contains(Reg))
3526 RC = &AMDGPU::VGPR_32RegClass;
3527 else if (AMDGPU::SGPR_32RegClass.
contains(Reg))
3528 RC = &AMDGPU::SGPR_32RegClass;
3548 Val = convertABITypeToValueType(DAG, Val, VA,
DL);
3564 Info->setBytesInStackArgArea(StackArgSize);
3566 return Chains.
empty() ? Chain
3575 const Type *RetTy)
const {
3583 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3588 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3589 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3590 for (
unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3591 if (CCInfo.
isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3614 Info->setIfReturnsVoid(Outs.
empty());
3615 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3634 for (
unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.
size();
I != E;
3635 ++
I, ++RealRVLocIdx) {
3639 SDValue Arg = OutVals[RealRVLocIdx];
3662 ReadFirstLane, Arg);
3669 if (!Info->isEntryFunction()) {
3675 if (AMDGPU::SReg_64RegClass.
contains(*
I))
3677 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
3778 auto &ArgUsageInfo =
3780 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3806 const auto [OutgoingArg, ArgRC, ArgTy] =
3811 const auto [IncomingArg, IncomingArgRC, Ty] =
3813 assert(IncomingArgRC == ArgRC);
3816 EVT ArgVT =
TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3824 InputReg = getImplicitArgPtr(DAG,
DL);
3826 std::optional<uint32_t> Id =
3828 if (Id.has_value()) {
3839 if (OutgoingArg->isRegister()) {
3840 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3841 if (!CCInfo.
AllocateReg(OutgoingArg->getRegister()))
3844 unsigned SpecialArgOffset =
3855 auto [OutgoingArg, ArgRC, Ty] =
3858 std::tie(OutgoingArg, ArgRC, Ty) =
3861 std::tie(OutgoingArg, ArgRC, Ty) =
3876 const bool NeedWorkItemIDX = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-x");
3877 const bool NeedWorkItemIDY = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-y");
3878 const bool NeedWorkItemIDZ = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-z");
3883 if (Subtarget->getMaxWorkitemID(
F, 0) != 0) {
3891 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(
F, 1) != 0) {
3901 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(
F, 2) != 0) {
3910 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3911 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3922 : IncomingArgY ? *IncomingArgY
3929 if (OutgoingArg->isRegister()) {
3931 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3957 if (Callee->isDivergent())
3964 const uint32_t *CallerPreserved =
TRI->getCallPreservedMask(MF, CallerCC);
3968 if (!CallerPreserved)
3971 bool CCMatch = CallerCC == CalleeCC;
3984 if (Arg.hasByValAttr())
3998 const uint32_t *CalleePreserved =
TRI->getCallPreservedMask(MF, CalleeCC);
3999 if (!
TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4008 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4021 for (
const auto &[CCVA, ArgVal] :
zip_equal(ArgLocs, OutVals)) {
4023 if (!CCVA.isRegLoc())
4028 if (ArgVal->
isDivergent() &&
TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4030 dbgs() <<
"Cannot tail call due to divergent outgoing argument in "
4054enum ChainCallArgIdx {
4076 bool UsesDynamicVGPRs =
false;
4077 if (IsChainCallConv) {
4082 auto RequestedExecIt =
4084 return Arg.OrigArgIndex == 2;
4086 assert(RequestedExecIt != CLI.
Outs.end() &&
"No node for EXEC");
4088 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.
Outs.begin();
4091 CLI.
Outs.erase(RequestedExecIt, CLI.
Outs.end());
4094 "Haven't popped all the special args");
4097 CLI.
Args[ChainCallArgIdx::Exec];
4098 if (!RequestedExecArg.
Ty->
isIntegerTy(Subtarget->getWavefrontSize()))
4106 ArgNode->getAPIntValue(),
DL, ArgNode->getValueType(0)));
4108 ChainCallSpecialArgs.
push_back(Arg.Node);
4111 PushNodeOrTargetConstant(RequestedExecArg);
4117 if (FlagsValue.
isZero()) {
4118 if (CLI.
Args.size() > ChainCallArgIdx::Flags + 1)
4120 "no additional args allowed if flags == 0");
4122 if (CLI.
Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4126 if (!Subtarget->isWave32()) {
4128 CLI, InVals,
"dynamic VGPR mode is only supported for wave32");
4131 UsesDynamicVGPRs =
true;
4132 std::for_each(CLI.
Args.begin() + ChainCallArgIdx::NumVGPRs,
4133 CLI.
Args.end(), PushNodeOrTargetConstant);
4142 bool IsSibCall =
false;
4156 "unsupported call to variadic function ");
4164 "unsupported required tail call to function ");
4169 Outs, OutVals, Ins, DAG);
4173 "site marked musttail or on llvm.amdgcn.cs.chain");
4180 if (!TailCallOpt && IsTailCall)
4220 auto *
TRI = Subtarget->getRegisterInfo();
4227 if (!IsSibCall || IsChainCallConv) {
4228 if (!Subtarget->enableFlatScratch()) {
4234 RegsToPass.emplace_back(IsChainCallConv
4235 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4236 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4243 const unsigned NumSpecialInputs = RegsToPass.size();
4245 MVT PtrVT = MVT::i32;
4248 for (
unsigned i = 0, e = ArgLocs.
size(); i != e; ++i) {
4276 RegsToPass.push_back(std::pair(VA.
getLocReg(), Arg));
4284 int32_t
Offset = LocMemOffset;
4291 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4297 ? Flags.getNonZeroByValAlign()
4324 if (Outs[i].Flags.isByVal()) {
4326 DAG.
getConstant(Outs[i].Flags.getByValSize(),
DL, MVT::i32);
4329 Outs[i].Flags.getNonZeroByValAlign(),
4331 nullptr, std::nullopt, DstInfo,
4337 DAG.
getStore(Chain,
DL, Arg, DstAddr, DstInfo, Alignment);
4343 if (!MemOpChains.
empty())
4351 TokenGlue = DAG.
getNode(ISD::CONVERGENCECTRL_GLUE,
DL, MVT::Glue,
4359 unsigned ArgIdx = 0;
4360 for (
auto [Reg, Val] : RegsToPass) {
4361 if (ArgIdx++ >= NumSpecialInputs &&
4362 (IsChainCallConv || !Val->
isDivergent()) &&
TRI->isSGPRPhysReg(Reg)) {
4388 if (IsTailCall && !IsSibCall) {
4393 std::vector<SDValue>
Ops({Chain});
4399 Ops.push_back(Callee);
4416 Ops.push_back(Callee);
4427 if (IsChainCallConv)
4432 for (
auto &[Reg, Val] : RegsToPass)
4436 const uint32_t *Mask =
TRI->getCallPreservedMask(MF, CallConv);
4437 assert(Mask &&
"Missing call preserved mask for calling convention");
4447 MVT::Glue, GlueOps),
4452 Ops.push_back(InGlue);
4472 if (Info->isWholeWaveFunction())
4480 Chain =
Call.getValue(0);
4481 InGlue =
Call.getValue(1);
4483 uint64_t CalleePopBytes = NumBytes;
4504 EVT VT =
Op.getValueType();
4518 "Stack grows upwards for AMDGPU");
4520 Chain = BaseAddr.getValue(1);
4522 if (Alignment > StackAlign) {
4524 << Subtarget->getWavefrontSizeLog2();
4525 uint64_t StackAlignMask = ScaledAlignment - 1;
4532 assert(
Size.getValueType() == MVT::i32 &&
"Size must be 32-bit");
4538 DAG.
getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4549 DAG.
getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4565 if (
Op.getValueType() != MVT::i32)
4584 assert(
Op.getValueType() == MVT::i32);
4593 Op.getOperand(0), IntrinID, GetRoundBothImm);
4627 SDValue RoundModeTimesNumBits =
4647 TableEntry, EnumOffset);
4663 static_cast<uint32_t>(ConstMode->getZExtValue()),
4675 if (UseReducedTable) {
4681 SDValue RoundModeTimesNumBits =
4701 SDValue RoundModeTimesNumBits =
4710 NewMode = TruncTable;
4719 ReadFirstLaneID, NewMode);
4732 IntrinID, RoundBothImm, NewMode);
4738 if (
Op->isDivergent() &&
4739 (!Subtarget->hasVmemPrefInsts() || !
Op.getConstantOperandVal(4)))
4749 if (Subtarget->hasSafeSmemPrefetch())
4757 if (!Subtarget->hasSafeSmemPrefetch() && !
Op.getConstantOperandVal(4))
4766 SDValue Src =
Op.getOperand(IsStrict ? 1 : 0);
4767 EVT SrcVT = Src.getValueType();
4776 EVT DstVT =
Op.getValueType();
4780 return DAG.
getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4785 if (
Op.getValueType() != MVT::i64)
4799 Op.getOperand(0), IntrinID, ModeHwRegImm);
4801 Op.getOperand(0), IntrinID, TrapHwRegImm);
4808 SDValue Result = DAG.
getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4815 if (
Op.getOperand(1).getValueType() != MVT::i64)
4827 ReadFirstLaneID, NewModeReg);
4829 ReadFirstLaneID, NewTrapReg);
4831 unsigned ModeHwReg =
4834 unsigned TrapHwReg =
4842 IntrinID, ModeHwRegImm, NewModeReg);
4845 IntrinID, TrapHwRegImm, NewTrapReg);
4854 .
Case(
"m0", AMDGPU::M0)
4855 .
Case(
"exec", AMDGPU::EXEC)
4856 .
Case(
"exec_lo", AMDGPU::EXEC_LO)
4857 .
Case(
"exec_hi", AMDGPU::EXEC_HI)
4858 .
Case(
"flat_scratch", AMDGPU::FLAT_SCR)
4859 .
Case(
"flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4860 .
Case(
"flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4865 if (!Subtarget->hasFlatScrRegister() &&
4866 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4868 "\" for subtarget."));
4873 case AMDGPU::EXEC_LO:
4874 case AMDGPU::EXEC_HI:
4875 case AMDGPU::FLAT_SCR_LO:
4876 case AMDGPU::FLAT_SCR_HI:
4881 case AMDGPU::FLAT_SCR:
4900 MI.setDesc(
TII->getKillTerminatorFromPseudo(
MI.getOpcode()));
4909static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4931 auto Next = std::next(
I);
4942 MBB.addSuccessor(LoopBB);
4944 return std::pair(LoopBB, RemainderBB);
4951 auto I =
MI.getIterator();
4952 auto E = std::next(
I);
4974 Src->setIsKill(
false);
4984 BuildMI(*LoopBB, LoopBB->begin(),
DL,
TII->get(AMDGPU::S_SETREG_IMM32_B32))
4990 Register Reg =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4993 BuildMI(*LoopBB,
I,
DL,
TII->get(AMDGPU::S_GETREG_B32), Reg)
5017 unsigned InitReg,
unsigned ResultReg,
unsigned PhiReg,
5018 unsigned InitSaveExecReg,
int Offset,
bool UseGPRIdxMode,
5027 Register PhiExec =
MRI.createVirtualRegister(BoolRC);
5028 Register NewExec =
MRI.createVirtualRegister(BoolRC);
5030 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5031 Register CondReg =
MRI.createVirtualRegister(BoolRC);
5039 BuildMI(LoopBB,
I,
DL,
TII->get(TargetOpcode::PHI), PhiExec)
5046 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5050 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5056 TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
5057 : AMDGPU::S_AND_SAVEEXEC_B64),
5061 MRI.setSimpleHint(NewExec, CondReg);
5063 if (UseGPRIdxMode) {
5065 SGPRIdxReg = CurrentIdxReg;
5067 SGPRIdxReg =
MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5068 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5078 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5085 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5088 TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
5089 : AMDGPU::S_XOR_B64_term),
5113 unsigned InitResultReg,
unsigned PhiReg,
int Offset,
5114 bool UseGPRIdxMode,
Register &SGPRIdxReg) {
5122 const auto *BoolXExecRC =
TRI->getWaveMaskRegClass();
5124 Register SaveExec =
MRI.createVirtualRegister(BoolXExecRC);
5125 Register TmpExec =
MRI.createVirtualRegister(BoolXExecRC);
5126 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5127 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5142 InitResultReg, DstReg, PhiReg, TmpExec,
5143 Offset, UseGPRIdxMode, SGPRIdxReg);
5149 LoopBB->removeSuccessor(RemainderBB);
5151 LoopBB->addSuccessor(LandingPad);
5162static std::pair<unsigned, int>
5166 int NumElts =
TRI.getRegSizeInBits(*SuperRC) / 32;
5171 return std::pair(AMDGPU::sub0,
Offset);
5211 Register Tmp =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5228 Register SrcReg =
TII->getNamedOperand(
MI, AMDGPU::OpName::src)->getReg();
5229 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
5238 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5241 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5245 if (UseGPRIdxMode) {
5252 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
5265 MI.eraseFromParent();
5274 Register PhiReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5275 Register InitReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5281 UseGPRIdxMode, SGPRIdxReg);
5285 if (UseGPRIdxMode) {
5287 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
5289 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
5294 BuildMI(*LoopBB, InsPt,
DL,
TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5299 MI.eraseFromParent();
5316 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
5326 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5328 if (Idx->
getReg() == AMDGPU::NoRegister) {
5339 MI.eraseFromParent();
5344 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5348 if (UseGPRIdxMode) {
5352 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
5361 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
5362 TRI.getRegSizeInBits(*VecRC), 32,
false);
5368 MI.eraseFromParent();
5378 Register PhiReg =
MRI.createVirtualRegister(VecRC);
5382 UseGPRIdxMode, SGPRIdxReg);
5385 if (UseGPRIdxMode) {
5387 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
5389 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
5395 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
5396 TRI.getRegSizeInBits(*VecRC), 32,
false);
5397 BuildMI(*LoopBB, InsPt,
DL, MovRelDesc, Dst)
5403 MI.eraseFromParent();
5419 bool IsAdd = (
MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5420 if (ST.hasScalarAddSub64()) {
5421 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5431 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5432 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5435 MI,
MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5437 MI,
MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5440 MI,
MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5442 MI,
MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5444 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5445 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5454 MI.eraseFromParent();
5460 case AMDGPU::S_MIN_U32:
5461 return std::numeric_limits<uint32_t>::max();
5462 case AMDGPU::S_MIN_I32:
5463 return std::numeric_limits<int32_t>::max();
5464 case AMDGPU::S_MAX_U32:
5465 return std::numeric_limits<uint32_t>::min();
5466 case AMDGPU::S_MAX_I32:
5467 return std::numeric_limits<int32_t>::min();
5468 case AMDGPU::S_ADD_I32:
5469 case AMDGPU::S_SUB_I32:
5470 case AMDGPU::S_OR_B32:
5471 case AMDGPU::S_XOR_B32:
5472 return std::numeric_limits<uint32_t>::min();
5473 case AMDGPU::S_AND_B32:
5474 return std::numeric_limits<uint32_t>::max();
5477 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5483 case AMDGPU::V_CMP_LT_U64_e64:
5484 return std::numeric_limits<uint64_t>::max();
5485 case AMDGPU::V_CMP_LT_I64_e64:
5486 return std::numeric_limits<int64_t>::max();
5487 case AMDGPU::V_CMP_GT_U64_e64:
5488 return std::numeric_limits<uint64_t>::min();
5489 case AMDGPU::V_CMP_GT_I64_e64:
5490 return std::numeric_limits<int64_t>::min();
5491 case AMDGPU::S_ADD_U64_PSEUDO:
5492 case AMDGPU::S_SUB_U64_PSEUDO:
5493 case AMDGPU::S_OR_B64:
5494 case AMDGPU::S_XOR_B64:
5495 return std::numeric_limits<uint64_t>::min();
5496 case AMDGPU::S_AND_B64:
5497 return std::numeric_limits<uint64_t>::max();
5500 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5505 return Opc == AMDGPU::S_MIN_U32 ||
Opc == AMDGPU::S_MIN_I32 ||
5506 Opc == AMDGPU::S_MAX_U32 ||
Opc == AMDGPU::S_MAX_I32 ||
5507 Opc == AMDGPU::S_ADD_I32 ||
Opc == AMDGPU::S_SUB_I32 ||
5508 Opc == AMDGPU::S_AND_B32 ||
Opc == AMDGPU::S_OR_B32 ||
5509 Opc == AMDGPU::S_XOR_B32;
5523 bool isSGPR =
TRI->isSGPRClass(
MRI.getRegClass(SrcReg));
5528 case AMDGPU::S_MIN_U32:
5529 case AMDGPU::S_MIN_I32:
5530 case AMDGPU::S_MAX_U32:
5531 case AMDGPU::S_MAX_I32:
5532 case AMDGPU::S_AND_B32:
5533 case AMDGPU::S_OR_B32: {
5539 case AMDGPU::V_CMP_LT_U64_e64:
5540 case AMDGPU::V_CMP_LT_I64_e64:
5541 case AMDGPU::V_CMP_GT_U64_e64:
5542 case AMDGPU::V_CMP_GT_I64_e64:
5543 case AMDGPU::S_AND_B64:
5544 case AMDGPU::S_OR_B64: {
5550 case AMDGPU::S_XOR_B32:
5551 case AMDGPU::S_XOR_B64:
5552 case AMDGPU::S_ADD_I32:
5553 case AMDGPU::S_ADD_U64_PSEUDO:
5554 case AMDGPU::S_SUB_I32:
5555 case AMDGPU::S_SUB_U64_PSEUDO: {
5558 Register ExecMask =
MRI.createVirtualRegister(WaveMaskRegClass);
5560 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5562 bool IsWave32 = ST.isWave32();
5563 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5564 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5565 unsigned BitCountOpc =
5566 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5570 auto NewAccumulator =
5575 case AMDGPU::S_XOR_B32:
5576 case AMDGPU::S_XOR_B64: {
5582 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5585 .
addReg(NewAccumulator->getOperand(0).getReg())
5588 if (
Opc == AMDGPU::S_XOR_B32) {
5594 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5596 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5600 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5603 MI,
MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5605 MI,
MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5615 BuildMI(BB,
MI,
DL,
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5623 case AMDGPU::S_SUB_I32: {
5624 Register NegatedVal =
MRI.createVirtualRegister(DstRegClass);
5632 .
addReg(NewAccumulator->getOperand(0).getReg());
5635 case AMDGPU::S_ADD_I32: {
5638 .
addReg(NewAccumulator->getOperand(0).getReg());
5641 case AMDGPU::S_ADD_U64_PSEUDO:
5642 case AMDGPU::S_SUB_U64_PSEUDO: {
5643 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5644 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5646 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5648 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5649 Register CarryReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5650 Register AddReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5652 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5654 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5658 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5661 MI,
MRI,
MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5663 MI,
MRI,
MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5665 if (
Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5668 .
addReg(NewAccumulator->getOperand(0).getReg())
5678 Register LowOpcode =
Opc == AMDGPU::S_SUB_U64_PSEUDO
5680 : NewAccumulator->getOperand(0).getReg();
5691 Register HiVal =
Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5697 if (
Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5703 BuildMI(BB,
MI,
DL,
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5735 Register LoopIterator =
MRI.createVirtualRegister(WaveMaskRegClass);
5736 Register IdentityValReg =
MRI.createVirtualRegister(DstRegClass);
5737 Register AccumulatorReg =
MRI.createVirtualRegister(DstRegClass);
5738 Register ActiveBitsReg =
MRI.createVirtualRegister(WaveMaskRegClass);
5739 Register NewActiveBitsReg =
MRI.createVirtualRegister(WaveMaskRegClass);
5740 Register FF1Reg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5741 Register LaneValueReg =
MRI.createVirtualRegister(DstRegClass);
5743 bool IsWave32 = ST.isWave32();
5744 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5745 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5752 BuildMI(BB,
I,
DL,
TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5756 BuildMI(BB,
I,
DL,
TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5765 I = ComputeLoop->begin();
5767 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::PHI), AccumulatorReg)
5771 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::PHI), ActiveBitsReg)
5775 I = ComputeLoop->end();
5778 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5782 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
5791 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5793 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5794 Register LaneValReg =
MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5797 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5799 MI,
MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5801 MI,
MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5803 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
5807 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
5811 auto LaneValue =
BuildMI(*ComputeLoop,
I,
DL,
5812 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5818 case AMDGPU::S_OR_B64:
5819 case AMDGPU::S_AND_B64:
5820 case AMDGPU::S_XOR_B64: {
5823 .
addReg(LaneValue->getOperand(0).getReg())
5827 case AMDGPU::V_CMP_GT_I64_e64:
5828 case AMDGPU::V_CMP_GT_U64_e64:
5829 case AMDGPU::V_CMP_LT_I64_e64:
5830 case AMDGPU::V_CMP_LT_U64_e64: {
5831 Register LaneMaskReg =
MRI.createVirtualRegister(WaveMaskRegClass);
5833 MRI.createVirtualRegister(WaveMaskRegClass);
5836 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5837 Register AccumulatorVReg =
MRI.createVirtualRegister(VregClass);
5840 VregClass, AMDGPU::sub0, VSubRegClass);
5843 VregClass, AMDGPU::sub1, VSubRegClass);
5844 BuildMI(*ComputeLoop,
I,
DL,
TII->get(TargetOpcode::REG_SEQUENCE),
5851 .
addReg(LaneValue->getOperand(0).getReg())
5852 .
addReg(AccumulatorVReg);
5854 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5855 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AndOpc), ComparisonResultReg)
5859 NewAccumulator =
BuildMI(*ComputeLoop,
I,
DL,
5860 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5861 .
addReg(LaneValue->getOperand(0).getReg())
5865 case AMDGPU::S_ADD_U64_PSEUDO:
5866 case AMDGPU::S_SUB_U64_PSEUDO: {
5869 .
addReg(LaneValue->getOperand(0).getReg());
5876 unsigned BITSETOpc =
5877 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5878 BuildMI(*ComputeLoop,
I,
DL,
TII->get(BITSETOpc), NewActiveBitsReg)
5884 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5887 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5889 .
addReg(NewActiveBitsReg)
5891 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::S_CBRANCH_SCC1))
5896 MI.eraseFromParent();
5908 switch (
MI.getOpcode()) {
5909 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5911 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5913 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5915 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5917 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5919 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5921 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5923 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
5925 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5927 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5929 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5931 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5933 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5935 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
5937 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5939 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
5941 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5943 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
5945 case AMDGPU::S_UADDO_PSEUDO:
5946 case AMDGPU::S_USUBO_PSEUDO: {
5953 unsigned Opc = (
MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5955 : AMDGPU::S_SUB_I32;
5966 MI.eraseFromParent();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {
  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {
    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
    if (ST.hasAddSubU64Insts()) {
              TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64 : AMDGPU::V_SUB_U64_e64),
      TII->legalizeOperands(*I);
      MI.eraseFromParent();
    if (IsAdd && ST.hasLshlAddU64Inst()) {
      TII->legalizeOperands(*Add);
      MI.eraseFromParent();
    const auto *CarryRC = TRI->getWaveMaskRegClass();
    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
                                 : &AMDGPU::VReg_64RegClass;
                                 : &AMDGPU::VReg_64RegClass;
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    TII->legalizeOperands(*LoHalf);
    TII->legalizeOperands(*HiHalf);
    MI.eraseFromParent();
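  // S_ADD_CO_PSEUDO/S_SUB_CO_PSEUDO below lower to S_ADDC_U32/S_SUBB_U32.
  // Operands that may live in VGPRs are first copied to SGPRs with
  // V_READFIRSTLANE_B32, and the incoming carry is turned into SCC via a
  // compare/select; on wave64 targets without 64-bit scalar compares the two
  // halves of the carry mask are OR'd into a 32-bit register first.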
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;
    Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
    Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
    Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
    assert(WaveSize == 64 || WaveSize == 32);
    if (WaveSize == 64) {
      if (ST.hasScalarCompareEq64()) {
            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    MI.eraseFromParent();
  case AMDGPU::SI_INIT_M0: {
            TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
    MI.eraseFromParent();
  case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
            TII->get(AMDGPU::S_CMP_EQ_U32))
  case AMDGPU::GET_GROUPSTATICSIZE: {
        .add(MI.getOperand(0))
    MI.eraseFromParent();
  case AMDGPU::GET_SHADERCYCLESHILO: {
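    // The 64-bit shader-cycles value is assembled from hardware registers read
    // in the order HI, LO, HI (RegHi1, RegLo1, RegHi2 below); reading HI twice
    // makes it possible to detect a low-half wrap between the two reads.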
    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .add(MI.getOperand(0))
    MI.eraseFromParent();
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
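    // There is no real 64-bit V_CNDMASK, so the pseudo is split: the condition
    // is copied into a wave-mask register and the low and high 32-bit halves
    // of the two sources are selected independently into DstLo and DstHi,
    // which are then recombined into the 64-bit result.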
    Register SrcCond = MI.getOperand(3).getReg();
    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getWaveMaskRegClass();
    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
                                 : &AMDGPU::VReg_64RegClass;
                                 : &AMDGPU::VReg_64RegClass;
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
    MI.eraseFromParent();
  case AMDGPU::SI_BR_UNDEF: {
        .add(MI.getOperand(0));
    MI.eraseFromParent();
  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {
  case AMDGPU::SI_CALL_ISEL: {
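    // SI_CALL_ISEL is rewritten into SI_CALL, with the return-address register
    // reported by getReturnAddressReg() as the explicit destination.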
    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
    MI.eraseFromParent();
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {
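    // If the VOP2 (_e32) form has no MC encoding on this subtarget
    // (pseudoToMCOpcode returns -1), the instruction is re-emitted and, when it
    // ends up as VOP3, given an explicit clamp operand before operand
    // legalization.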
    unsigned Opc = MI.getOpcode();
    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      NeedClampOperand = true;
    if (TII->isVOP3(*I)) {
    I.add(MI.getOperand(1)).add(MI.getOperand(2));
    if (NeedClampOperand)
    TII->legalizeOperands(*I);
    MI.eraseFromParent();
  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBBREV_U32_e32:
    TII->legalizeOperands(MI);
  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_BARRIER:
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
  case AMDGPU::S_SETREG_B32: {
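    // s_setreg writes that only touch the FP round and/or denorm fields of the
    // MODE register can be folded into the dedicated S_ROUND_MODE /
    // S_DENORM_MODE instructions when the written value is a known immediate;
    // the remaining cases are retargeted to the S_SETREG_B32_mode pseudo below.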
    const unsigned SetMask = WidthMask << Offset;
    unsigned SetDenormOp = 0;
    unsigned SetRoundOp = 0;
      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;
      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;
    if (SetRoundOp || SetDenormOp) {
      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
        unsigned ImmVal = Def->getOperand(1).getImm();
        MI.eraseFromParent();
    MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64:
    MI.setDesc(TII->get(AMDGPU::COPY));
  case AMDGPU::ENDPGM_TRAP: {
    MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
    MI.eraseFromParent();
  case AMDGPU::SIMULATED_TRAP: {
    assert(Subtarget->hasPrivEnabledTrap2NopBug());
    TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
    MI.eraseFromParent();
  case AMDGPU::SI_TCRETURN_GFX_WholeWave:
  case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
    assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
    Register OriginalExec = Setup->getOperand(0).getReg();
    MI.getOperand(0).setReg(OriginalExec);
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
  return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
  if (!Subtarget->hasMadMacF32Insts())
    return Subtarget->hasFastFMAF32();
  return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
  return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
  switch (Ty.getScalarSizeInBits()) {
  if (Ty.getScalarSizeInBits() == 16)
  if (Ty.getScalarSizeInBits() == 32)
    return Subtarget->hasMadMacF32Insts() &&
  EVT VT = N->getValueType(0);
  return Subtarget->hasMadMacF32Insts() &&
  if (VT == MVT::f16) {
    return Subtarget->hasMadF16() &&
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
  [[maybe_unused]] EVT VT = Op.getValueType();
  assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
          VT == MVT::v16i32) &&
         "Unexpected ValueType.");
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
         VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
         VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
         VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
         VT == MVT::v32bf16);
      DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
      DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
         VT == MVT::v32bf16);
      : std::pair(Op0, Op0);
      DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
      DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
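// The dispatch below appears to be SITargetLowering::LowerOperation: each
// custom-marked opcode is forwarded to its dedicated lowering helper
// (LowerBRCOND, LowerSTORE, the intrinsic lowerings, and so on), so the switch
// itself is mostly glue rather than lowering logic.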
  switch (Op.getOpcode()) {
    return LowerBRCOND(Op, DAG);
    return LowerRETURNADDR(Op, DAG);
    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    EVT VT = Op.getValueType();
      return lowerFSQRTF32(Op, DAG);
      return lowerFSQRTF64(Op, DAG);
    return LowerTrig(Op, DAG);
    return LowerSELECT(Op, DAG);
    return LowerFDIV(Op, DAG);
    return LowerFFREXP(Op, DAG);
  case ISD::ATOMIC_CMP_SWAP:
    return LowerATOMIC_CMP_SWAP(Op, DAG);
    return LowerSTORE(Op, DAG);
    return LowerGlobalAddress(MFI, Op, DAG);
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
    return LowerINTRINSIC_VOID(Op, DAG);
  case ISD::ADDRSPACECAST:
    return lowerADDRSPACECAST(Op, DAG);
    return lowerINSERT_SUBVECTOR(Op, DAG);
    return lowerINSERT_VECTOR_ELT(Op, DAG);
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    return lowerVECTOR_SHUFFLE(Op, DAG);
    return lowerSCALAR_TO_VECTOR(Op, DAG);
    return lowerBUILD_VECTOR(Op, DAG);
    return lowerFP_ROUND(Op, DAG);
    return lowerTRAP(Op, DAG);
  case ISD::DEBUGTRAP:
    return lowerDEBUGTRAP(Op, DAG);
    return lowerFMINNUM_FMAXNUM(Op, DAG);
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
    return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
    return lowerFMINIMUM_FMAXIMUM(Op, DAG);
    return lowerFLDEXP(Op, DAG);
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
    return lowerFCOPYSIGN(Op, DAG);
    return lowerMUL(Op, DAG);
    return lowerXMULO(Op, DAG);
    return lowerXMUL_LOHI(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
  case ISD::STACKSAVE:
  case ISD::SET_ROUNDING:
  case ISD::FP_EXTEND:
  case ISD::GET_FPENV:
  case ISD::SET_FPENV:
6887 EVT FittingLoadVT = LoadVT;
6912 return DAG.
getNode(ISD::BITCAST,
DL, FittingLoadVT, Result);
6916 return DAG.
getNode(ISD::BITCAST,
DL, FittingLoadVT, Result);
6919SDValue SITargetLowering::adjustLoadValueType(
unsigned Opcode,
MemSDNode *M,
6922 bool IsIntrinsic)
const {
6925 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6926 EVT LoadVT =
M->getValueType(0);
6928 EVT EquivLoadVT = LoadVT;
6942 SDVTList VTList = DAG.
getVTList(EquivLoadVT, MVT::Other);
6946 M->getMemoryVT(),
M->getMemOperand());
6957 EVT LoadVT =
M->getValueType(0);
6963 assert(
M->getNumValues() == 2 ||
M->getNumValues() == 3);
6964 bool IsTFE =
M->getNumValues() == 3;
6977 return handleByteShortBufferLoads(DAG, LoadVT,
DL,
Ops,
M->getMemOperand(),
6981 return getMemIntrinsicNode(
Opc,
DL,
M->getVTList(),
Ops, IntVT,
6982 M->getMemOperand(), DAG);
6986 SDVTList VTList = DAG.
getVTList(CastVT, MVT::Other);
6988 M->getMemOperand(), DAG);
6996 EVT VT =
N->getValueType(0);
6997 unsigned CondCode =
N->getConstantOperandVal(3);
7008 EVT CmpVT =
LHS.getValueType();
7009 if (CmpVT == MVT::i16 && !TLI.
isTypeLegal(MVT::i16)) {
7010 unsigned PromoteOp =
7030 EVT VT =
N->getValueType(0);
7032 unsigned CondCode =
N->getConstantOperandVal(3);
7041 if (CmpVT == MVT::f16 && !TLI.
isTypeLegal(CmpVT)) {
7042 Src0 = DAG.
getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7043 Src1 = DAG.
getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7059 EVT VT =
N->getValueType(0);
7066 Src.getOperand(1), Src.getOperand(2));
7077 Exec = AMDGPU::EXEC_LO;
7079 Exec = AMDGPU::EXEC;
7096 EVT VT =
N->getValueType(0);
7098 unsigned IID =
N->getConstantOperandVal(0);
7099 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7100 IID == Intrinsic::amdgcn_permlanex16;
7101 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7102 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
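  // For lane ops on values wider than 32 bits, the value is split into
  // SplitSize-bit pieces (64-bit pieces when update_dpp can use DPALU DPP),
  // createLaneOp is applied to each piece, and the pieces are put back
  // together; values of exactly SplitSize bits are handled directly, and
  // narrower values go through an i32 round-trip (note the Trunc/bitcast
  // below).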
7106 unsigned SplitSize = 32;
7107 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7108 ST->hasDPALU_DPP() &&
7116 case Intrinsic::amdgcn_permlane16:
7117 case Intrinsic::amdgcn_permlanex16:
7118 case Intrinsic::amdgcn_update_dpp:
7123 case Intrinsic::amdgcn_writelane:
7126 case Intrinsic::amdgcn_readlane:
7127 case Intrinsic::amdgcn_set_inactive:
7128 case Intrinsic::amdgcn_set_inactive_chain_arg:
7129 case Intrinsic::amdgcn_mov_dpp8:
7132 case Intrinsic::amdgcn_readfirstlane:
7133 case Intrinsic::amdgcn_permlane64:
  if (SDNode *GL = N->getGluedNode()) {
    assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
    GL = GL->getOperand(0).getNode();
    Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7155 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7156 IID == Intrinsic::amdgcn_mov_dpp8 ||
7157 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7158 Src1 =
N->getOperand(2);
7159 if (IID == Intrinsic::amdgcn_writelane ||
7160 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7161 Src2 =
N->getOperand(3);
7164 if (ValSize == SplitSize) {
7174 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7179 if (IID == Intrinsic::amdgcn_writelane) {
7184 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7186 return IsFloat ? DAG.
getBitcast(VT, Trunc) : Trunc;
7189 if (ValSize % SplitSize != 0)
7193 EVT VT =
N->getValueType(0);
7197 unsigned NumOperands =
N->getNumOperands();
7199 SDNode *GL =
N->getGluedNode();
7204 for (
unsigned i = 0; i != NE; ++i) {
7205 for (
unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7207 SDValue Operand =
N->getOperand(j);
7222 DAG.
getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7237 if (SplitSize == 32) {
7239 return unrollLaneOp(LaneOp.
getNode());
7245 unsigned SubVecNumElt =
7249 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7250 for (
unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7254 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7259 if (IID == Intrinsic::amdgcn_writelane)
7264 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7265 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7266 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7267 EltIdx += SubVecNumElt;
7281 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7284 if (IID == Intrinsic::amdgcn_writelane)
7287 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7295 switch (
N->getOpcode()) {
7307 unsigned IID =
N->getConstantOperandVal(0);
7309 case Intrinsic::amdgcn_make_buffer_rsrc:
7310 Results.push_back(lowerPointerAsRsrcIntrin(
N, DAG));
7312 case Intrinsic::amdgcn_cvt_pkrtz: {
7318 Results.push_back(DAG.
getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7321 case Intrinsic::amdgcn_cvt_pknorm_i16:
7322 case Intrinsic::amdgcn_cvt_pknorm_u16:
7323 case Intrinsic::amdgcn_cvt_pk_i16:
7324 case Intrinsic::amdgcn_cvt_pk_u16: {
7330 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7332 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7334 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7339 EVT VT =
N->getValueType(0);
7344 Results.push_back(DAG.
getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7348 case Intrinsic::amdgcn_s_buffer_load: {
7354 if (!Subtarget->hasScalarSubwordLoads())
7360 EVT VT =
Op.getValueType();
7361 assert(VT == MVT::i8 &&
"Expected 8-bit s_buffer_load intrinsics.\n");
7373 if (!
Offset->isDivergent()) {
7392 LoadVal = handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
7397 case Intrinsic::amdgcn_dead: {
7398 for (
unsigned I = 0, E =
N->getNumValues();
I < E; ++
I)
7409 for (
unsigned I = 0;
I < Res.getNumOperands();
I++) {
7410 Results.push_back(Res.getOperand(
I));
7414 Results.push_back(Res.getValue(1));
7423 EVT VT =
N->getValueType(0);
7428 EVT SelectVT = NewVT;
7429 if (NewVT.
bitsLT(MVT::i32)) {
7432 SelectVT = MVT::i32;
7438 if (NewVT != SelectVT)
7444 if (
N->getValueType(0) != MVT::v2f16)
7448 SDValue BC = DAG.
getNode(ISD::BITCAST, SL, MVT::i32,
N->getOperand(0));
7456 if (
N->getValueType(0) != MVT::v2f16)
7460 SDValue BC = DAG.
getNode(ISD::BITCAST, SL, MVT::i32,
N->getOperand(0));
7468 if (
N->getValueType(0) != MVT::f16)
7483 if (U.get() !=
Value)
7486 if (U.getUser()->getOpcode() == Opcode)
7492unsigned SITargetLowering::isCFIntrinsic(
const SDNode *Intr)
const {
7495 case Intrinsic::amdgcn_if:
7497 case Intrinsic::amdgcn_else:
7499 case Intrinsic::amdgcn_loop:
7501 case Intrinsic::amdgcn_end_cf:
7521 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7548 SDNode *Intr = BRCOND.getOperand(1).getNode();
7561 assert(BR &&
"brcond missing unconditional branch user");
7565 unsigned CFNode = isCFIntrinsic(Intr);
7585 Ops.push_back(Target);
7608 for (
unsigned i = 1, e = Intr->
getNumValues() - 1; i != e; ++i) {
7627 MVT VT =
Op.getSimpleValueType();
7630 if (
Op.getConstantOperandVal(0) != 0)
7634 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7636 if (
Info->isEntryFunction())
7653 return Op.getValueType().bitsLE(VT)
7661 EVT DstVT =
Op.getValueType();
7668 unsigned Opc =
Op.getOpcode();
7680 EVT SrcVT = Src.getValueType();
7681 EVT DstVT =
Op.getValueType();
  assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
  return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7694 if (DstVT == MVT::f16) {
7699 if (!Subtarget->has16BitInsts()) {
7702 return DAG.
getNode(ISD::BITCAST,
DL, MVT::f16, Trunc);
7704 if (
Op->getFlags().hasApproximateFuncs()) {
7711 return DAG.
getNode(ISD::BITCAST,
DL, MVT::f16, Trunc);
7715 "custom lower FP_ROUND for f16 or bf16");
7716 assert(Subtarget->hasBF16ConversionInsts() &&
"f32 -> bf16 is legal");
7729 EVT VT =
Op.getValueType();
7731 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7732 bool IsIEEEMode =
Info->getMode().IEEE;
7741 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7748SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(
SDValue Op,
7750 EVT VT =
Op.getValueType();
7752 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7753 bool IsIEEEMode =
Info->getMode().IEEE;
7758 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7766 EVT VT =
Op.getValueType();
7770 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7771 !Subtarget->hasMinimum3Maximum3F16() &&
7772 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7773 "should not need to widen f16 minimum/maximum to v2f16");
7787 DAG.
getNode(
Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7795 EVT VT =
Op.getValueType();
7799 EVT ExpVT =
Exp.getValueType();
7800 if (ExpVT == MVT::i16)
7821 {
Op.getOperand(0),
Op.getOperand(1), TruncExp});
7824 return DAG.
getNode(ISD::FLDEXP,
DL, VT,
Op.getOperand(0), TruncExp);
7828 switch (
Op->getOpcode()) {
7858 DAGCombinerInfo &DCI)
const {
7859 const unsigned Opc =
Op.getOpcode();
7867 :
Op->getOperand(0).getValueType();
7870 if (DCI.isBeforeLegalizeOps() ||
7874 auto &DAG = DCI.DAG;
7880 LHS =
Op->getOperand(1);
7881 RHS =
Op->getOperand(2);
7883 LHS =
Op->getOperand(0);
7884 RHS =
Op->getOperand(1);
7923 if (MagVT == SignVT)
7930 SDValue SignAsInt32 = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7933 SDValue SignAsHalf16 = DAG.
getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7940 EVT VT =
Op.getValueType();
  assert(VT == MVT::i64 && "The following code is special for s_mul_u64");
7973 if (
Op->isDivergent())
7986 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7988 DAG.
getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7991 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7993 DAG.
getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7999 EVT VT =
Op.getValueType();
8006 const APInt &
C = RHSC->getAPIntValue();
8008 if (
C.isPowerOf2()) {
8010 bool UseArithShift =
isSigned && !
C.isMinSignedValue();
8037 if (
Op->isDivergent()) {
8041 if (Subtarget->hasSMulHi()) {
8052 if (!Subtarget->isTrapHandlerEnabled() ||
8054 return lowerTrapEndpgm(
Op, DAG);
8056 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(
Op, DAG)
8057 : lowerTrapHsaQueuePtr(
Op, DAG);
8067SITargetLowering::loadImplicitKernelArgument(
SelectionDAG &DAG,
MVT VT,
8069 ImplicitParameter Param)
const {
8089 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
8092 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8095 if (UserSGPR == AMDGPU::NoRegister) {
8121 if (Subtarget->hasPrivEnabledTrap2NopBug())
8134 if (!Subtarget->isTrapHandlerEnabled() ||
8138 "debugtrap handler not supported",
8149SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
8151 if (Subtarget->hasApertureRegs()) {
8153 ? AMDGPU::SRC_SHARED_BASE
8154 : AMDGPU::SRC_PRIVATE_BASE;
8155 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8156 !Subtarget->hasGloballyAddressableScratch()) &&
8157 "Cannot use src_private_base with globally addressable scratch!");
8180 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
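  // When the subtarget has aperture registers, the shared/private aperture is
  // the high half of SRC_SHARED_BASE/SRC_PRIVATE_BASE, extracted with the
  // 32-bit shift built above; otherwise it is loaded through
  // loadImplicitKernelArgument (or from preloaded user SGPRs) below.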
8189 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
8193 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8195 if (UserSGPR == AMDGPU::NoRegister) {
8229 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8240 const AMDGPUTargetMachine &TM =
8243 unsigned DestAS, SrcAS;
8245 bool IsNonNull =
false;
8247 SrcAS = ASC->getSrcAddressSpace();
8248 Src = ASC->getOperand(0);
8249 DestAS = ASC->getDestAddressSpace();
8252 Op.getConstantOperandVal(0) ==
8253 Intrinsic::amdgcn_addrspacecast_nonnull);
8254 Src =
Op->getOperand(1);
8255 SrcAS =
Op->getConstantOperandVal(2);
8256 DestAS =
Op->getConstantOperandVal(3);
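  // This lowering handles both real AddrSpaceCastSDNodes and the
  // amdgcn.addrspacecast.nonnull intrinsic (which skips the null checks).
  // Flat -> local/private casts narrow the pointer to 32 bits; local/private
  // -> flat casts rebuild a 64-bit pointer from the 32-bit offset and the
  // segment aperture, with a compare-and-select against the segment's null
  // value unless the source is known non-null.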
8269 Subtarget->hasGloballyAddressableScratch()) {
8274 AMDGPU::S_MOV_B32, SL, MVT::i32,
8275 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8283 unsigned NullVal = TM.getNullPointerValue(DestAS);
8298 Subtarget->hasGloballyAddressableScratch()) {
8307 if (Subtarget->isWave64())
8313 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8316 CvtPtr = DAG.
getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8321 AMDGPU::S_MOV_B64, SL, MVT::i64,
8322 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8324 CvtPtr = DAG.
getNode(
ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8326 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8328 CvtPtr = DAG.
getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8334 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8346 Op.getValueType() == MVT::i64) {
8347 const SIMachineFunctionInfo *
Info =
8351 return DAG.
getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8355 Src.getValueType() == MVT::i64)
8375 EVT InsVT =
Ins.getValueType();
8383 assert(InsNumElts % 2 == 0 &&
"expect legal vector types");
8388 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8390 MVT::i32, InsNumElts / 2);
8392 Vec = DAG.
getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8393 Ins = DAG.
getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8395 for (
unsigned I = 0;
I != InsNumElts / 2; ++
I) {
8397 if (InsNumElts == 2) {
8407 return DAG.
getNode(ISD::BITCAST, SL, VecVT, Vec);
8410 for (
unsigned I = 0;
I != InsNumElts; ++
I) {
8433 if (NumElts == 4 && EltSize == 16 && KIdx) {
8441 SDValue LoVec = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8442 SDValue HiVec = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8444 unsigned Idx = KIdx->getZExtValue();
8445 bool InsertLo = Idx < 2;
8448 DAG.
getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8449 DAG.
getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8451 InsHalf = DAG.
getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8455 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8468 assert(VecSize <= 64 &&
"Expected target vector size to be <= 64 bits");
8496 return DAG.
getNode(ISD::BITCAST, SL, VecVT, BFI);
8503 EVT ResultVT =
Op.getValueType();
8516 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
8519 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8523 if (VecSize == 128) {
8531 }
else if (VecSize == 256) {
8534 for (
unsigned P = 0;
P < 4; ++
P) {
8540 Parts[0], Parts[1]));
8542 Parts[2], Parts[3]));
8548 for (
unsigned P = 0;
P < 8; ++
P) {
8555 Parts[0], Parts[1], Parts[2], Parts[3]));
8558 Parts[4], Parts[5], Parts[6], Parts[7]));
8578 Src = DAG.
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8593 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8595 return DAG.
getNode(ISD::BITCAST, SL, ResultVT, Result);
8603 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8608 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8609 !(Mask[Elt + 1] & 1);
8615 EVT ResultVT =
Op.getValueType();
8618 const int NewSrcNumElts = 2;
8620 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
8636 const bool ShouldUseConsecutiveExtract = EltVT.
getSizeInBits() == 16;
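  // The shuffle result is assembled two elements at a time: for 16-bit
  // elements, a result pair that maps to two consecutive, pair-aligned source
  // elements can be taken as one 32-bit piece (ShouldUseConsecutiveExtract);
  // otherwise each result pair is built from a small 2-element sub-shuffle of
  // aligned source chunks or from individual element extracts, as below.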
8658 if (ShouldUseConsecutiveExtract &&
8661 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8662 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8674 if (Idx0 >= SrcNumElts) {
8679 if (Idx1 >= SrcNumElts) {
8684 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8685 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8693 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8694 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8699 if (SubVec0 != SubVec1) {
8700 NewMaskIdx1 += NewSrcNumElts;
8707 {NewMaskIdx0, NewMaskIdx1});
8712 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8713 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8714 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8715 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8734 EVT ResultVT =
Op.getValueType();
8750 EVT VT =
Op.getValueType();
8752 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8753 assert(!Subtarget->hasVOP3PInsts() &&
"this should be legal");
8762 return DAG.
getNode(ISD::BITCAST, SL, VT, ExtLo);
8771 return DAG.
getNode(ISD::BITCAST, SL, VT, ShlHi);
8778 return DAG.
getNode(ISD::BITCAST, SL, VT,
Or);
8787 for (
unsigned P = 0;
P < NumParts; ++
P) {
8789 PartVT, SL, {
Op.getOperand(
P * 2),
Op.getOperand(
P * 2 + 1)});
8795 return DAG.
getNode(ISD::BITCAST, SL, VT, Blend);
8808 if (!Subtarget->isAmdHsaOS())
8868 EVT PtrVT =
Op.getValueType();
8870 const GlobalValue *GV = GSD->
getGlobal();
8884 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
8902 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8903 if (Subtarget->has64BitLiterals()) {
8934 MachinePointerInfo PtrInfo =
8962 SDValue Param = lowerKernargMemParameter(
8973 "non-hsa intrinsic with hsa target",
DL.getDebugLoc()));
8981 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
8989 unsigned NumElts = Elts.
size();
8991 if (NumElts <= 12) {
9000 for (
unsigned i = 0; i < Elts.
size(); ++i) {
9006 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
9016 EVT SrcVT = Src.getValueType();
9037 bool Unpacked,
bool IsD16,
int DMaskPop,
9038 int NumVDataDwords,
bool IsAtomicPacked16Bit,
9042 EVT ReqRetVT = ResultTypes[0];
9044 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9045 ? (ReqRetNumElts + 1) / 2
9048 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9059 if (DMaskPop > 0 &&
Data.getValueType() != MaskPopVT) {
9070 if (DataDwordVT.
isVector() && !IsAtomicPacked16Bit)
9072 NumDataDwords - MaskPopDwords);
9077 EVT LegalReqRetVT = ReqRetVT;
9079 if (!
Data.getValueType().isInteger())
9081 Data.getValueType().changeTypeToInteger(),
Data);
9102 if (Result->getNumValues() == 1)
9109 SDValue *LWE,
bool &IsTexFail) {
9129 unsigned DimIdx,
unsigned EndIdx,
9130 unsigned NumGradients) {
9132 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
9140 if (((
I + 1) >= EndIdx) ||
9141 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
9142 I == DimIdx + NumGradients - 1))) {
9161 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9175 int NumVDataDwords = 0;
9176 bool AdjustRetType =
false;
9177 bool IsAtomicPacked16Bit =
false;
9180 const unsigned ArgOffset = WithChain ? 2 : 1;
9183 unsigned DMaskLanes = 0;
9185 if (BaseOpcode->Atomic) {
9186 VData =
Op.getOperand(2);
9188 IsAtomicPacked16Bit =
9189 (Intr->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9190 Intr->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
9193 if (BaseOpcode->AtomicX2) {
9200 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9201 DMask = Is64Bit ? 0xf : 0x3;
9202 NumVDataDwords = Is64Bit ? 4 : 2;
9204 DMask = Is64Bit ? 0x3 : 0x1;
9205 NumVDataDwords = Is64Bit ? 2 : 1;
9208 DMask =
Op->getConstantOperandVal(ArgOffset + Intr->
DMaskIndex);
9211 if (BaseOpcode->Store) {
9212 VData =
Op.getOperand(2);
9216 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9220 VData = handleD16VData(VData, DAG,
true);
9223 NumVDataDwords = (VData.
getValueType().getSizeInBits() + 31) / 32;
9224 }
else if (!BaseOpcode->NoReturn) {
9229 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9237 (!LoadVT.
isVector() && DMaskLanes > 1))
9243 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9244 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9245 NumVDataDwords = (DMaskLanes + 1) / 2;
9247 NumVDataDwords = DMaskLanes;
9249 AdjustRetType =
true;
9253 unsigned VAddrEnd = ArgOffset + Intr->
VAddrEnd;
9260 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9261 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9263 VAddrVT =
Op.getOperand(ArgOffset + Intr->
CoordStart).getSimpleValueType();
9265 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9266 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9270 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
9276 {
Op.getOperand(ArgOffset +
I), DAG.
getPOISON(MVT::f16)});
9280 "Bias needs to be converted to 16 bit in A16 mode");
9285 if (BaseOpcode->Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
9289 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
9290 "require 16 bit args for both gradients and addresses");
9295 if (!
ST->hasA16()) {
9296 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
9297 "support 16 bit addresses\n");
9307 if (BaseOpcode->Gradients && IsG16 &&
ST->hasG16()) {
9309 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9311 IntrOpcode = G16MappingInfo->
G16;
9334 for (
unsigned I = ArgOffset + Intr->
CoordStart;
I < VAddrEnd;
I++)
9352 const unsigned NSAMaxSize =
ST->getNSAMaxSize(BaseOpcode->Sampler);
9353 const bool HasPartialNSAEncoding =
ST->hasPartialNSAEncoding();
9354 const bool UseNSA =
ST->hasNSAEncoding() &&
9355 VAddrs.
size() >=
ST->getNSAThreshold(MF) &&
9356 (VAddrs.
size() <= NSAMaxSize || HasPartialNSAEncoding);
9357 const bool UsePartialNSA =
9358 UseNSA && HasPartialNSAEncoding && VAddrs.
size() > NSAMaxSize;
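    // UsePartialNSA: the NSA encoding only has a limited number of address
    // slots; when the address count exceeds NSAMaxSize but partial NSA is
    // supported, the trailing addresses (from slot NSAMaxSize - 1 onwards) are
    // packed into a single contiguous register sequence, as done just below.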
9361 if (UsePartialNSA) {
9363 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9364 }
else if (!UseNSA) {
9371 if (!BaseOpcode->Sampler) {
9374 uint64_t UnormConst =
9375 Op.getConstantOperandVal(ArgOffset + Intr->
UnormIndex);
9377 Unorm = UnormConst ? True : False;
9383 bool IsTexFail =
false;
9384 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9395 NumVDataDwords += 1;
9396 AdjustRetType =
true;
9401 if (AdjustRetType) {
9404 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9413 MVT::i32, NumVDataDwords)
9416 ResultTypes[0] = NewVT;
9417 if (ResultTypes.size() == 3) {
9421 ResultTypes.erase(&ResultTypes[1]);
9426 if (BaseOpcode->Atomic)
9433 if (BaseOpcode->Store || BaseOpcode->Atomic)
9434 Ops.push_back(VData);
9435 if (UsePartialNSA) {
9437 Ops.push_back(VAddr);
9441 Ops.push_back(VAddr);
9444 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9446 Ops.push_back(Rsrc);
9447 if (BaseOpcode->Sampler) {
9451 Ops.push_back(Samp);
9456 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9457 Ops.push_back(Unorm);
9459 Ops.push_back(IsA16 &&
9460 ST->hasFeature(AMDGPU::FeatureR128A16)
9464 Ops.push_back(IsA16 ? True : False);
9466 if (!Subtarget->hasGFX90AInsts())
9471 "TFE is not supported on this GPU",
DL.getDebugLoc()));
9474 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9477 Ops.push_back(DimInfo->
DA ? True : False);
9478 if (BaseOpcode->HasD16)
9479 Ops.push_back(IsD16 ? True : False);
9481 Ops.push_back(
Op.getOperand(0));
9483 int NumVAddrDwords =
9489 NumVDataDwords, NumVAddrDwords);
9490 }
else if (IsGFX11Plus) {
9492 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9493 : AMDGPU::MIMGEncGfx11Default,
9494 NumVDataDwords, NumVAddrDwords);
9495 }
else if (IsGFX10Plus) {
9497 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9498 : AMDGPU::MIMGEncGfx10Default,
9499 NumVDataDwords, NumVAddrDwords);
9501 if (Subtarget->hasGFX90AInsts()) {
9503 NumVDataDwords, NumVAddrDwords);
9507 "requested image instruction is not supported on this GPU",
9512 for (EVT VT : OrigResultTypes) {
9513 if (VT == MVT::Other)
9514 RetValues[Idx++] =
Op.getOperand(0);
9525 NumVDataDwords, NumVAddrDwords);
9528 NumVDataDwords, NumVAddrDwords);
9535 MachineMemOperand *MemRef = MemOp->getMemOperand();
9539 if (BaseOpcode->AtomicX2) {
9544 if (BaseOpcode->NoReturn)
9547 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9548 NumVDataDwords, IsAtomicPacked16Bit,
DL);
9561 MachinePointerInfo(),
9566 if (!
Offset->isDivergent()) {
9573 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9582 !Subtarget->hasScalarDwordx3Loads()) {
9609 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9611 return handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
9615 unsigned NumLoads = 1;
9621 if (NumElts == 8 || NumElts == 16) {
9622 NumLoads = NumElts / 4;
9626 SDVTList VTList = DAG.
getVTList({LoadVT, MVT::Other});
9631 NumLoads > 1 ?
Align(16 * NumLoads) :
Align(4));
9633 uint64_t InstOffset =
Ops[5]->getAsZExtVal();
9634 for (
unsigned i = 0; i < NumLoads; ++i) {
9640 if (NumElts == 8 || NumElts == 16)
9648 if (!Subtarget->hasArchitectedSGPRs())
9660 unsigned Width)
const {
9662 using namespace AMDGPU::Hwreg;
9664 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9703 auto *MFI = MF.
getInfo<SIMachineFunctionInfo>();
9705 EVT VT =
Op.getValueType();
9707 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
9711 switch (IntrinsicID) {
9712 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9715 return getPreloadedValue(DAG, *MFI, VT,
9718 case Intrinsic::amdgcn_dispatch_ptr:
9719 case Intrinsic::amdgcn_queue_ptr: {
9720 if (!Subtarget->isAmdHsaOrMesa(MF.
getFunction())) {
9722 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
9727 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9730 return getPreloadedValue(DAG, *MFI, VT, RegID);
9732 case Intrinsic::amdgcn_implicitarg_ptr: {
9734 return getImplicitArgPtr(DAG,
DL);
9735 return getPreloadedValue(DAG, *MFI, VT,
9738 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9744 return getPreloadedValue(DAG, *MFI, VT,
9747 case Intrinsic::amdgcn_dispatch_id: {
9750 case Intrinsic::amdgcn_rcp:
9752 case Intrinsic::amdgcn_rsq:
9754 case Intrinsic::amdgcn_rsq_legacy:
9758 case Intrinsic::amdgcn_rcp_legacy:
9762 case Intrinsic::amdgcn_rsq_clamp: {
9773 return DAG.
getNode(ISD::FMAXNUM,
DL, VT, Tmp,
9776 case Intrinsic::r600_read_ngroups_x:
9777 if (Subtarget->isAmdHsaOS())
9780 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9783 case Intrinsic::r600_read_ngroups_y:
9784 if (Subtarget->isAmdHsaOS())
9787 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9790 case Intrinsic::r600_read_ngroups_z:
9791 if (Subtarget->isAmdHsaOS())
9794 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9797 case Intrinsic::r600_read_local_size_x:
9798 if (Subtarget->isAmdHsaOS())
9801 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9803 case Intrinsic::r600_read_local_size_y:
9804 if (Subtarget->isAmdHsaOS())
9807 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9809 case Intrinsic::r600_read_local_size_z:
9810 if (Subtarget->isAmdHsaOS())
9813 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9815 case Intrinsic::amdgcn_workgroup_id_x:
9816 return lowerWorkGroupId(DAG, *MFI, VT,
9820 case Intrinsic::amdgcn_workgroup_id_y:
9821 return lowerWorkGroupId(DAG, *MFI, VT,
9825 case Intrinsic::amdgcn_workgroup_id_z:
9826 return lowerWorkGroupId(DAG, *MFI, VT,
9830 case Intrinsic::amdgcn_cluster_id_x:
9831 return Subtarget->hasClusters()
9832 ? getPreloadedValue(DAG, *MFI, VT,
9834 : DAG.getPOISON(VT);
9835 case Intrinsic::amdgcn_cluster_id_y:
9836 return Subtarget->hasClusters()
9837 ? getPreloadedValue(DAG, *MFI, VT,
9840 case Intrinsic::amdgcn_cluster_id_z:
9841 return Subtarget->hasClusters()
9842 ? getPreloadedValue(DAG, *MFI, VT,
9845 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9846 return Subtarget->hasClusters()
9847 ? getPreloadedValue(
9851 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9852 return Subtarget->hasClusters()
9853 ? getPreloadedValue(
9857 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9858 return Subtarget->hasClusters()
9859 ? getPreloadedValue(
9863 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9864 return Subtarget->hasClusters()
9867 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9868 return Subtarget->hasClusters()
9869 ? getPreloadedValue(
9873 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9874 return Subtarget->hasClusters()
9875 ? getPreloadedValue(
9879 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9880 return Subtarget->hasClusters()
9881 ? getPreloadedValue(
9885 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
9886 return Subtarget->hasClusters()
9887 ? getPreloadedValue(
9891 case Intrinsic::amdgcn_wave_id:
9892 return lowerWaveID(DAG,
Op);
9893 case Intrinsic::amdgcn_lds_kernel_id: {
9895 return getLDSKernelId(DAG,
DL);
9896 return getPreloadedValue(DAG, *MFI, VT,
9899 case Intrinsic::amdgcn_workitem_id_x:
9900 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
9901 case Intrinsic::amdgcn_workitem_id_y:
9902 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
9903 case Intrinsic::amdgcn_workitem_id_z:
9904 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
9905 case Intrinsic::amdgcn_wavefrontsize:
9907 SDLoc(
Op), MVT::i32);
9908 case Intrinsic::amdgcn_s_buffer_load: {
9909 unsigned CPol =
Op.getConstantOperandVal(3);
9916 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
9917 Op.getOperand(3), DAG);
9919 case Intrinsic::amdgcn_fdiv_fast:
9920 return lowerFDIV_FAST(
Op, DAG);
9921 case Intrinsic::amdgcn_sin:
9924 case Intrinsic::amdgcn_cos:
9927 case Intrinsic::amdgcn_mul_u24:
9930 case Intrinsic::amdgcn_mul_i24:
9934 case Intrinsic::amdgcn_log_clamp: {
9940 case Intrinsic::amdgcn_fract:
9943 case Intrinsic::amdgcn_class:
9946 case Intrinsic::amdgcn_div_fmas:
9948 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
9950 case Intrinsic::amdgcn_div_fixup:
9952 Op.getOperand(2),
Op.getOperand(3));
9954 case Intrinsic::amdgcn_div_scale: {
9967 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
9970 Denominator, Numerator);
9972 case Intrinsic::amdgcn_icmp: {
9974 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
9975 Op.getConstantOperandVal(2) == 0 &&
9980 case Intrinsic::amdgcn_fcmp: {
9983 case Intrinsic::amdgcn_ballot:
9985 case Intrinsic::amdgcn_fmed3:
9987 Op.getOperand(2),
Op.getOperand(3));
9988 case Intrinsic::amdgcn_fdot2:
9990 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
9991 case Intrinsic::amdgcn_fmul_legacy:
9994 case Intrinsic::amdgcn_sffbh:
9996 case Intrinsic::amdgcn_sbfe:
9998 Op.getOperand(2),
Op.getOperand(3));
9999 case Intrinsic::amdgcn_ubfe:
10001 Op.getOperand(2),
Op.getOperand(3));
10002 case Intrinsic::amdgcn_cvt_pkrtz:
10003 case Intrinsic::amdgcn_cvt_pknorm_i16:
10004 case Intrinsic::amdgcn_cvt_pknorm_u16:
10005 case Intrinsic::amdgcn_cvt_pk_i16:
10006 case Intrinsic::amdgcn_cvt_pk_u16: {
10008 EVT VT =
Op.getValueType();
10011 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10013 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10015 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10017 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10023 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
10026 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
10027 return DAG.
getNode(ISD::BITCAST,
DL, VT, Node);
10029 case Intrinsic::amdgcn_fmad_ftz:
10031 Op.getOperand(2),
Op.getOperand(3));
10033 case Intrinsic::amdgcn_if_break:
10035 Op->getOperand(1),
Op->getOperand(2)),
10038 case Intrinsic::amdgcn_groupstaticsize: {
10044 const GlobalValue *GV =
10050 case Intrinsic::amdgcn_is_shared:
10051 case Intrinsic::amdgcn_is_private: {
10054 DAG.
getNode(ISD::BITCAST,
DL, MVT::v2i32,
Op.getOperand(1));
10058 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10062 Subtarget->hasGloballyAddressableScratch()) {
10065 AMDGPU::S_MOV_B32,
DL, MVT::i32,
10066 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10075 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10078 case Intrinsic::amdgcn_perm:
10080 Op.getOperand(2),
Op.getOperand(3));
10081 case Intrinsic::amdgcn_reloc_constant: {
10091 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10092 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10093 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10094 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10095 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10096 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10097 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10098 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10099 if (
Op.getOperand(4).getValueType() == MVT::i32)
10105 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10106 Op.getOperand(3), IndexKeyi32);
10108 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10109 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10110 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10111 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10112 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10113 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10114 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10115 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10116 if (
Op.getOperand(4).getValueType() == MVT::i64)
10122 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10123 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10124 Op.getOperand(6)});
10126 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10127 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10128 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10129 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10130 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10131 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10132 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10135 if (
Op.getOperand(6).getValueType() == IndexKeyTy)
10141 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10142 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10143 IndexKey, Op.getOperand(7),
10144 Op.getOperand(8)});
10146 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10147 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10148 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10149 if (
Op.getOperand(6).getValueType() == MVT::i32)
10155 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10156 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10157 IndexKeyi32, Op.getOperand(7)});
10159 case Intrinsic::amdgcn_addrspacecast_nonnull:
10160 return lowerADDRSPACECAST(
Op, DAG);
10161 case Intrinsic::amdgcn_readlane:
10162 case Intrinsic::amdgcn_readfirstlane:
10163 case Intrinsic::amdgcn_writelane:
10164 case Intrinsic::amdgcn_permlane16:
10165 case Intrinsic::amdgcn_permlanex16:
10166 case Intrinsic::amdgcn_permlane64:
10167 case Intrinsic::amdgcn_set_inactive:
10168 case Intrinsic::amdgcn_set_inactive_chain_arg:
10169 case Intrinsic::amdgcn_mov_dpp8:
10170 case Intrinsic::amdgcn_update_dpp:
10172 case Intrinsic::amdgcn_dead: {
10174 for (
const EVT ValTy :
Op.getNode()->values())
10179 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10181 return lowerImage(
Op, ImageDimIntr, DAG,
false);
10192 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10198 unsigned NewOpcode)
const {
10202 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10203 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10221 M->getMemOperand());
10226 unsigned NewOpcode)
const {
10230 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10231 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10249 M->getMemOperand());
10254 unsigned IntrID =
Op.getConstantOperandVal(1);
10258 case Intrinsic::amdgcn_ds_ordered_add:
10259 case Intrinsic::amdgcn_ds_ordered_swap: {
10264 unsigned IndexOperand =
M->getConstantOperandVal(7);
10265 unsigned WaveRelease =
M->getConstantOperandVal(8);
10266 unsigned WaveDone =
M->getConstantOperandVal(9);
10268 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10269 IndexOperand &= ~0x3f;
10270 unsigned CountDw = 0;
10273 CountDw = (IndexOperand >> 24) & 0xf;
10274 IndexOperand &= ~(0xf << 24);
10276 if (CountDw < 1 || CountDw > 4) {
10279 Fn,
"ds_ordered_count: dword count must be between 1 and 4",
10280 DL.getDebugLoc()));
10285 if (IndexOperand) {
10288 Fn,
"ds_ordered_count: bad index operand",
DL.getDebugLoc()));
10291 if (WaveDone && !WaveRelease) {
10295 Fn,
"ds_ordered_count: wave_done requires wave_release",
10296 DL.getDebugLoc()));
10299 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10300 unsigned ShaderType =
10302 unsigned Offset0 = OrderedCountIndex << 2;
10303 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10306 Offset1 |= (CountDw - 1) << 6;
10309 Offset1 |= ShaderType << 2;
10311 unsigned Offset = Offset0 | (Offset1 << 8);
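    // The ds_ordered_count immediate packs its parameters into two bytes:
    // Offset0 holds the ordered-count index (<< 2) and Offset1 holds
    // wave_release, wave_done, the instruction kind (add vs. swap), the dword
    // count and the shader type, combined above as Offset0 | (Offset1 << 8).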
10318 M->getVTList(),
Ops,
M->getMemoryVT(),
10319 M->getMemOperand());
10321 case Intrinsic::amdgcn_raw_buffer_load:
10322 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10323 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10324 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10325 case Intrinsic::amdgcn_raw_buffer_load_format:
10326 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10327 const bool IsFormat =
10328 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10329 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10331 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10332 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10346 return lowerIntrinsicLoad(M, IsFormat, DAG,
Ops);
10348 case Intrinsic::amdgcn_struct_buffer_load:
10349 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10350 case Intrinsic::amdgcn_struct_buffer_load_format:
10351 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10352 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10353 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10354 const bool IsFormat =
10355 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10356 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10358 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10359 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10374 case Intrinsic::amdgcn_raw_tbuffer_load:
10375 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10377 EVT LoadVT =
Op.getValueType();
10378 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10379 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10398 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10401 case Intrinsic::amdgcn_struct_tbuffer_load:
10402 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10404 EVT LoadVT =
Op.getValueType();
10405 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10406 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10425 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10428 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10429 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10431 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10432 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10433 return lowerStructBufferAtomicIntrin(
Op, DAG,
10435 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10436 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10438 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10439 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10440 return lowerStructBufferAtomicIntrin(
Op, DAG,
10442 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10443 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10445 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10446 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10447 return lowerStructBufferAtomicIntrin(
Op, DAG,
10449 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10450 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10452 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10453 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10455 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10456 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10458 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10459 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10461 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10462 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10464 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10465 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10467 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10468 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10470 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10471 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10473 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10474 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10476 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10477 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10479 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10480 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10482 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10483 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10485 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10486 return lowerRawBufferAtomicIntrin(
Op, DAG,
10488 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10489 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10490 return lowerStructBufferAtomicIntrin(
Op, DAG,
10492 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10493 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10495 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10496 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10498 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10499 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10500 return lowerStructBufferAtomicIntrin(
Op, DAG,
10502 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10503 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10504 return lowerStructBufferAtomicIntrin(
Op, DAG,
10506 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10507 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10508 return lowerStructBufferAtomicIntrin(
Op, DAG,
10510 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10511 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10512 return lowerStructBufferAtomicIntrin(
Op, DAG,
10514 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10515 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10517 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10518 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10520 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10521 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10523 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10524 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10526 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10527 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10529 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10530 return lowerStructBufferAtomicIntrin(
Op, DAG,
10533 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10534 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10535 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
10536 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10550 EVT VT =
Op.getValueType();
10554 Op->getVTList(),
Ops, VT,
10555 M->getMemOperand());
10557 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10558 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10559 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
10560 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(6), DAG);
10574 EVT VT =
Op.getValueType();
10578 Op->getVTList(),
Ops, VT,
10579 M->getMemOperand());
10581 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10582 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10584 SDValue NodePtr =
M->getOperand(2);
10585 SDValue RayExtent =
M->getOperand(3);
10586 SDValue InstanceMask =
M->getOperand(4);
10587 SDValue RayOrigin =
M->getOperand(5);
10588 SDValue RayDir =
M->getOperand(6);
10590 SDValue TDescr =
M->getOperand(8);
10595 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10600 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10601 const unsigned NumVDataDwords = 10;
10602 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10604 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10605 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10606 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10610 Ops.push_back(NodePtr);
10613 {DAG.getBitcast(MVT::i32, RayExtent),
10614 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10615 Ops.push_back(RayOrigin);
10616 Ops.push_back(RayDir);
10617 Ops.push_back(Offsets);
10618 Ops.push_back(TDescr);
10619 Ops.push_back(
M->getChain());
10622 MachineMemOperand *MemRef =
M->getMemOperand();
10626 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10628 SDValue NodePtr =
M->getOperand(2);
10629 SDValue RayExtent =
M->getOperand(3);
10630 SDValue RayOrigin =
M->getOperand(4);
10631 SDValue RayDir =
M->getOperand(5);
10632 SDValue RayInvDir =
M->getOperand(6);
10633 SDValue TDescr =
M->getOperand(7);
10640 if (!Subtarget->hasGFX10_AEncoding()) {
10650 const unsigned NumVDataDwords = 4;
10651 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10652 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10653 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10656 const unsigned BaseOpcodes[2][2] = {
10657 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10658 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10659 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10663 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10664 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10665 : AMDGPU::MIMGEncGfx10NSA,
10666 NumVDataDwords, NumVAddrDwords);
10670 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10671 : AMDGPU::MIMGEncGfx10Default,
10672 NumVDataDwords, NumVAddrDwords);
10678 auto packLanes = [&DAG, &
Ops, &
DL](
SDValue Op,
bool IsAligned) {
10681 if (Lanes[0].getValueSizeInBits() == 32) {
10682 for (
unsigned I = 0;
I < 3; ++
I)
10689 Ops.push_back(Lanes[2]);
10701 if (UseNSA && IsGFX11Plus) {
10702 Ops.push_back(NodePtr);
10704 Ops.push_back(RayOrigin);
10709 for (
unsigned I = 0;
I < 3; ++
I) {
10712 {DirLanes[I], InvDirLanes[I]})));
10716 Ops.push_back(RayDir);
10717 Ops.push_back(RayInvDir);
10724 Ops.push_back(NodePtr);
10727 packLanes(RayOrigin, true);
10728 packLanes(RayDir, true);
10729 packLanes(RayInvDir, false);
10734 if (NumVAddrDwords > 12) {
10736 Ops.append(16 - Ops.size(), Undef);
10742 Ops.push_back(MergedOps);
10745 Ops.push_back(TDescr);
10747 Ops.push_back(M->getChain());
10750 MachineMemOperand *MemRef = M->getMemOperand();
10754 case Intrinsic::amdgcn_global_atomic_fmin_num:
10755 case Intrinsic::amdgcn_global_atomic_fmax_num:
10756 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10757 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10764 unsigned Opcode = 0;
10766 case Intrinsic::amdgcn_global_atomic_fmin_num:
10767 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10768 Opcode = ISD::ATOMIC_LOAD_FMIN;
10771 case Intrinsic::amdgcn_global_atomic_fmax_num:
10772 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10773 Opcode = ISD::ATOMIC_LOAD_FMAX;
10779 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10780 Ops, M->getMemOperand());
10782 case Intrinsic::amdgcn_s_get_barrier_state:
10783 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10790 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10791 BarID = (BarID >> 4) & 0x3F;
10792 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10795 Ops.push_back(Chain);
10797 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10798 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10806 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10814 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10815 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10816 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10820 EVT VT = Op->getValueType(0);
10826 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10828 return lowerImage(Op, ImageDimIntr, DAG, true);
10836 SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10843 EVT VT = VTList.VTs[0];
10846 bool IsTFE = VTList.NumVTs == 3;
10849 unsigned NumOpDWords = NumValueDWords + 1;
10851 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10852 MachineMemOperand *OpDWordsMMO =
10854 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10855 OpDWordsVT, OpDWordsMMO, DAG);
10860 NumValueDWords == 1
10869 if (!Subtarget->hasDwordx3LoadStores() &&
10870 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10874 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10876 WidenedMemVT, WidenedMMO);
10886 bool ImageStore) const {
10896 if (Subtarget->hasUnpackedD16VMem()) {
10910 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10921 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
10927 if ((NumElements % 2) == 1) {
10929 unsigned I = Elts.size() / 2;
10945 if (NumElements == 3) {
10955 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
10966 unsigned IntrinsicID = Op.getConstantOperandVal(1);
10969 switch (IntrinsicID) {
10970 case Intrinsic::amdgcn_exp_compr: {
10971 if (!Subtarget->hasCompressedExport()) {
10974 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
10986 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32, Src0),
10987 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32, Src1),
10996 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11000 case Intrinsic::amdgcn_struct_tbuffer_store:
11001 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11003 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11005 VData = handleD16VData(VData, DAG);
11006 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11007 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11025 M->getMemoryVT(), M->getMemOperand());
11028 case Intrinsic::amdgcn_raw_tbuffer_store:
11029 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11031 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11033 VData = handleD16VData(VData, DAG);
11034 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11035 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11053 M->getMemoryVT(), M->getMemOperand());
11056 case Intrinsic::amdgcn_raw_buffer_store:
11057 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11058 case Intrinsic::amdgcn_raw_buffer_store_format:
11059 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11060 const bool IsFormat =
11061 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11062 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11069 VData = handleD16VData(VData, DAG);
11079 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11080 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11100 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11103 M->getMemoryVT(), M->getMemOperand());
11106 case Intrinsic::amdgcn_struct_buffer_store:
11107 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11108 case Intrinsic::amdgcn_struct_buffer_store_format:
11109 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11110 const bool IsFormat =
11111 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11112 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11120 VData = handleD16VData(VData, DAG);
11130 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11131 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11150 EVT VDataType = VData.getValueType().getScalarType();
11152 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11155 M->getMemoryVT(), M->getMemOperand());
11157 case Intrinsic::amdgcn_raw_buffer_load_lds:
11158 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11159 case Intrinsic::amdgcn_struct_buffer_load_lds:
11160 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11161 if (!Subtarget->hasVMemToLDSLoad())
11165 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11166 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11167 unsigned OpOffset = HasVIndex ? 1 : 0;
11168 SDValue VOffset = Op.getOperand(5 + OpOffset);
11170 unsigned Size = Op->getConstantOperandVal(4);
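// The LDS-load opcode variant encodes which address sources are present:
// BOTHEN when both a vector index and a vector offset are used, IDXEN for
// index only, OFFEN for offset only, and OFFSET when neither is present.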
11176 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11177 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11178 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11179 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11182 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11183 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11184 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11185 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11188 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11189 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11190 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11191 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11194 if (!Subtarget->hasLDSLoadB96_B128())
11196 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11197 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11198 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11199 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11202 if (!Subtarget->hasLDSLoadB96_B128())
11204 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11205 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11206 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11207 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11215 if (HasVIndex && HasVOffset)
11219 else if (HasVIndex)
11220 Ops.push_back(Op.getOperand(5));
11221 else if (HasVOffset)
11222 Ops.push_back(VOffset);
11224 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11225 Ops.push_back(Rsrc);
11226 Ops.push_back(Op.getOperand(6 + OpOffset));
11227 Ops.push_back(Op.getOperand(7 + OpOffset));
11229 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11242 MachineMemOperand *LoadMMO = M->getMemOperand();
11247 MachinePointerInfo StorePtrI = LoadPtrI;
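// The buffer-to-LDS copy is modeled with two memory operands derived from
// the same intrinsic MMO: one describing the buffer (load) side and one
// describing the LDS (store) side.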
11271 case Intrinsic::amdgcn_load_to_lds:
11272 case Intrinsic::amdgcn_global_load_lds: {
11273 if (!Subtarget->hasVMemToLDSLoad())
11277 unsigned Size = Op->getConstantOperandVal(4);
11282 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11285 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11288 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11291 if (!Subtarget->hasLDSLoadB96_B128())
11293 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11296 if (!Subtarget->hasLDSLoadB96_B128())
11298 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11314 if (LHS->isDivergent())
11318 RHS.getOperand(0).getValueType() == MVT::i32) {
11321 VOffset = RHS.getOperand(0);
11325 Ops.push_back(Addr);
11333 Ops.push_back(VOffset);
11336 Ops.push_back(Op.getOperand(5));
11337 Ops.push_back(Op.getOperand(6));
11342 MachineMemOperand *LoadMMO = M->getMemOperand();
11344 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11345 MachinePointerInfo StorePtrI = LoadPtrI;
11364 case Intrinsic::amdgcn_end_cf:
11366 Op->getOperand(2), Chain),
11368 case Intrinsic::amdgcn_s_barrier_init:
11369 case Intrinsic::amdgcn_s_barrier_signal_var: {
11376 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11377 ? AMDGPU::S_BARRIER_INIT_M0
11378 : AMDGPU::S_BARRIER_SIGNAL_M0;
11393 constexpr unsigned ShAmt = 16;
11400 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11405 case Intrinsic::amdgcn_s_barrier_join: {
11414 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11417 unsigned BarID = (BarVal >> 4) & 0x3F;
11420 Ops.push_back(Chain);
11422 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11432 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11438 case Intrinsic::amdgcn_s_prefetch_data: {
11441 return Op.getOperand(0);
11444 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11446 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11453 Op->getVTList(), Ops, M->getMemoryVT(),
11454 M->getMemOperand());
11456 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11457 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11458 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11467 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11469 return lowerImage(Op, ImageDimIntr, DAG, true);
11494 std::pair<SDValue, SDValue>
11524 unsigned Overflow = ImmOffset & ~MaxImm;
11525 ImmOffset -= Overflow;
11526 if ((int32_t)Overflow < 0) {
11527 Overflow += ImmOffset;
11532 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
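// splitBufferOffsets/setBufferOffsets: a combined byte offset is split into
// an immediate that fits the MUBUF offset field and an overflow part that is
// materialized separately as an SGPR/VGPR offset operand.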
11551 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11553 Align Alignment) const {
11555 SDLoc DL(CombinedOffset);
11557 uint32_t Imm = C->getZExtValue();
11558 uint32_t SOffset, ImmOffset;
11559 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11569 uint32_t SOffset, ImmOffset;
11572 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11580 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11589 SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11592 return MaybePointer;
11606 SDValue NumRecords = Op->getOperand(3);
11609 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11612 std::optional<uint32_t> ConstStride = std::nullopt;
11614 ConstStride = ConstNode->getZExtValue();
11617 if (!ConstStride || *ConstStride != 0) {
11620 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
11631 NewHighHalf, NumRecords, Flags);
11632 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
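// The buffer resource descriptor is assembled from the split base pointer,
// the stride shifted into the high half, the record count and the flags
// word, and is then bitcast to a single 128-bit value.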
11641 bool IsTFE) const {
11650 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11665 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11669 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11679 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11683 Ops[1] = BufferStoreExt;
11688 M->getMemOperand());
11713 DAGCombinerInfo &DCI) const {
11714 SelectionDAG &DAG = DCI.DAG;
11729 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11736 "unexpected vector extload");
11749 "unexpected fp extload");
11767 DCI.AddToWorklist(Cvt.getNode());
11772 DCI.AddToWorklist(Cvt.getNode());
11775 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11783 if (Info.isEntryFunction())
11784 return Info.getUserSGPRInfo().hasFlatScratchInit();
11792 EVT MemVT = Load->getMemoryVT();
11793 MachineMemOperand *MMO = Load->getMemOperand();
11805 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11833 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
11834 "Custom lowering for non-i32 vectors hasn't been implemented.");
11837 unsigned AS = Load->getAddressSpace();
11844 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11848 !Subtarget->hasMultiDwordFlatScratchAddressing())
11858 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
11861 Alignment >= Align(4) && NumElements < 32) {
11863 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11875 if (NumElements > 4)
11878 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11888 switch (Subtarget->getMaxPrivateElementSize()) {
11894 if (NumElements > 2)
11899 if (NumElements > 4)
11902 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11911 auto Flags = Load->getMemOperand()->getFlags();
11913 Load->getAlign(), Flags, &Fast) &&
11922 MemVT, *Load->getMemOperand())) {
11931 EVT VT = Op.getValueType();
11958 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
11968 EVT VT = Op.getValueType();
11969 const SDNodeFlags Flags = Op->getFlags();
11971 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
11977 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11980 if (CLHS->isExactlyValue(1.0)) {
11997 if (CLHS->isExactlyValue(-1.0)) {
12006 if (!AllowInaccurateRcp &&
12007 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12021 EVT VT = Op.getValueType();
12022 const SDNodeFlags Flags = Op->getFlags();
12024 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12025 if (!AllowInaccurateDiv)
12046 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12060 return DAG.getNode(Opcode, SL, VTList,
12069 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12083 return DAG.getNode(Opcode, SL, VTList,
12089 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12090 return FastLowered;
12093 EVT VT = Op.getValueType();
12100 if (VT == MVT::bf16) {
12123 unsigned FMADOpCode =
12125 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12130 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12132 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
12133 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12139 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12149 SDNodeFlags Flags = Op->getFlags();
12156 const APFloat K0Val(0x1p+96f);
12159 const APFloat K1Val(0x1p-32f);
12186 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12187 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12188 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12193 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12194 return FastLowered;
12200 SDNodeFlags Flags = Op->getFlags();
12201 Flags.setNoFPExcept(true);
12209 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
12220 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12222 using namespace AMDGPU::Hwreg;
12223 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12227 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12228 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12231 const bool HasDynamicDenormals =
12237 if (!PreservesDenormals) {
12242 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12245 if (HasDynamicDenormals) {
12249 SavedDenormMode = SDValue(GetReg, 0);
12255 SDNode *EnableDenorm;
12256 if (Subtarget->hasDenormModeInst()) {
12257 const SDValue EnableDenormValue =
12264 const SDValue EnableDenormValue =
12266 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12267 {EnableDenormValue, BitField, Glue});
12277 ApproxRcp, One, NegDivScale0, Flags);
12280 ApproxRcp, Fma0, Flags);
12286 NumeratorScaled, Mul, Flags);
12292 NumeratorScaled, Fma3, Flags);
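// The f32 division sequence follows the usual AMDGPU pattern: div_scale
// pre-scales the operands, the reciprocal is refined with a chain of FMAs,
// and div_fmas/div_fixup produce the final result, with the FP32 denormal
// mode enabled around the FMA chain when the default mode flushes denormals.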
12294 if (!PreservesDenormals) {
12295 SDNode *DisableDenorm;
12296 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12300 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12306 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12307 const SDValue DisableDenormValue =
12308 HasDynamicDenormals
12313 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12324 {Fma4, Fma1, Fma3, Scale}, Flags);
12330 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12331 return FastLowered;
12339 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12343 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12363 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12372 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12373 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12399 EVT VT = Op.getValueType();
12401 if (VT == MVT::f32)
12402 return LowerFDIV32(Op, DAG);
12404 if (VT == MVT::f64)
12405 return LowerFDIV64(Op, DAG);
12407 if (VT == MVT::f16 || VT == MVT::bf16)
12408 return LowerFDIV16(Op, DAG);
12417 EVT ResultExpVT = Op->getValueType(1);
12418 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12428 if (Subtarget->hasFractBug()) {
12446 EVT VT = Store->getMemoryVT();
12448 if (VT == MVT::i1) {
12452 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12456 Store->getValue().getValueType().getScalarType() == MVT::i32);
12458 unsigned AS = Store->getAddressSpace();
12466 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12470 !Subtarget->hasMultiDwordFlatScratchAddressing())
12477 if (NumElements > 4)
12480 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12484 VT, *Store->getMemOperand()))
12490 switch (Subtarget->getMaxPrivateElementSize()) {
12494 if (NumElements > 2)
12498 if (NumElements > 4 ||
12499 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12507 auto Flags = Store->getMemOperand()->getFlags();
12526 assert(!Subtarget->has16BitInsts());
12527 SDNodeFlags Flags = Op->getFlags();
12529 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12541 SDNodeFlags Flags = Op->getFlags();
12542 MVT VT = Op.getValueType().getSimpleVT();
12572 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12575 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12584 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12650 SDNodeFlags Flags = Op->getFlags();
12696 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12713 EVT VT = Op.getValueType();
12723 if (Subtarget->hasTrigReducedRange()) {
12730 switch (Op.getOpcode()) {
12757 EVT VT = Op.getValueType();
12765 Op->getVTList(), Ops, VT,
12774 SITargetLowering::performUCharToFloatCombine(SDNode *N,
12775 DAGCombinerInfo &DCI) const {
12776 EVT VT = N->getValueType(0);
12778 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12781 SelectionDAG &DAG = DCI.DAG;
12785 EVT SrcVT = Src.getValueType();
12791 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12794 DCI.AddToWorklist(Cvt.getNode());
12797 if (ScalarVT != MVT::f32) {
12809 DAGCombinerInfo &DCI) const {
12816 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
12820 SelectionDAG &DAG = DCI.DAG;
12839 for (unsigned I = 0; I != NumElts; ++I) {
12863 if (NewElts.size() == 1)
12885 for (unsigned I = 0; I != NumElts; ++I) {
12920 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12922 DAGCombinerInfo &DCI) const {
12940 SelectionDAG &DAG = DCI.DAG;
12953 AM.BaseOffs = Offset.getSExtValue();
12958 EVT VT = N->getValueType(0);
12964 Flags.setNoUnsignedWrap(
12965 N->getFlags().hasNoUnsignedWrap() &&
12975 switch (N->getOpcode()) {
12986 DAGCombinerInfo &DCI) const {
12987 SelectionDAG &DAG = DCI.DAG;
12994 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
12995 N->getMemoryVT(), DCI);
12999 NewOps[PtrIdx] = NewPtr;
13008 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13009 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13018 SDValue SITargetLowering::splitBinaryBitConstantOp(
13022 uint32_t ValLo = Lo_32(Val);
13023 uint32_t ValHi = Hi_32(Val);
13030 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13044 if (V.getValueType() != MVT::i1)
13046 switch (V.getOpcode()) {
13063 return V.getResNo() == 1;
13065 unsigned IntrinsicID = V.getConstantOperandVal(0);
13066 switch (IntrinsicID) {
13067 case Intrinsic::amdgcn_is_shared:
13068 case Intrinsic::amdgcn_is_private:
13085 if (!(C & 0x000000ff))
13086 ZeroByteMask |= 0x000000ff;
13087 if (!(C & 0x0000ff00))
13088 ZeroByteMask |= 0x0000ff00;
13089 if (!(C & 0x00ff0000))
13090 ZeroByteMask |= 0x00ff0000;
13091 if (!(C & 0xff000000))
13092 ZeroByteMask |= 0xff000000;
13093 uint32_t NonZeroByteMask = ~ZeroByteMask;
13094 if ((NonZeroByteMask & C) != NonZeroByteMask)
13107 assert(V.getValueSizeInBits() == 32);
13109 if (V.getNumOperands() != 2)
13118 switch (V.getOpcode()) {
13123 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13128 return (0x03020100 & ~ConstMask) | ConstMask;
13135 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13141 return uint32_t(0x0c0c0c0c03020100ull >> C);
13148 DAGCombinerInfo &DCI) const {
13149 if (DCI.isBeforeLegalize())
13152 SelectionDAG &DAG = DCI.DAG;
13153 EVT VT = N->getValueType(0);
13158 if (VT == MVT::i64 && CRHS) {
13160 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
13164 if (CRHS && VT == MVT::i32) {
13174 unsigned Shift = CShift->getZExtValue();
13176 unsigned Offset = NB + Shift;
13177 if ((Offset & (Bits - 1)) == 0) {
13201 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13216 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
13221 if (X != LHS.getOperand(1))
13225 const ConstantFPSDNode *C1 =
13259 (RHS.getOperand(0) == LHS.getOperand(0) &&
13260 LHS.getOperand(0) == LHS.getOperand(1))) {
13262 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13263 : Mask->getZExtValue() & OrdMask;
13284 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13287 if (LHSMask != ~0u && RHSMask != ~0u) {
13290 if (LHSMask > RHSMask) {
13297 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13298 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13301 if (!(LHSUsedLanes & RHSUsedLanes) &&
13304 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13310 uint32_t Mask = LHSMask & RHSMask;
13311 for (unsigned I = 0; I < 32; I += 8) {
13312 uint32_t ByteSel = 0xff << I;
13313 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13314 Mask &= (0x0c << I) & 0xffffffff;
13319 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
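// When both AND operands are byte-select (v_perm-like) patterns over
// disjoint lanes, the AND is folded into a single V_PERM_B32; a 0x0c byte
// selector denotes a constant-zero byte in the combined mask.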
13372 static const std::optional<ByteProvider<SDValue>>
13374 unsigned Depth = 0) {
13377 return std::nullopt;
13379 if (Op.getValueSizeInBits() < 8)
13380 return std::nullopt;
13382 if (Op.getValueType().isVector())
13385 switch (Op->getOpcode()) {
13397 NarrowVT = VTSign->getVT();
13400 return std::nullopt;
13403 if (SrcIndex >= NarrowByteWidth)
13404 return std::nullopt;
13412 return std::nullopt;
13414 uint64_t BitShift = ShiftOp->getZExtValue();
13416 if (BitShift % 8 != 0)
13417 return std::nullopt;
13419 SrcIndex += BitShift / 8;
13437 static const std::optional<ByteProvider<SDValue>>
13439 unsigned StartingIndex = 0) {
13443 return std::nullopt;
13445 unsigned BitWidth = Op.getScalarValueSizeInBits();
13447 return std::nullopt;
13449 return std::nullopt;
13451 bool IsVec = Op.getValueType().isVector();
13452 switch (Op.getOpcode()) {
13455 return std::nullopt;
13460 return std::nullopt;
13464 return std::nullopt;
13467 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13468 return std::nullopt;
13469 if (!LHS || LHS->isConstantZero())
13471 if (!RHS || RHS->isConstantZero())
13473 return std::nullopt;
13478 return std::nullopt;
13482 return std::nullopt;
13484 uint32_t BitMask = BitMaskOp->getZExtValue();
13486 uint32_t IndexMask = 0xFF << (Index * 8);
13488 if ((IndexMask & BitMask) != IndexMask) {
13491 if (IndexMask & BitMask)
13492 return std::nullopt;
13501 return std::nullopt;
13505 if (!ShiftOp || Op.getValueType().isVector())
13506 return std::nullopt;
13508 uint64_t BitsProvided = Op.getValueSizeInBits();
13509 if (BitsProvided % 8 != 0)
13510 return std::nullopt;
13512 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13514 return std::nullopt;
13516 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13517 uint64_t ByteShift = BitShift / 8;
13519 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13520 uint64_t BytesProvided = BitsProvided / 8;
13521 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13522 NewIndex %= BytesProvided;
13529 return std::nullopt;
13533 return std::nullopt;
13535 uint64_t BitShift = ShiftOp->getZExtValue();
13537 return std::nullopt;
13539 auto BitsProvided = Op.getScalarValueSizeInBits();
13540 if (BitsProvided % 8 != 0)
13541 return std::nullopt;
13543 uint64_t BytesProvided = BitsProvided / 8;
13544 uint64_t ByteShift = BitShift / 8;
13549 return BytesProvided - ByteShift > Index
13557 return std::nullopt;
13561 return std::nullopt;
13563 uint64_t BitShift = ShiftOp->getZExtValue();
13564 if (BitShift % 8 != 0)
13565 return std::nullopt;
13566 uint64_t ByteShift = BitShift / 8;
13572 return Index < ByteShift
13575 Depth + 1, StartingIndex);
13584 return std::nullopt;
13592 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13594 if (NarrowBitWidth % 8 != 0)
13595 return std::nullopt;
13596 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13598 if (Index >= NarrowByteWidth)
13600 ? std::optional<ByteProvider<SDValue>>(
13608 return std::nullopt;
13612 if (NarrowByteWidth >= Index) {
13617 return std::nullopt;
13624 return std::nullopt;
13630 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13631 if (NarrowBitWidth % 8 != 0)
13632 return std::nullopt;
13633 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13638 if (Index >= NarrowByteWidth) {
13640 ? std::optional<ByteProvider<SDValue>>(
13645 if (NarrowByteWidth > Index) {
13649 return std::nullopt;
13654 return std::nullopt;
13657 Depth + 1, StartingIndex);
13663 return std::nullopt;
13664 auto VecIdx = IdxOp->getZExtValue();
13665 auto ScalarSize = Op.getScalarValueSizeInBits();
13666 if (ScalarSize < 32)
13667 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13669 StartingIndex, Index);
13674 return std::nullopt;
13678 return std::nullopt;
13681 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13682 if (IdxMask > 0x07 && IdxMask != 0x0c)
13683 return std::nullopt;
13685 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13686 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13688 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13694 return std::nullopt;
13709 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13716 auto MemVT = L->getMemoryVT();
13719 return L->getMemoryVT().getSizeInBits() == 16;
13729 int Low8 = Mask & 0xff;
13730 int Hi8 = (Mask & 0xff00) >> 8;
13732 assert(Low8 < 8 && Hi8 < 8);
13734 bool IsConsecutive = (Hi8 - Low8 == 1);
13739 bool Is16Aligned = !(Low8 % 2);
13741 return IsConsecutive && Is16Aligned;
13749 int Low16 = PermMask & 0xffff;
13750 int Hi16 = (PermMask & 0xffff0000) >> 16;
13760 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13762 if (!OtherOpIs16Bit)
13770 unsigned DWordOffset) {
13775 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13780 if (Src.getValueType().isVector()) {
13781 auto ScalarTySize = Src.getScalarValueSizeInBits();
13782 auto ScalarTy = Src.getValueType().getScalarType();
13783 if (ScalarTySize == 32) {
13787 if (ScalarTySize > 32) {
13790 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13791 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13798 assert(ScalarTySize < 32);
13799 auto NumElements = TypeSize / ScalarTySize;
13800 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13801 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13802 auto NumElementsIn32 = 32 / ScalarTySize;
13803 auto NumAvailElements = DWordOffset < Trunc32Elements
13805 : NumElements - NormalizedTrunc;
13818 auto ShiftVal = 32 * DWordOffset;
13826 [[maybe_unused]] EVT VT = N->getValueType(0);
13831 for (int i = 0; i < 4; i++) {
13833 std::optional<ByteProvider<SDValue>> P =
13836 if (!P || P->isConstantZero())
13841 if (PermNodes.size() != 4)
13844 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13845 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13847 for (size_t i = 0; i < PermNodes.size(); i++) {
13848 auto PermOp = PermNodes[i];
13851 int SrcByteAdjust = 4;
13855 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13856 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13858 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13859 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13863 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13864 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13867 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13869 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13872 SDValue Op = *PermNodes[FirstSrc.first].Src;
13874 assert(Op.getValueSizeInBits() == 32);
13878 int Low16 = PermMask & 0xffff;
13879 int Hi16 = (PermMask & 0xffff0000) >> 16;
13881 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13882 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13885 if (WellFormedLow && WellFormedHi)
13889 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13898 assert(Op.getValueType().isByteSized() &&
13916 DAGCombinerInfo &DCI) const {
13917 SelectionDAG &DAG = DCI.DAG;
13921 EVT VT = N->getValueType(0);
13922 if (VT == MVT::i1) {
13927 if (Src != RHS.getOperand(0))
13932 if (!CLHS || !CRHS)
13936 static const uint32_t MaxMask = 0x3ff;
13956 Sel |= LHS.getConstantOperandVal(2);
13965 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13969 auto usesCombinedOperand = [](SDNode *OrUse) {
13971 if (OrUse->getOpcode() != ISD::BITCAST ||
13972 !OrUse->getValueType(0).isVector())
13976 for (auto *VUser : OrUse->users()) {
13977 if (!VUser->getValueType(0).isVector())
13984 if (VUser->getOpcode() == VectorwiseOp)
13990 if (!any_of(N->users(), usesCombinedOperand))
13996 if (LHSMask != ~0u && RHSMask != ~0u) {
13999 if (LHSMask > RHSMask) {
14006 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14007 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14010 if (!(LHSUsedLanes & RHSUsedLanes) &&
14013 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14015 LHSMask &= ~RHSUsedLanes;
14016 RHSMask &= ~LHSUsedLanes;
14018 LHSMask |= LHSUsedLanes & 0x04040404;
14020 uint32_t Sel = LHSMask | RHSMask;
14028 if (LHSMask == ~0u || RHSMask == ~0u) {
14069 return IdentitySrc;
14075 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14090 if (SrcVT == MVT::i32) {
14095 DCI.AddToWorklist(LowOr.getNode());
14096 DCI.AddToWorklist(HiBits.getNode());
14100 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14107 N->getOperand(0), CRHS))
14115 DAGCombinerInfo &DCI) const {
14116 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14123 SelectionDAG &DAG = DCI.DAG;
14125 EVT VT = N->getValueType(0);
14126 if (CRHS && VT == MVT::i64) {
14128 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14135 unsigned Opc = LHS.getOpcode();
14159 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14161 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14165 LHS->getOperand(0), FNegLHS, FNegRHS);
14166 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14174 DAGCombinerInfo &DCI) const {
14175 if (!Subtarget->has16BitInsts() ||
14179 EVT VT = N->getValueType(0);
14180 if (VT != MVT::i32)
14184 if (Src.getValueType() != MVT::i16)
14191 SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14192 DAGCombinerInfo &DCI) const {
14199 VTSign->getVT() == MVT::i8) ||
14201 VTSign->getVT() == MVT::i16))) {
14202 assert(Subtarget->hasScalarSubwordLoads() &&
14203 "s_buffer_load_{u8, i8} are supported "
14204 "in GFX12 (or newer) architectures.");
14205 EVT VT = Src.getValueType();
14210 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14217 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14218 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14223 VTSign->getVT() == MVT::i8) ||
14225 VTSign->getVT() == MVT::i16)) &&
14234 Src.getOperand(6), Src.getOperand(7)};
14237 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14241 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14242 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14243 return DCI.DAG.getMergeValues(
14244 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
14250 DAGCombinerInfo &DCI) const {
14251 SelectionDAG &DAG = DCI.DAG;
14258 if (N->getOperand(0).isUndef())
14265 DAGCombinerInfo &DCI) const {
14266 EVT VT = N->getValueType(0);
14281 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14291 unsigned MaxDepth) const {
14292 unsigned Opcode = Op.getOpcode();
14297 const auto &F = CFP->getValueAPF();
14298 if (F.isNaN() && F.isSignaling())
14300 if (!F.isDenormal())
14326 case ISD::FP_EXTEND:
14327 case ISD::FP16_TO_FP:
14328 case ISD::FP_TO_FP16:
14329 case ISD::BF16_TO_FP:
14330 case ISD::FP_TO_BF16:
14363 if (Op.getValueType() == MVT::i32) {
14369 if (RHS->getZExtValue() == 0xffff0000) {
14379 return Op.getValueType().getScalarType() != MVT::f16;
14383 case ISD::FMINNUM_IEEE:
14384 case ISD::FMAXNUM_IEEE:
14385 case ISD::FMINIMUM:
14386 case ISD::FMAXIMUM:
14387 case ISD::FMINIMUMNUM:
14388 case ISD::FMAXIMUMNUM:
14400 if (Subtarget->supportsMinMaxDenormModes() ||
14410 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14422 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14449 if (Op.getValueType() == MVT::i16) {
14452 TruncSrc.getOpcode() == ISD::BITCAST &&
14460 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14462 switch (IntrinsicID) {
14463 case Intrinsic::amdgcn_cvt_pkrtz:
14464 case Intrinsic::amdgcn_cubeid:
14465 case Intrinsic::amdgcn_frexp_mant:
14466 case Intrinsic::amdgcn_fdot2:
14467 case Intrinsic::amdgcn_rcp:
14468 case Intrinsic::amdgcn_rsq:
14469 case Intrinsic::amdgcn_rsq_clamp:
14470 case Intrinsic::amdgcn_rcp_legacy:
14471 case Intrinsic::amdgcn_rsq_legacy:
14472 case Intrinsic::amdgcn_trig_preop:
14473 case Intrinsic::amdgcn_tanh:
14474 case Intrinsic::amdgcn_log:
14475 case Intrinsic::amdgcn_exp2:
14476 case Intrinsic::amdgcn_sqrt:
14494 unsigned MaxDepth) const {
14497 unsigned Opcode = MI->getOpcode();
14499 if (Opcode == AMDGPU::G_FCANONICALIZE)
14502 std::optional<FPValueAndVReg> FCR;
14505 if (FCR->Value.isSignaling())
14507 if (!FCR->Value.isDenormal())
14518 case AMDGPU::G_FADD:
14519 case AMDGPU::G_FSUB:
14520 case AMDGPU::G_FMUL:
14521 case AMDGPU::G_FCEIL:
14522 case AMDGPU::G_FFLOOR:
14523 case AMDGPU::G_FRINT:
14524 case AMDGPU::G_FNEARBYINT:
14525 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14526 case AMDGPU::G_INTRINSIC_TRUNC:
14527 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14528 case AMDGPU::G_FMA:
14529 case AMDGPU::G_FMAD:
14530 case AMDGPU::G_FSQRT:
14531 case AMDGPU::G_FDIV:
14532 case AMDGPU::G_FREM:
14533 case AMDGPU::G_FPOW:
14534 case AMDGPU::G_FPEXT:
14535 case AMDGPU::G_FLOG:
14536 case AMDGPU::G_FLOG2:
14537 case AMDGPU::G_FLOG10:
14538 case AMDGPU::G_FPTRUNC:
14539 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14540 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14541 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14542 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14543 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14545 case AMDGPU::G_FNEG:
14546 case AMDGPU::G_FABS:
14547 case AMDGPU::G_FCOPYSIGN:
14549 case AMDGPU::G_FMINNUM:
14550 case AMDGPU::G_FMAXNUM:
14551 case AMDGPU::G_FMINNUM_IEEE:
14552 case AMDGPU::G_FMAXNUM_IEEE:
14553 case AMDGPU::G_FMINIMUM:
14554 case AMDGPU::G_FMAXIMUM:
14555 case AMDGPU::G_FMINIMUMNUM:
14556 case AMDGPU::G_FMAXIMUMNUM: {
14557 if (Subtarget->supportsMinMaxDenormModes() ||
14564 case AMDGPU::G_BUILD_VECTOR:
14569 case AMDGPU::G_INTRINSIC:
14570 case AMDGPU::G_INTRINSIC_CONVERGENT:
14572 case Intrinsic::amdgcn_fmul_legacy:
14573 case Intrinsic::amdgcn_fmad_ftz:
14574 case Intrinsic::amdgcn_sqrt:
14575 case Intrinsic::amdgcn_fmed3:
14576 case Intrinsic::amdgcn_sin:
14577 case Intrinsic::amdgcn_cos:
14578 case Intrinsic::amdgcn_log:
14579 case Intrinsic::amdgcn_exp2:
14580 case Intrinsic::amdgcn_log_clamp:
14581 case Intrinsic::amdgcn_rcp:
14582 case Intrinsic::amdgcn_rcp_legacy:
14583 case Intrinsic::amdgcn_rsq:
14584 case Intrinsic::amdgcn_rsq_clamp:
14585 case Intrinsic::amdgcn_rsq_legacy:
14586 case Intrinsic::amdgcn_div_scale:
14587 case Intrinsic::amdgcn_div_fmas:
14588 case Intrinsic::amdgcn_div_fixup:
14589 case Intrinsic::amdgcn_fract:
14590 case Intrinsic::amdgcn_cvt_pkrtz:
14591 case Intrinsic::amdgcn_cubeid:
14592 case Intrinsic::amdgcn_cubema:
14593 case Intrinsic::amdgcn_cubesc:
14594 case Intrinsic::amdgcn_cubetc:
14595 case Intrinsic::amdgcn_frexp_mant:
14596 case Intrinsic::amdgcn_fdot2:
14597 case Intrinsic::amdgcn_trig_preop:
14598 case Intrinsic::amdgcn_tanh:
14617 if (C.isDenormal()) {
14631 if (C.isSignaling()) {
14654 SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14655 DAGCombinerInfo &DCI) const {
14656 SelectionDAG &DAG = DCI.DAG;
14658 EVT VT = N->getValueType(0);
14667 EVT VT = N->getValueType(0);
14668 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14684 EVT EltVT = Lo.getValueType();
14687 for (unsigned I = 0; I != 2; ++I) {
14691 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14692 } else if (Op.isUndef()) {
14726 case ISD::FMAXNUM_IEEE:
14727 case ISD::FMAXIMUMNUM:
14729 case ISD::FMAXIMUM:
14736 case ISD::FMINNUM_IEEE:
14737 case ISD::FMINIMUMNUM:
14739 case ISD::FMINIMUM:
14765 if (!MinK || !MaxK)
14778 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14779 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14838 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14844 if (Info->getMode().DX10Clamp) {
14853 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14881 case ISD::FMINNUM_IEEE:
14882 case ISD::FMAXNUM_IEEE:
14883 case ISD::FMINIMUMNUM:
14884 case ISD::FMAXIMUMNUM:
14887 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
14889 case ISD::FMINIMUM:
14890 case ISD::FMAXIMUM:
14898 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
14907 DAGCombinerInfo &DCI) const {
14908 SelectionDAG &DAG = DCI.DAG;
14940 if (SDValue Med3 = performIntMed3ImmCombine(
14945 if (SDValue Med3 = performIntMed3ImmCombine(
14951 if (SDValue Med3 = performIntMed3ImmCombine(
14956 if (SDValue Med3 = performIntMed3ImmCombine(
14966 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14967 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14968 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14971 (VT == MVT::f32 || VT == MVT::f64 ||
14972 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14973 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14974 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14975 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14977 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
14984 const SDNodeFlags Flags = N->getFlags();
14985 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
14986 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
14988 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
14989 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
14999 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15000 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15009 DAGCombinerInfo &DCI) const {
15010 EVT VT = N->getValueType(0);
15014 SelectionDAG &DAG = DCI.DAG;
15029 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15033 if (Info->getMode().DX10Clamp) {
15053 DAGCombinerInfo &DCI) const {
15057 return DCI.DAG.getUNDEF(N->getValueType(0));
15065 bool IsDivergentIdx,
15070 unsigned VecSize = EltSize * NumElem;
15073 if (VecSize <= 64 && EltSize < 32)
15082 if (IsDivergentIdx)
15086 unsigned NumInsts = NumElem +
15087 ((EltSize + 31) / 32) * NumElem ;
15091 if (Subtarget->useVGPRIndexMode())
15092 return NumInsts <= 16;
15096 if (Subtarget->hasMovrel())
15097 return NumInsts <= 15;
15103 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15118 SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15119 DAGCombinerInfo &DCI) const {
15125 EVT ResVT = N->getValueType(0);
15149 if (!C || C->getZExtValue() != 0x1f)
15165 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15184 case ISD::FMAXNUM_IEEE:
15185 case ISD::FMINNUM_IEEE:
15186 case ISD::FMAXIMUM:
15187 case ISD::FMINIMUM: {
15193 DCI.AddToWorklist(Elt0.getNode());
15194 DCI.AddToWorklist(Elt1.getNode());
15216 if (!DCI.isBeforeLegalize())
15224 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15227 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15228 unsigned EltIdx = BitIndex / 32;
15229 unsigned LeftoverBitIdx = BitIndex % 32;
15233 DCI.AddToWorklist(Cast.getNode());
15237 DCI.AddToWorklist(Elt.getNode());
15240 DCI.AddToWorklist(Srl.getNode());
15244 DCI.AddToWorklist(Trunc.getNode());
15246 if (VecEltVT == ResVT) {
15247 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15258 SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15259 DAGCombinerInfo &DCI) const {
15270 SelectionDAG &DAG = DCI.DAG;
15289 if (Src.getOpcode() == ISD::FP_EXTEND &&
15290 Src.getOperand(0).getValueType() == MVT::f16) {
15291 return Src.getOperand(0);
15295 APFloat Val = CFP->getValueAPF();
15296 bool LosesInfo = true;
15306 DAGCombinerInfo &DCI) const {
15307 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15308 "combine only useful on gfx8");
15310 SDValue TruncSrc = N->getOperand(0);
15311 EVT VT = N->getValueType(0);
15312 if (VT != MVT::f16)
15319 SelectionDAG &DAG = DCI.DAG;
15347 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15350 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15352 const SDNode *N1) const {
15357 if (((VT == MVT::f32 &&
15359 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15379 EVT VT = N->getValueType(0);
15380 if (VT != MVT::i32 && VT != MVT::i64)
15386 unsigned Opc = N->getOpcode();
15441 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15460 DAGCombinerInfo &DCI) const {
15463 SelectionDAG &DAG = DCI.DAG;
15464 EVT VT = N->getValueType(0);
15474 if (!N->isDivergent() && Subtarget->hasSMulHi())
15478 if (NumBits <= 32 || NumBits > 64)
15489 if (!Subtarget->hasFullRate64Ops()) {
15490 unsigned NumUsers = 0;
15491 for (SDNode *User : LHS->users()) {
15494 if (!User->isAnyAdd())
15518 bool MulSignedLo = false;
15519 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15528 if (VT != MVT::i64) {
15551 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15553 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15554 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15556 if (!MulLHSUnsigned32) {
15563 if (!MulRHSUnsigned32) {
15574 if (VT != MVT::i64)
15580 SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15581 DAGCombinerInfo &DCI) const {
15591 SelectionDAG &DAG = DCI.DAG;
15606 unsigned Opcode = N->getOpcode();
15607 if (Opcode == ISD::PTRADD)
15610 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15621 static std::optional<ByteProvider<SDValue>>
15624 if (!Byte0 || Byte0->isConstantZero()) {
15625 return std::nullopt;
15628 if (Byte1 && !Byte1->isConstantZero()) {
15629 return std::nullopt;
15635 unsigned FirstCs = First & 0x0c0c0c0c;
15636 unsigned SecondCs = Second & 0x0c0c0c0c;
15637 unsigned FirstNoCs = First & ~0x0c0c0c0c;
15638 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15640 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15641 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15642 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15643 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15645 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
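// addPermMasks merges two byte-select masks: bytes marked 0x0c (constant
// zero) in one mask are taken from the other, and the asserts above check
// that every byte position is zeroed in at least one of the inputs.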
15669 for (int BPI = 0; BPI < 2; BPI++) {
15672 BPP = {Src1, Src0};
15674 unsigned ZeroMask = 0x0c0c0c0c;
15675 unsigned FMask = 0xFF << (8 * (3 - Step));
15677 unsigned FirstMask =
15678 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15679 unsigned SecondMask =
15680 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15684 int FirstGroup = -1;
15685 for (int I = 0; I < 2; I++) {
15687 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15688 return IterElt.SrcOp == *BPP.first.Src &&
15689 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15693 if (Match != Srcs.end()) {
15694 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15699 if (FirstGroup != -1) {
15701 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15702 return IterElt.SrcOp == *BPP.second.Src &&
15703 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15706 if (Match != Srcs.end()) {
15707 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15709 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15717 unsigned ZeroMask = 0x0c0c0c0c;
15718 unsigned FMask = 0xFF << (8 * (3 - Step));
15722 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15726 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15735 if (Srcs.size() == 1) {
15736 auto *Elt = Srcs.begin();
15740 if (Elt->PermMask == 0x3020100)
15747 auto *FirstElt = Srcs.begin();
15748 auto *SecondElt = std::next(FirstElt);
15755 auto FirstMask = FirstElt->PermMask;
15756 auto SecondMask = SecondElt->PermMask;
15758 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15759 unsigned FirstPlusFour = FirstMask | 0x04040404;
15762 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15774 FirstElt = std::next(SecondElt);
15775 if (FirstElt == Srcs.end())
15778 SecondElt = std::next(FirstElt);
15781 if (SecondElt == Srcs.end()) {
15787 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15793 return Perms.size() == 2
15799 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15800 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15801 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15802 EntryMask += ZeroMask;
15807 auto Opcode = Op.getOpcode();
15813 static std::optional<bool>
15824 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15827 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15829 assert(!(S0IsUnsigned && S0IsSigned));
15830 assert(!(S1IsUnsigned && S1IsSigned));
15838 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15844 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15845 return std::nullopt;
15857 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15858 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15863 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15869 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15870 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15871 return std::nullopt;
15877 DAGCombinerInfo &DCI) const {
15878 SelectionDAG &DAG = DCI.DAG;
15879 EVT VT = N->getValueType(0);
15885 if (Subtarget->hasMad64_32()) {
15886 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15891 if (SDValue V = reassociateScalarOps(N, DAG)) {
15895 if (VT == MVT::i64) {
15896 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15901 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
15903 std::optional<bool> IsSigned;
15909 int ChainLength = 0;
15910 for (int I = 0; I < 4; I++) {
15914 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15917 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15922 TempNode->getOperand(MulIdx), *Src0, *Src1,
15923 TempNode->getOperand(MulIdx)->getOperand(0),
15924 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15928 IsSigned = *IterIsSigned;
15929 if (*IterIsSigned != *IsSigned)
15932 auto AddIdx = 1 - MulIdx;
15935 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
15936 Src2s.push_back(TempNode->getOperand(AddIdx));
15946 TempNode->getOperand(AddIdx), *Src0, *Src1,
15947 TempNode->getOperand(AddIdx)->getOperand(0),
15948 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15952 if (*IterIsSigned != *IsSigned)
15956 ChainLength = I + 2;
15960 TempNode = TempNode->getOperand(AddIdx);
15962 ChainLength = I + 1;
15963 if (TempNode->getNumOperands() < 2)
15965 LHS = TempNode->getOperand(0);
15966 RHS = TempNode->getOperand(1);
15969 if (ChainLength < 2)
15975 if (ChainLength < 4) {
15985 bool UseOriginalSrc = false;
15986 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
15987 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
15988 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
15989 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
15990 SmallVector<unsigned, 4> SrcBytes;
15991 auto Src0Mask = Src0s.begin()->PermMask;
15992 SrcBytes.push_back(Src0Mask & 0xFF000000);
15993 bool UniqueEntries = true;
15994 for (auto I = 1; I < 4; I++) {
15995 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
15998 UniqueEntries = false;
16004 if (UniqueEntries) {
16005 UseOriginalSrc = true;
16007 auto *FirstElt = Src0s.begin();
16011 auto *SecondElt = Src1s.begin();
16013 SecondElt->DWordOffset);
16022 if (!UseOriginalSrc) {
16029 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16032 : Intrinsic::amdgcn_udot4,
16042 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16047 unsigned Opc = LHS.getOpcode();
16059 auto Cond = RHS.getOperand(0);
16064 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
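// The tail of this combine recognizes an add of a carry/boolean value and
// rewrites it into a carry-consuming node (note the getVTList(MVT::i32,
// MVT::i1) result list).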
16081 DAGCombinerInfo &DCI) const {
16082 SelectionDAG &DAG = DCI.DAG;
16084 EVT VT = N->getValueType(0);
16097 SDNodeFlags ShlFlags = N1->getFlags();
16101 SDNodeFlags NewShlFlags =
16106 DCI.AddToWorklist(Inner.getNode());
16113 if (Subtarget->hasMad64_32()) {
16114 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16123 if (VT == MVT::i64) {
16124 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16132 if (const GlobalAddressSDNode *GA =
16137 SDNodeFlags Flags =
16140 DCI.AddToWorklist(Inner.getNode());
16168 SDNodeFlags ReassocFlags =
16171 if (ZIsConstant != YIsConstant) {
16175 DCI.AddToWorklist(Inner.getNode());
16183 assert(!YIsConstant && !ZIsConstant);
16185 if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
16194 if (Y->isDivergent())
16197 DCI.AddToWorklist(UniformInner.getNode());
16205 DAGCombinerInfo &DCI) const {
16206 SelectionDAG &DAG = DCI.DAG;
16207 EVT VT = N->getValueType(0);
16209 if (VT == MVT::i64) {
16210 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16214 if (VT != MVT::i32)
16223 unsigned Opc = RHS.getOpcode();
16230 auto Cond = RHS.getOperand(0);
16235 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16253 SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16254 DAGCombinerInfo &DCI) const {
16256 if (N->getValueType(0) != MVT::i32)
16262 SelectionDAG &DAG = DCI.DAG;
16267 unsigned LHSOpc = LHS.getOpcode();
16268 unsigned Opc = N->getOpcode();
16272 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16278 DAGCombinerInfo &DCI) const {
16282 SelectionDAG &DAG = DCI.DAG;
16283 EVT VT = N->getValueType(0);
16295 if (A == LHS.getOperand(1)) {
16296 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16297 if (FusedOp != 0) {
16299 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16307 if (A == RHS.getOperand(1)) {
16308 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16309 if (FusedOp != 0) {
16311 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16320 DAGCombinerInfo &DCI) const {
16324 SelectionDAG &DAG = DCI.DAG;
16326 EVT VT = N->getValueType(0);
16339 if (A == LHS.getOperand(1)) {
16340 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16341 if (FusedOp != 0) {
16345 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16354 if (A == RHS.getOperand(1)) {
16355 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16356 if (FusedOp != 0) {
16358 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16367 DAGCombinerInfo &DCI) const {
16368 SelectionDAG &DAG = DCI.DAG;
16370 EVT VT = N->getValueType(0);
16371 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
16377 SDNodeFlags Flags = N->getFlags();
16378 SDNodeFlags RHSFlags = RHS->getFlags();
16384 bool IsNegative = false;
16385 if (CLHS->isExactlyValue(1.0) ||
16386 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16389 if (RHS.getOpcode() == ISD::FSQRT) {
16393 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16402 DAGCombinerInfo &DCI) const {
16403 SelectionDAG &DAG = DCI.DAG;
16404 EVT VT = N->getValueType(0);
16408 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16409 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16424 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16429 const ConstantFPSDNode *FalseNode =
16439 if (ScalarVT == MVT::f32 &&
16445 if (TrueNodeExpVal == INT_MIN)
16448 if (FalseNodeExpVal == INT_MIN)
16461 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
16468 DAGCombinerInfo &DCI) const {
16469 SelectionDAG &DAG = DCI.DAG;
16470 EVT VT = N->getValueType(0);
16473 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16491 (N->getFlags().hasAllowContract() &&
16492 FMA->getFlags().hasAllowContract())) {
16507 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16526 if (Vec1 == Vec2 || Vec3 == Vec4)
16532 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16541 DAGCombinerInfo &DCI) const {
16542 SelectionDAG &DAG = DCI.DAG;
16547 EVT VT = LHS.getValueType();
16576 return LHS.getOperand(0);
16584 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16591 const APInt &CT = LHS.getConstantOperandAPInt(1);
16592 const APInt &CF = LHS.getConstantOperandAPInt(2);
16600 return LHS.getOperand(0);
16604 if (VT != MVT::f32 && VT != MVT::f64 &&
16605 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16613 LHS.getOpcode() == ISD::FABS) {
16620 const unsigned IsInfMask =
16622 const unsigned IsFiniteMask =
16636 SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16637 DAGCombinerInfo &DCI) const {
16638 SelectionDAG &DAG = DCI.DAG;
16659 unsigned ShiftOffset = 8 * Offset;
16661 ShiftOffset -= C->getZExtValue();
16663 ShiftOffset += C->getZExtValue();
16665 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16667 MVT::f32, Shifted);
16678 DCI.AddToWorklist(N);
16685 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16691 DAGCombinerInfo &DCI) const {
16696 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16700 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16701 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16704 APFloat One(F.getSemantics(), "1.0");
16706 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16712 DAGCombinerInfo &DCI) const {
16733 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16734 bool isInteger = LHS.getValueType().isInteger();
16737 if (!isFloatingPoint && !isInteger)
16742 if (!isEquality && !isNonEquality)
16759 if (isFloatingPoint) {
16761 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16772 if (!(isEquality && TrueVal == ConstVal) &&
16773 !(isNonEquality && FalseVal == ConstVal))
16780 SelectLHS, SelectRHS);
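// PerformDAGCombine below dispatches on the node opcode and forwards to the
// per-opcode combine helpers implemented above.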
16785 switch (N->getOpcode()) {
16801 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16811 switch (N->getOpcode()) {
16813 return performAddCombine(N, DCI);
16815 return performPtrAddCombine(N, DCI);
16817 return performSubCombine(N, DCI);
16820 return performAddCarrySubCarryCombine(N, DCI);
16822 return performFAddCombine(N, DCI);
16824 return performFSubCombine(N, DCI);
16826 return performFDivCombine(N, DCI);
16828 return performFMulCombine(N, DCI);
16830 return performSetCCCombine(N, DCI);
16832 if (auto Res = performSelectCombine(N, DCI))
16837 case ISD::FMAXNUM_IEEE:
16838 case ISD::FMINNUM_IEEE:
16839 case ISD::FMAXIMUM:
16840 case ISD::FMINIMUM:
16841 case ISD::FMAXIMUMNUM:
16842 case ISD::FMINIMUMNUM:
16849 return performMinMaxCombine(
N, DCI);
16851 return performFMACombine(
N, DCI);
16853 return performAndCombine(
N, DCI);
16855 return performOrCombine(
N, DCI);
16858 if (
N->getValueType(0) == MVT::i32 &&
N->isDivergent() &&
16859 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16865 return performXorCombine(
N, DCI);
16867 return performZeroExtendCombine(
N, DCI);
16869 return performSignExtendInRegCombine(
N, DCI);
16871 return performClassCombine(
N, DCI);
16873 return performFCanonicalizeCombine(
N, DCI);
16875 return performRcpCombine(
N, DCI);
16890 return performUCharToFloatCombine(
N, DCI);
16892 return performFCopySignCombine(
N, DCI);
16897 return performCvtF32UByteNCombine(
N, DCI);
16899 return performFMed3Combine(
N, DCI);
16901 return performCvtPkRTZCombine(
N, DCI);
16903 return performClampCombine(
N, DCI);
16906 EVT VT = N->getValueType(0);
16909 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16912 EVT EltVT = Src.getValueType();
16913 if (EltVT != MVT::i16)
16914 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
16917 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
16923 return performExtractVectorEltCombine(N, DCI);
16925 return performInsertVectorEltCombine(N, DCI);
16927 return performFPRoundCombine(N, DCI);
16936 return performMemSDNodeCombine(MemNode, DCI);
16967 unsigned Opcode = Node->getMachineOpcode();
16970 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16971 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
16974 SDNode *Users[5] = {nullptr};
16976 unsigned DmaskIdx =
16977 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16978 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
16979 unsigned NewDmask = 0;
16980 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16981 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16982 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
16983 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
16984 unsigned TFCLane = 0;
16985 bool HasChain = Node->getNumValues() > 1;
16987 if (OldDmask == 0) {
16995 TFCLane = OldBitsSet;
16999 for (SDUse &Use : Node->uses()) {
17002 if (Use.getResNo() != 0)
17005 SDNode *User = Use.getUser();
17008 if (!User->isMachineOpcode() ||
17009 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17021 if (UsesTFC && Lane == TFCLane) {
17026 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17028 Dmask &= ~(1 << Comp);
17036 NewDmask |= 1 << Comp;
17041 bool NoChannels = !NewDmask;
17048 if (OldBitsSet == 1)
17054 if (NewDmask == OldDmask)
17063 unsigned NewChannels = BitsSet + UsesTFC;
17067 assert(NewOpcode != -1 &&
17068 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17069 "failed to find equivalent MIMG op");
17077 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17079 MVT ResultVT = NewChannels == 1
17082 : NewChannels == 5 ? 8
17084 SDVTList NewVTList =
17087 MachineSDNode *NewNode =
17096 if (NewChannels == 1) {
17106 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17111 if (i || !NoChannels)
17116 if (NewUser != User) {
17126 Idx = AMDGPU::sub1;
17129 Idx = AMDGPU::sub2;
17132 Idx = AMDGPU::sub3;
17135 Idx = AMDGPU::sub4;
17146 Op = Op.getOperand(0);
17167 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17171 Node->getOperand(0), SL, VReg, SrcVal,
17177 return ToResultReg.getNode();
17182 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17184 Ops.push_back(Node->getOperand(i));
17190 Node->getOperand(i).getValueType(),
17191 Node->getOperand(i)),
17203 unsigned Opcode = Node->getMachineOpcode();
17205 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17206 !TII->isGather4(Opcode) &&
17208 return adjustWritemask(Node, DAG);
17211 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17217 case AMDGPU::V_DIV_SCALE_F32_e64:
17218 case AMDGPU::V_DIV_SCALE_F64_e64: {
17228 (Src0 == Src1 || Src0 == Src2))
17284 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17285 unsigned InitIdx = 0;
17287 if (TII->isImage(MI)) {
17295 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17296 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17297 unsigned D16Val = D16 ? D16->getImm() : 0;
17299 if (!TFEVal && !LWEVal)
17310 assert(MO_Dmask && "Expected dmask operand in instruction");
17312 unsigned dmask = MO_Dmask->getImm();
17317 bool Packed = !Subtarget->hasUnpackedD16VMem();
17319 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17325 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17326 if (DstSize < InitIdx)
17329 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17337 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17338 unsigned NewDst = 0;
17343 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17344 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17347 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17348 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17368 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17381 if (TII->isVOP3(MI.getOpcode())) {
17383 TII->legalizeOperandsVOP3(MRI, MI);
17388 if (!MI.getDesc().operands().empty()) {
17389 unsigned Opc = MI.getOpcode();
17390 bool HasAGPRs = Info->mayNeedAGPRs();
17392 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
17394 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
17395 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
17398 if ((I == Src2Idx) && (HasAGPRs))
17401 if (!Op.isReg() || !Op.getReg().isVirtual())
17403 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
17404 if (!TRI->hasAGPRs(RC))
17406 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
17407 if (!Src || !Src->isCopy() ||
17408 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
17410 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
17414 MRI.setRegClass(Op.getReg(), NewRC);
17417 if (TII->isMAI(MI)) {
17422 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17423 AMDGPU::OpName::scale_src0);
17424 if (Src0Idx != -1) {
17425 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17426 AMDGPU::OpName::scale_src1);
17427 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17428 TII->usesConstantBus(MRI, MI, Src1Idx))
17429 TII->legalizeOpWithMove(MI, Src1Idx);
17437 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
17438 if (Src2->isReg() && Src2->getReg().isVirtual()) {
17439 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
17440 if (TRI->isVectorSuperClass(RC)) {
17441 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
17442 MRI.setRegClass(Src2->getReg(), NewRC);
17443 if (Src2->isTied())
17444 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
17453 if (TII->isImage(MI))
17454 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17528std::pair<unsigned, const TargetRegisterClass *>
17535 if (Constraint.size() == 1) {
17539 if (VT == MVT::Other)
17542 switch (Constraint[0]) {
17549 RC = &AMDGPU::SReg_32RegClass;
17552 RC = &AMDGPU::SGPR_64RegClass;
17557 return std::pair(0U, nullptr);
17564 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17565 : &AMDGPU::VGPR_32_Lo256RegClass;
17568 RC = Subtarget->has1024AddressableVGPRs()
17569 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17572 return std::pair(0U, nullptr);
17577 if (!Subtarget->hasMAIInsts())
17581 RC = &AMDGPU::AGPR_32RegClass;
17586 return std::pair(0U, nullptr);
17591 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17595 RC = &AMDGPU::AV_32RegClass;
17598 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17600 return std::pair(0U, nullptr);
17609 return std::pair(0U, RC);
17612 if (Kind != '\0') {
17614 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17615 } else if (Kind == 's') {
17616 RC = &AMDGPU::SGPR_32RegClass;
17617 } else if (Kind == 'a') {
17618 RC = &AMDGPU::AGPR_32RegClass;
17624 return std::pair(0U, nullptr);
17630 return std::pair(0U, nullptr);
17634 RC = TRI->getVGPRClassForBitWidth(Width);
17636 RC = TRI->getSGPRClassForBitWidth(Width);
17638 RC = TRI->getAGPRClassForBitWidth(Width);
17640 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17645 return std::pair(0U, nullptr);
17647 return std::pair(Reg, RC);
17653 return std::pair(0U, nullptr);
17654 if (Idx < RC->getNumRegs())
17656 return std::pair(0U, nullptr);
17662 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17668 if (Constraint.size() == 1) {
17669 switch (Constraint[0]) {
17679 } else if (Constraint == "DA" || Constraint == "DB") {
17687 if (Constraint.size() == 1) {
17688 switch (Constraint[0]) {
17696 } else if (Constraint.size() == 2) {
17697 if (Constraint == "VA")
17715 std::vector<SDValue> &Ops,
17730 unsigned Size = Op.getScalarValueSizeInBits();
17734 if (Size == 16 && !Subtarget->has16BitInsts())
17738 Val = C->getSExtValue();
17742 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17746 if (Size != 16 || Op.getNumOperands() != 2)
17748 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17751 Val = C->getSExtValue();
17755 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17765 if (Constraint.size() == 1) {
17766 switch (Constraint[0]) {
17781 } else if (Constraint.size() == 2) {
17782 if (Constraint == "DA") {
17783 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17784 int64_t LoBits = static_cast<int32_t>(Val);
17788 if (Constraint == "DB") {
17796 unsigned MaxSize) const {
17797 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17798 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17800 MVT VT = Op.getSimpleValueType();
17825 switch (UnalignedClassID) {
17826 case AMDGPU::VReg_64RegClassID:
17827 return AMDGPU::VReg_64_Align2RegClassID;
17828 case AMDGPU::VReg_96RegClassID:
17829 return AMDGPU::VReg_96_Align2RegClassID;
17830 case AMDGPU::VReg_128RegClassID:
17831 return AMDGPU::VReg_128_Align2RegClassID;
17832 case AMDGPU::VReg_160RegClassID:
17833 return AMDGPU::VReg_160_Align2RegClassID;
17834 case AMDGPU::VReg_192RegClassID:
17835 return AMDGPU::VReg_192_Align2RegClassID;
17836 case AMDGPU::VReg_224RegClassID:
17837 return AMDGPU::VReg_224_Align2RegClassID;
17838 case AMDGPU::VReg_256RegClassID:
17839 return AMDGPU::VReg_256_Align2RegClassID;
17840 case AMDGPU::VReg_288RegClassID:
17841 return AMDGPU::VReg_288_Align2RegClassID;
17842 case AMDGPU::VReg_320RegClassID:
17843 return AMDGPU::VReg_320_Align2RegClassID;
17844 case AMDGPU::VReg_352RegClassID:
17845 return AMDGPU::VReg_352_Align2RegClassID;
17846 case AMDGPU::VReg_384RegClassID:
17847 return AMDGPU::VReg_384_Align2RegClassID;
17848 case AMDGPU::VReg_512RegClassID:
17849 return AMDGPU::VReg_512_Align2RegClassID;
17850 case AMDGPU::VReg_1024RegClassID:
17851 return AMDGPU::VReg_1024_Align2RegClassID;
17852 case AMDGPU::AReg_64RegClassID:
17853 return AMDGPU::AReg_64_Align2RegClassID;
17854 case AMDGPU::AReg_96RegClassID:
17855 return AMDGPU::AReg_96_Align2RegClassID;
17856 case AMDGPU::AReg_128RegClassID:
17857 return AMDGPU::AReg_128_Align2RegClassID;
17858 case AMDGPU::AReg_160RegClassID:
17859 return AMDGPU::AReg_160_Align2RegClassID;
17860 case AMDGPU::AReg_192RegClassID:
17861 return AMDGPU::AReg_192_Align2RegClassID;
17862 case AMDGPU::AReg_256RegClassID:
17863 return AMDGPU::AReg_256_Align2RegClassID;
17864 case AMDGPU::AReg_512RegClassID:
17865 return AMDGPU::AReg_512_Align2RegClassID;
17866 case AMDGPU::AReg_1024RegClassID:
17867 return AMDGPU::AReg_1024_Align2RegClassID;
17883 if (Info->isEntryFunction()) {
17890 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17892 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17893 : TRI->getAlignedHighSGPRForRC(MF, 2,
17894 &AMDGPU::SGPR_64RegClass);
17895 Info->setSGPRForEXECCopy(SReg);
17897 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17898 Info->getStackPtrOffsetReg()));
17899 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17900 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17904 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17905 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17907 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17908 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17910 Info->limitOccupancy(MF);
17912 if (ST.isWave32() && !MF.empty()) {
17913 for (auto &MBB : MF) {
17914 for (auto &MI : MBB) {
17915 TII->fixImplicitOperands(MI);
17925 if (ST.needsAlignedVGPRs()) {
17926 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17932 if (NewClassID != -1)
17933 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17942 const APInt &DemandedElts,
17944 unsigned Depth) const {
17946 unsigned Opc = Op.getOpcode();
17949 unsigned IID = Op.getConstantOperandVal(0);
17951 case Intrinsic::amdgcn_mbcnt_lo:
17952 case Intrinsic::amdgcn_mbcnt_hi: {
17958 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17968 Op, Known, DemandedElts, DAG, Depth);
17984 unsigned MaxValue =
17991 unsigned BFEWidth, bool SExt, unsigned Depth) {
17995 unsigned Src1Cst = 0;
17996 if (Src1.isImm()) {
17997 Src1Cst = Src1.getImm();
17998 } else if (Src1.isReg()) {
18002 Src1Cst = Cst->Value.getZExtValue();
18013 if (Width >= BFEWidth)
18022 Known = Known.sext(BFEWidth);
18024 Known = Known.zext(BFEWidth);
18030 unsigned Depth) const {
18033 switch (MI->getOpcode()) {
18034 case AMDGPU::S_BFE_I32:
18037 case AMDGPU::S_BFE_U32:
18040 case AMDGPU::S_BFE_I64:
18043 case AMDGPU::S_BFE_U64:
18046 case AMDGPU::G_INTRINSIC:
18047 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18050 case Intrinsic::amdgcn_workitem_id_x:
18053 case Intrinsic::amdgcn_workitem_id_y:
18056 case Intrinsic::amdgcn_workitem_id_z:
18059 case Intrinsic::amdgcn_mbcnt_lo:
18060 case Intrinsic::amdgcn_mbcnt_hi: {
18072 case Intrinsic::amdgcn_groupstaticsize: {
18083 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18086 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18089 case AMDGPU::G_AMDGPU_SMED3:
18090 case AMDGPU::G_AMDGPU_UMED3: {
18091 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18118 unsigned Depth) const {
18125 AttributeList Attrs =
18127 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18154 if (Header->getAlignment() != PrefAlign)
18155 return Header->getAlignment();
18157 unsigned LoopSize = 0;
18162 LoopSize += MBB->getAlignment().value() / 2;
18165 LoopSize += TII->getInstSizeInBytes(MI);
18166 if (LoopSize > 192)
18171 if (LoopSize <= 64)
18174 if (LoopSize <= 128)
18175 return CacheLineAlign;
18181 auto I = Exit->getFirstNonDebugInstr();
18182 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18183 return CacheLineAlign;
18192 if (PreTerm == Pre->begin() ||
18193 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18197 auto ExitHead = Exit->getFirstNonDebugInstr();
18198 if (ExitHead == Exit->end() ||
18199 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18204 return CacheLineAlign;
18212 N = N->getOperand(0).getNode();
18213 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18222 switch (N->getOpcode()) {
18230 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18231 return !TRI->isSGPRReg(MRI, Reg);
18237 return !TRI->isSGPRReg(MRI, Reg);
18241 unsigned AS = L->getAddressSpace();
18245 case ISD::CALLSEQ_END:
18274 return A->readMem() && A->writeMem();
18295 switch (Ty.getScalarSizeInBits()) {
18307 const APInt &DemandedElts,
18310 unsigned Depth) const {
18315 if (Info->getMode().DX10Clamp)
18327 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18347 << "Hardware instruction generated for atomic "
18349 << " operation at memory scope " << MemScope;
18354 Type *EltTy = VT->getElementType();
18355 return VT->getNumElements() == 2 &&
18375 unsigned BW = IT->getBitWidth();
18376 return BW == 32 || BW == 64;
18390 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18391 return BW == 32 || BW == 64;
18394 if (Ty->isFloatTy() || Ty->isDoubleTy())
18398 return VT->getNumElements() == 2 &&
18399 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18409 bool HasSystemScope) {
18416 if (HasSystemScope) {
18425 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18438 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18464 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18477 bool HasSystemScope =
18503 if (Subtarget->hasEmulatedSystemScopeAtomics())
18519 if (!HasSystemScope &&
18520 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18532 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18540 ConstVal && ConstVal->isNullValue())
18578 if (Ty->isFloatTy()) {
18583 if (Ty->isDoubleTy()) {
18604 if (Ty->isFloatTy() &&
18605 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18618 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18622 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18626 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18631 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18636 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18640 if (Ty->isFloatTy()) {
18643 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18646 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18651 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18659 if (Subtarget->hasFlatAtomicFaddF32Inst())
18668 if (Subtarget->hasLDSFPAtomicAddF32()) {
18669 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18671 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18699 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18701 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18705 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18707 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18760 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18761 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18762 : &AMDGPU::SReg_32RegClass;
18763 if (!TRI->isSGPRClass(RC) && !isDivergent)
18764 return TRI->getEquivalentSGPRClass(RC);
18765 if (TRI->isSGPRClass(RC) && isDivergent)
18766 return TRI->getEquivalentVGPRClass(RC);
18778 unsigned WaveSize) {
18783 if (!IT || IT->getBitWidth() != WaveSize)
18788 if (!Visited.insert(V).second)
18790 bool Result = false;
18791 for (const auto *U : V->users()) {
18793 if (V == U->getOperand(1)) {
18798 case Intrinsic::amdgcn_if_break:
18799 case Intrinsic::amdgcn_if:
18800 case Intrinsic::amdgcn_else:
18805 if (V == U->getOperand(0)) {
18810 case Intrinsic::amdgcn_end_cf:
18811 case Intrinsic::amdgcn_loop:
18817 Result = hasCFUser(U, Visited, WaveSize);
18826 const Value *V) const {
18828 if (CI->isInlineAsm()) {
18837 for (auto &TC : TargetConstraints) {
18851 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18879 return MRI.hasOneNonDBGUse(N0);
18886 if (I.getMetadata("amdgpu.noclobber"))
18888 if (I.getMetadata("amdgpu.last.use"))
18898 if (!Def->isMachineOpcode())
18908 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18909 PhysReg = AMDGPU::SCC;
18911 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18977 Alignment = RMW->getAlign();
18990 bool FullFlatEmulation =
18992 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18993 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
18994 RMW->getType()->isDoubleTy()));
18997 bool ReturnValueIsUsed = !AI->use_empty();
19006 if (FullFlatEmulation) {
19017 std::prev(BB->end())->eraseFromParent();
19018 Builder.SetInsertPoint(BB);
19020 Value *LoadedShared = nullptr;
19021 if (FullFlatEmulation) {
19022 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19023 {Addr}, nullptr, "is.shared");
19024 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19025 Builder.SetInsertPoint(SharedBB);
19026 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19032 LoadedShared = Clone;
19034 Builder.CreateBr(PhiBB);
19035 Builder.SetInsertPoint(CheckPrivateBB);
19038 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19039 {Addr}, nullptr, "is.private");
19040 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19042 Builder.SetInsertPoint(PrivateBB);
19044 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19047 Value *LoadedPrivate;
19049 LoadedPrivate = Builder.CreateAlignedLoad(
19050 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19053 LoadedPrivate, RMW->getValOperand());
19055 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19057 auto [ResultLoad, Equal] =
19063 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19066 Builder.CreateBr(PhiBB);
19068 Builder.SetInsertPoint(GlobalBB);
19072 if (FullFlatEmulation) {
19073 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19082 if (!FullFlatEmulation) {
19087 MDNode *RangeNotPrivate =
19090 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19094 Builder.CreateBr(PhiBB);
19096 Builder.SetInsertPoint(PhiBB);
19098 if (ReturnValueIsUsed) {
19101 if (FullFlatEmulation)
19108 Builder.CreateBr(ExitBB);
19112 unsigned PtrOpIdx) {
19113 Value *PtrOp = I->getOperand(PtrOpIdx);
19120 I->setOperand(PtrOpIdx, ASCast);
19132 ConstVal && ConstVal->isNullValue()) {
19162 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19170 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19185 LoadInst *LI = Builder.CreateAlignedLoad(
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
Contains matchers for matching SSA Machine Instructions.
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
Contains matchers for matching SelectionDAG nodes and values.
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static cl::opt< bool > UseSelectionDAGPTRADD("amdgpu-use-sdag-ptradd", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " "SelectionDAG ISel"), cl::init(false))
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
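As an illustration only (not the exact implementation), such a check can be phrased in terms of the !noalias.addrspace metadata that also appears in the expansion code above: if the metadata excludes the scratch/private address space, the flat atomic cannot touch private memory. The helper name below is hypothetical and the address-space constant 5 is an assumption chosen for the sketch.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
// Hypothetical sketch: a flat atomicrmw may access private memory unless
// !noalias.addrspace metadata rules out the scratch address space (AS 5).
static bool sketchFlatRMWMayAccessPrivate(const llvm::AtomicRMWInst *RMW) {
  using namespace llvm;
  const MDNode *MD = RMW->getMetadata(LLVMContext::MD_noalias_addrspace);
  if (!MD)
    return true; // No exclusion info: conservatively assume it may.
  // Each pair of operands encodes a range [Lo, Hi) of address spaces the
  // pointer is known not to alias.
  for (unsigned I = 0, E = MD->getNumOperands(); I + 1 < E; I += 2) {
    auto *Lo = mdconst::extract<ConstantInt>(MD->getOperand(I));
    auto *Hi = mdconst::extract<ConstantInt>(MD->getOperand(I + 1));
    if (Lo->getZExtValue() <= 5 && 5 < Hi->getZExtValue())
      return false; // Private AS is excluded.
  }
  return true;
}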
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
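A minimal sketch of what a predicate with this shape could look like, assuming it only needs to recognize the constant pair (0.0, 1.0) in either order; this is illustrative, not necessarily the file's exact logic.
// Illustrative only: returns true if {A, B} are the FP constants 0.0 and 1.0.
static bool sketchIsClampZeroToOne(llvm::SDValue A, llvm::SDValue B) {
  using namespace llvm;
  auto *CA = dyn_cast<ConstantFPSDNode>(A);
  auto *CB = dyn_cast<ConstantFPSDNode>(B);
  if (!CA || !CB)
    return false;
  return (CA->isZero() && !CA->isNegative() && CB->isExactlyValue(1.0)) ||
         (CB->isZero() && !CB->isNegative() && CA->isExactlyValue(1.0));
}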
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
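Illustrative sketch only (the real helper also handles FP constants by converting them): peel an ISD::FP_EXTEND whose source is f16.
// Sketch: if Src is an fp_extend from f16, return the unextended f16 source;
// otherwise return an empty SDValue. Constant handling is omitted here.
static llvm::SDValue sketchFPExtFromF16(llvm::SDValue Src) {
  using namespace llvm;
  if (Src.getOpcode() == ISD::FP_EXTEND &&
      Src.getOperand(0).getValueType() == MVT::f16)
    return Src.getOperand(0);
  return SDValue();
}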
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
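For orientation, a lane helper of this shape might map EXTRACT_SUBREG sub-register indices to dword lanes roughly as follows; a sketch under that assumption, not necessarily the exact mapping used.
// Sketch: map a 32-bit sub-register index to its dword lane, ~0u if unknown.
static unsigned sketchSubIdx2Lane(unsigned Idx) {
  switch (Idx) {
  default: return ~0u;
  case llvm::AMDGPU::sub0: return 0;
  case llvm::AMDGPU::sub1: return 1;
  case llvm::AMDGPU::sub2: return 2;
  case llvm::AMDGPU::sub3: return 3;
  case llvm::AMDGPU::sub4: return 4; // TFE/LWE status dword can add a 5th lane.
  }
}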
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
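A hedged sketch of the idea: for intrinsic memory nodes the base pointer is not at the usual operand index, because operand 0 is the chain and operand 1 is the intrinsic ID (assumed layout; illustrative only).
// Illustrative sketch: pick the operand index that holds the base pointer.
static unsigned sketchBasePtrIndex(const llvm::MemSDNode *N) {
  switch (N->getOpcode()) {
  case llvm::ISD::STORE:             // chain, value, ptr, ...
  case llvm::ISD::INTRINSIC_W_CHAIN: // chain, intrinsic id, ptr, ...
  case llvm::ISD::INTRINSIC_VOID:
    return 2;
  default:                           // e.g. loads: chain, ptr, ...
    return 1;
  }
}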
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
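As a sketch of what a helper with this signature typically does (an assumption, not the file's exact body), it masks Val down to its low Size bits.
#include "llvm/Support/MathExtras.h"
// Sketch: keep only the low Size bits of Val (Size in [1, 64]).
static uint64_t sketchClearUnusedBits(uint64_t Val, unsigned Size) {
  return Val & llvm::maskTrailingOnes<uint64_t>(Size);
}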
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
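Purely as orientation (not the DAG implementation itself), the bookkeeping behind such a split is halving the element count and advancing the pointer by the byte size of the first half; a minimal sketch, assuming EltBytes is the element store size.
// Sketch of the split arithmetic: a load of NumElts elements becomes a load of
// the first Half elements at offset 0 and a load of the remaining elements at
// byte offset Half * EltBytes; the two halves are then reassembled.
struct SplitPlan {
  unsigned LoElts, HiElts;
  unsigned HiByteOffset;
};
static SplitPlan sketchSplitVectorLoad(unsigned NumElts, unsigned EltBytes) {
  unsigned Half = NumElts / 2;
  return {Half, NumElts - Half, Half * EltBytes};
}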
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
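The equivalent scalar arithmetic, shown as a hedged sketch (the real helper operates on SDValues via bitcast and extract).
#include <cstdint>
#include <utility>
// Sketch: split a 64-bit value into its low and high 32-bit halves.
static std::pair<uint32_t, uint32_t> sketchSplit64(uint64_t V) {
  uint32_t Lo = static_cast<uint32_t>(V);       // bits [31:0]
  uint32_t Hi = static_cast<uint32_t>(V >> 32); // bits [63:32]
  return {Lo, Hi};
}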
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
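For orientation, the same mask can be written with plain integer arithmetic; the sketch below mirrors what APInt::getBitsSet(numBits, loBit, hiBit) produces for widths up to 64, assuming bits [loBit, hiBit) are the ones set.
#include <cstdint>
// Sketch: a 64-bit mask with bits [LoBit, HiBit) set.
static uint64_t sketchBitsSet(unsigned LoBit, unsigned HiBit) {
  if (LoBit >= HiBit)
    return 0;
  uint64_t High = HiBit >= 64 ? ~0ULL : ((1ULL << HiBit) - 1);
  uint64_t Low = (1ULL << LoBit) - 1;
  return High & ~Low;
}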
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
const Function * getParent() const
Return the enclosing method, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
A parsed version of the target data layout string in and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
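A hedged usage sketch, illustrating only the call shape: building a range node such as the one attached via MD_noalias_addrspace in the expansion code above. The concrete bounds (address space 5, i.e. the range [5, 6)) are an assumption chosen for the example.
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
// Sketch: build !{i32 5, i32 6}-style range metadata with MDBuilder.
static llvm::MDNode *sketchPrivateASRange(llvm::LLVMContext &Ctx) {
  llvm::MDBuilder MDB(Ctx);
  return MDB.createRange(llvm::APInt(32, 5), llvm::APInt(32, 6));
}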
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
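A hedged sketch of the usual custom-inserter pattern built from these members (MI and BB are assumed to come from the surrounding code; the range form of splice is used):
  using namespace llvm;
  // Split BB after MI, move the tail into a new block, and rewire the CFG.
  MachineFunction *MF = BB->getParent();
  MachineBasicBlock *TailBB = MF->CreateMachineBasicBlock(BB->getBasicBlock());
  MF->insert(std::next(BB->getIterator()), TailBB);
  TailBB->splice(TailBB->begin(), BB,
                 std::next(MachineBasicBlock::iterator(MI)), BB->end());
  TailBB->transferSuccessorsAndUpdatePHIs(BB);
  BB->addSuccessor(TailBB);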
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
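These builder helpers are normally chained off BuildMI; a minimal sketch in which MBB, I, DL, TII, Opcode, the registers and OrigMI are assumed to come from the surrounding code:
  #include "llvm/CodeGen/MachineInstrBuilder.h"
  using namespace llvm;
  MachineInstrBuilder MIB =
      BuildMI(MBB, I, DL, TII->get(Opcode), DstReg)
          .addReg(SrcReg)          // register operand
          .addImm(0)               // immediate operand
          .cloneMemRefs(OrigMI);   // reuse the original memory operands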
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
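The flags above are commonly OR'd together when allocating a MachineMemOperand; a hedged sketch (MF is assumed from context, and the 4-byte invariant load is invented for illustration):
  using namespace llvm;
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),                       // unknown location
      MachineMemOperand::MOLoad |
          MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LLT::scalar(32), Align(4));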
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
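A common use of these accessors is rewriting register operands in place; a small sketch assuming MI, OldReg and NewReg come from context:
  using namespace llvm;
  for (MachineOperand &MO : MI.operands()) {
    if (MO.isReg() && MO.getReg() == OldReg)
      MO.setReg(NewReg);   // leave immediates and other operand kinds alone
  }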
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
Get the SDNode which holds the desired result.
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
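A small sketch of how these SDNode/SDValue accessors are typically combined in a combiner guard (the specific opcode and type checks are illustrative only):
  #include "llvm/CodeGen/SelectionDAGNodes.h"
  using namespace llvm;
  // Only consider single-result ADD nodes with exactly one user.
  static bool isFoldableAdd(SDValue V) {
    SDNode *N = V.getNode();
    return N->getOpcode() == ISD::ADD && N->getNumValues() == 1 &&
           N->hasOneUse() && N->getOperand(0).getValueType() == MVT::i32;
  }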
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if N can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
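Taken together, these SelectionDAG helpers are what a custom lowering routine is built from; a hedged sketch (the max-with-zero lowering and the MVT::i1 condition type are illustrative simplifications, real code would ask the TargetLowering for the SETCC result type):
  using namespace llvm;
  static SDValue lowerMaxWithZero(SDValue Op, SelectionDAG &DAG) {
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue Zero = DAG.getConstant(0, DL, VT);
    SDValue Cmp = DAG.getSetCC(DL, MVT::i1, Op, Zero, ISD::SETGT);
    return DAG.getSelect(DL, VT, Cmp, Op, Zero);
  }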
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
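A brief usage sketch of the two containers above (element types are arbitrary; MoreRegs and Ptr are assumed from context):
  using namespace llvm;
  SmallVector<unsigned, 8> Regs;
  Regs.push_back(1);
  Regs.append(MoreRegs.begin(), MoreRegs.end());

  SmallPtrSet<const void *, 16> Visited;
  if (Visited.insert(Ptr).second) {
    // First visit of Ptr.
  }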
An instruction for storing to memory.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
StringRef - Represent a constant reference to a string, i.e.
constexpr size_t size() const
size - Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
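StringSwitch is typically used to map constraint or register names to small codes; a minimal illustrative sketch (the names and codes are invented):
  #include "llvm/ADT/StringSwitch.h"
  using namespace llvm;
  static unsigned classifyName(StringRef Name) {
    return StringSwitch<unsigned>(Name)
        .Case("sgpr", 0)
        .Case("vgpr", 1)
        .Default(~0u);
  }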
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
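These TargetLoweringBase hooks are invoked from a target's TargetLowering constructor; a hedged sketch of that configuration pattern (the opcodes, types, register class and Subtarget member are placeholders, not the actual SI configuration):
  // Inside a hypothetical MyTargetLowering constructor:
  addRegisterClass(MVT::i32, &MyTarget::GPR32RegClass);   // placeholder class
  setOperationAction(ISD::SDIV, MVT::i64, Expand);
  setOperationAction(ISD::ROTR, MVT::i32, Custom);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setSchedulingPreference(Sched::RegPressure);
  computeRegisterProperties(Subtarget.getRegisterInfo());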
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparisons with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isVoidTy() const
Return true if this is 'void'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
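A tiny sketch of the usual replace-and-rename idiom built from the Value members above (OldV and NewV are assumed from context):
  using namespace llvm;
  if (OldV->getType() == NewV->getType()) {
    NewV->takeName(OldV);            // keep the readable name
    OldV->replaceAllUsesWith(NewV);  // rewrite every use edge
  }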
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
constexpr bool isZero() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ TC_RETURN_GFX_WholeWave
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ SSUBO
Same as [SU]ADDO, but for subtraction.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ SMULO
Same as [SU]ADDO, but for multiplication.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
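A small, self-contained example of these two ISD::CondCode helpers (the commented results follow directly from the definitions above):

#include "llvm/CodeGen/ISDOpcodes.h"

void condCodeDemo() {
  // (X < Y) with its operands swapped becomes (Y > X).
  llvm::ISD::CondCode Swapped =
      llvm::ISD::getSetCCSwappedOperands(llvm::ISD::SETLT); // ISD::SETGT
  bool SignedCmp = llvm::ISD::isSignedIntSetCC(llvm::ISD::SETLT);    // true
  bool UnsignedCmp = llvm::ISD::isSignedIntSetCC(llvm::ISD::SETULT); // false
  (void)Swapped; (void)SignedCmp; (void)UnsignedCmp;
}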
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
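A sketch using the Intrinsic:: query signatures listed above: look up an AMDGPU intrinsic declaration only if something has already created it in the module, otherwise fall back to its canonical type.

#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"

llvm::FunctionType *queryWorkitemIdX(const llvm::Module &M) {
  using namespace llvm;
  Intrinsic::ID IID = Intrinsic::amdgcn_workitem_id_x;
  if (Function *F = Intrinsic::getDeclarationIfExists(&M, IID))
    return F->getFunctionType();
  // Non-overloaded intrinsic, so no explicit type arguments are needed.
  return Intrinsic::getType(M.getContext(), IID);
}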
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
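A sketch of the GlobalISel matchers above: test whether Reg is defined as a negation (G_SUB 0, Src) and capture the negated operand (matchNegation is an illustrative name, not an LLVM API).

#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

bool matchNegation(llvm::Register Reg, const llvm::MachineRegisterInfo &MRI,
                   llvm::Register &Src) {
  using namespace llvm::MIPatternMatch;
  // m_Neg matches G_SUB with a zero LHS; m_Reg binds the RHS into Src.
  return mi_match(Reg, MRI, m_Neg(m_Reg(Src)));
}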
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
@ System
Synchronized with respect to all concurrently executing threads.
initializer< Ty > init(const Ty &Val)
@ User
Could "use" a pointer.
NodeAddr< UseNode * > Use
NodeAddr< NodeBase * > Node
friend class Instruction
Iterator for Instructions in a BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
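For example, the IR signed-less-than predicate maps onto the DAG's signed SETLT condition code:

#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/Instructions.h"

void predicateDemo() {
  llvm::ISD::CondCode CC = llvm::getICmpCondCode(llvm::ICmpInst::ICMP_SLT);
  (void)CC; // CC == llvm::ISD::SETLT
}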
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int popcount(T Value) noexcept
Count the number of set bits in a value.
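A tiny usage example for popcount together with the related bit-counting helpers that appear further down in this index (countr_zero, countl_zero, bit_width); all values are easy to verify by hand:

#include "llvm/ADT/bit.h"
#include <cassert>

void bitCountDemo() {
  assert(llvm::popcount(0xF0u) == 4);     // four set bits
  assert(llvm::countr_zero(0x10u) == 4);  // zeros below the lowest set bit
  assert(llvm::countl_zero(0x10u) == 27); // zeros above the highest set bit (32-bit)
  assert(llvm::bit_width(255u) == 8);     // bits needed to represent 255
}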
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
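A sketch of the fluent builder pattern this declaration refers to, assuming a block, insertion point, target instruction info, and opcode supplied by the caller (emitCopyLike is an illustrative name, not an LLVM API):

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

void emitCopyLike(llvm::MachineBasicBlock &MBB,
                  llvm::MachineBasicBlock::iterator I,
                  const llvm::DebugLoc &DL, const llvm::TargetInstrInfo &TII,
                  unsigned Opcode, llvm::Register Dst, llvm::Register Src) {
  // BuildMI creates the instruction; operands are appended with add* methods.
  llvm::BuildMI(MBB, I, DL, TII.get(Opcode), Dst).addReg(Src);
}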
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
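For example, combined with the related bounds helpers listed elsewhere in this index (isUInt, minIntN, maxIntN):

#include "llvm/Support/MathExtras.h"
#include <cassert>

void bitWidthDemo() {
  assert(llvm::isInt<16>(32767));   // fits in a signed 16-bit value
  assert(!llvm::isInt<16>(32768));  // one past INT16_MAX
  assert(llvm::isUInt<8>(255) && !llvm::isUInt<8>(256));
  assert(llvm::minIntN(8) == -128 && llvm::maxIntN(8) == 127);
}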
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
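A short sketch combining the two SelectionDAG helpers above (isZeroThroughBitcasts is an illustrative name):

#include "llvm/CodeGen/SelectionDAGNodes.h"

// Strip any bitcasts from V, then test whether the result is constant zero.
bool isZeroThroughBitcasts(llvm::SDValue V) {
  return llvm::isNullConstant(llvm::peekThroughBitcasts(V));
}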
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
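A few concrete values for alignDown and the related rounding helpers that appear elsewhere in this index (alignTo, divideCeil, PowerOf2Ceil):

#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

void alignmentDemo() {
  assert(llvm::alignDown(13u, 8u) == 8u);          // round down to a multiple of 8
  assert(llvm::alignTo(13, llvm::Align(8)) == 16); // round up to a multiple of 8
  assert(llvm::divideCeil(10u, 3u) == 4u);         // integer ceiling division
  assert(llvm::PowerOf2Ceil(10) == 16);            // next power of two
}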
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant, stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
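Example: together with countr_zero and popcount, this check makes it easy to decompose a contiguous mask into a shift and a width:

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

void maskDemo() {
  uint64_t Mask = 0x00FF0000u;              // ones in bits [16, 24)
  assert(llvm::isShiftedMask_64(Mask));     // contiguous run of ones
  unsigned Shift = llvm::countr_zero(Mask); // 16: where the run starts
  unsigned Width = llvm::popcount(Mask);    // 8: how long the run is
  assert(Shift == 16 && Width == 8);
}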
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
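A small example of these range-based wrappers (is_contained and find_if appear further down in this index):

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

bool rangeDemo() {
  llvm::SmallVector<int, 4> Vals = {1, 2, 3, 4};
  bool HasEven = llvm::any_of(Vals, [](int V) { return V % 2 == 0; });
  bool HasThree = llvm::is_contained(Vals, 3);
  auto It = llvm::find_if(Vals, [](int V) { return V > 2; }); // points at 3
  return HasEven && HasThree && It != Vals.end();
}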
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count the number of 0s from the most significant bit to the least significant, stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
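Concrete values for the 64-bit splitting helpers together with the power-of-two/log2 checks listed nearby:

#include "llvm/Support/MathExtras.h"
#include <cassert>

void splitDemo() {
  uint64_t V = 0x1122334455667788ULL;
  assert(llvm::Hi_32(V) == 0x11223344u && llvm::Lo_32(V) == 0x55667788u);
  assert(llvm::isPowerOf2_32(64) && llvm::Log2_32(64) == 6);
}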
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
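Concrete values for maskTrailingOnes and the commonAlignment helper just above:

#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

void maskAlignDemo() {
  assert(llvm::maskTrailingOnes<uint32_t>(4) == 0xFu); // low 4 bits set
  // Largest alignment guaranteed at (16-byte aligned base) + offset 4:
  assert(llvm::commonAlignment(llvm::Align(16), 4).value() == 4);
}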
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
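A sketch exercising several of the EVT queries above on a concrete vector type; an LLVMContext is assumed to be available from the caller:

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"

void evtDemo(llvm::LLVMContext &Ctx) {
  llvm::EVT VecVT = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f16, 4); // v4f16
  bool IsVec = VecVT.isVector();                         // true
  unsigned NumElts = VecVT.getVectorNumElements();       // 4
  llvm::EVT EltVT = VecVT.getVectorElementType();        // f16
  llvm::EVT IntVT = VecVT.changeTypeToInteger();         // v4i16
  uint64_t Bits = VecVT.getSizeInBits().getFixedValue(); // 64
  (void)IsVec; (void)NumElts; (void)EltVT; (void)IntVT; (void)Bits;
}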
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
void resetAll()
Resets the known state of all bits.
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
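A self-contained sketch of the KnownBits interface above, modeling an 8-bit value whose high nibble is known to be zero and then widening it:

#include "llvm/Support/KnownBits.h"
#include <cassert>

void knownBitsDemo() {
  llvm::KnownBits Known(8);
  Known.Zero.setHighBits(4);                     // bits [4, 8) known zero
  assert(!Known.isUnknown());
  assert(Known.countMinLeadingZeros() == 4);
  llvm::KnownBits Wide = Known.zext(16);         // zero extension to 16 bits
  assert(Wide.countMinLeadingZeros() == 12);     // 8 new zeros + 4 known zeros
  llvm::KnownBits Low = Known.extractBits(4, 0); // the low nibble is unknown
  assert(Low.isUnknown());
}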
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
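A sketch, assuming a MachineFunction and a frame index supplied by the caller, of building the fixed-stack variant listed above (frameSlotInfo is an illustrative name):

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

llvm::MachinePointerInfo frameSlotInfo(llvm::MachineFunction &MF, int FI) {
  // Refers to fixed stack slot FI; the offset within the slot is 0.
  return llvm::MachinePointerInfo::getFixedStack(MF, FI, /*Offset=*/0);
}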
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const