#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"

    cl::desc("Do not align and prefetch loops"),

    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
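  // This fragment is from findFirstFreeSGPR(): it walks the SGPR_32 register
  // class and returns the first SGPR the calling-convention state has not yet
  // allocated (the allocation check between the loop header and the return is
  // elided in this excerpt).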
153 if (Subtarget->has16BitInsts()) {
154 if (Subtarget->useRealTrue16Insts()) {
196 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
197 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
198 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
199 MVT::i1, MVT::v32i32},
203 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
204 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
205 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
206 MVT::i1, MVT::v32i32},
213 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
214 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
215 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
216 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
217 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
282 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
283 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
284 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
287 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
288 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
289 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
293 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
294 MVT::v3i16, MVT::v4i16, MVT::Other},
                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
315 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
316 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
317 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
318 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
319 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
320 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
321 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
322 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
425 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
426 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
429 if (Subtarget->hasPkMovB32()) {
450 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
451 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
460 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
461 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
462 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
463 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
487 if (Subtarget->hasSMemRealTime() ||
492 if (Subtarget->has16BitInsts()) {
499 if (Subtarget->hasMadMacF32Insts())
502 if (!Subtarget->hasBFI())
506 if (!Subtarget->hasBCNT(32))
509 if (!Subtarget->hasBCNT(64))
512 if (Subtarget->hasFFBH())
515 if (Subtarget->hasFFBL())
526 if (Subtarget->hasBFE())
530 if (Subtarget->hasIntClamp())
533 if (Subtarget->hasAddNoCarry())
      {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
      {MVT::f32, MVT::f64}, Custom);

                     {MVT::f32, MVT::f64}, Legal);
547 if (Subtarget->haveRoundOpsF64())
570 if (Subtarget->has16BitInsts()) {
619 ISD::FSIN, ISD::FROUND},
623 if (Subtarget->hasBF16TransInsts())
642 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
643 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
644 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
777 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
778 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
779 MVT::v32f16, MVT::v32bf16},
783 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
789 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
793 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
797 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
798 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
806 if (Subtarget->hasVOP3PInsts()) {
                       {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

                       {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
                        MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
                        MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})

    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})

        {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
        {MVT::v2f16, MVT::v4f16}, Custom);
846 if (Subtarget->hasBF16PackedInsts()) {
    for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
853 if (Subtarget->hasPackedFP32Ops()) {
857 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
864 if (Subtarget->has16BitInsts()) {
877 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
878 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
879 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
880 MVT::v32f16, MVT::v32bf16},
885 if (Subtarget->hasVectorMulU64())
887 else if (Subtarget->hasScalarSMulU64())
890 if (Subtarget->hasMad64_32())
893 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
896 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
                       {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
901 if (Subtarget->hasMinimum3Maximum3F32())
904 if (Subtarget->hasMinimum3Maximum3PKF16()) {
908 if (!Subtarget->hasMinimum3Maximum3F16())
913 if (Subtarget->hasVOP3PInsts()) {
916 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
920 if (Subtarget->hasIntMinMax64())
925 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
926 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
931 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
932 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
933 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
934 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
938 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
939 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
940 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
941 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
956 if (Subtarget->hasBF16ConversionInsts()) {
961 if (Subtarget->hasBF16PackedInsts()) {
967 if (Subtarget->hasBF16TransInsts()) {
971 if (Subtarget->hasCvtPkF16F32Inst()) {
973 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
1023 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1032 ISD::ATOMIC_CMP_SWAP,
1033 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1035 ISD::ATOMIC_LOAD_ADD,
1036 ISD::ATOMIC_LOAD_SUB,
1037 ISD::ATOMIC_LOAD_AND,
1038 ISD::ATOMIC_LOAD_OR,
1039 ISD::ATOMIC_LOAD_XOR,
1040 ISD::ATOMIC_LOAD_NAND,
1041 ISD::ATOMIC_LOAD_MIN,
1042 ISD::ATOMIC_LOAD_MAX,
1043 ISD::ATOMIC_LOAD_UMIN,
1044 ISD::ATOMIC_LOAD_UMAX,
1045 ISD::ATOMIC_LOAD_FADD,
1046 ISD::ATOMIC_LOAD_FMIN,
1047 ISD::ATOMIC_LOAD_FMAX,
1048 ISD::ATOMIC_LOAD_UINC_WRAP,
1049 ISD::ATOMIC_LOAD_UDEC_WRAP,
1062 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
                                           EVT DestVT, EVT SrcVT) const {
         ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
            (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
          (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&

                                           LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
         SrcTy.getScalarSizeInBits() == 16 &&
1112 if (Subtarget->has16BitInsts()) {
1115 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
    return VT.isInteger() ? MVT::i32 : MVT::f32;

  return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;

    if (Size == 16 && Subtarget->has16BitInsts())
      return (NumElts + 1) / 2;

    return NumElts * ((Size + 31) / 32);
    unsigned &NumIntermediates, MVT &RegisterVT) const {

    if (Size == 16 && Subtarget->has16BitInsts()) {
      if (ScalarVT == MVT::bf16) {
        RegisterVT = MVT::i32;
        IntermediateVT = MVT::v2bf16;
        RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
        IntermediateVT = RegisterVT;
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;

      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts * ((Size + 31) / 32);
      return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
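  // Worked example of the breakdown above (based on the cases shown): with
  // 16-bit instructions a v3f16 argument becomes two v2f16 intermediates in
  // two 32-bit registers (NumIntermediates = (3 + 1) / 2); types without a
  // packed form fall back to one i32 register per 32-bit chunk
  // (NumElts * ((Size + 31) / 32)).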
1215 unsigned MaxNumLanes) {
1216 assert(MaxNumLanes != 0);
1220 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1231 unsigned MaxNumLanes) {
1237 assert(ST->getNumContainedTypes() == 2 &&
1238 ST->getContainedType(1)->isIntegerTy(32));
1252 return MVT::amdgpuBufferFatPointer;
1254 DL.getPointerSizeInBits(AS) == 192)
1255 return MVT::amdgpuBufferStridedPointer;
1264 DL.getPointerSizeInBits(AS) == 160) ||
1266 DL.getPointerSizeInBits(AS) == 192))
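  // The cases below group the async global/cluster LDS-transfer and
  // cooperative-atomic intrinsics purely by access width (b8, b32/32x4B,
  // b64/16x8B, b128/8x16B) so the caller can derive the memory VT.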
1273 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1274 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1275 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1277 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1278 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1279 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1280 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1281 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1283 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1284 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1285 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1286 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1287 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1289 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1290 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1291 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1292 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1293 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
                                          unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1352 if (RsrcIntr->IsImage) {
1367 Info.ptrVal = RsrcArg;
1370 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1379 if (RsrcIntr->IsImage) {
1380 unsigned MaxNumLanes = 4;
1395 std::numeric_limits<unsigned>::max());
1405 if (RsrcIntr->IsImage) {
    if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
      Info.memVT = MVT::i32;
1435 case Intrinsic::amdgcn_raw_buffer_load_lds:
1436 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1437 case Intrinsic::amdgcn_struct_buffer_load_lds:
1438 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1444 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1445 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1446 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1447 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1450 std::numeric_limits<unsigned>::max());
1460 case Intrinsic::amdgcn_ds_ordered_add:
1461 case Intrinsic::amdgcn_ds_ordered_swap: {
1474 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1475 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;
1483 case Intrinsic::amdgcn_ds_append:
1484 case Intrinsic::amdgcn_ds_consume: {
1497 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1498 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1499 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1504 Info.memVT = MVT::i64;
1510 case Intrinsic::amdgcn_global_atomic_csub: {
1519 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1520 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1521 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1524 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1527 ->getElementType(0));
1535 case Intrinsic::amdgcn_global_atomic_fmin_num:
1536 case Intrinsic::amdgcn_global_atomic_fmax_num:
1537 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1538 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1539 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1540 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1550 case Intrinsic::amdgcn_flat_load_monitor_b32:
1551 case Intrinsic::amdgcn_flat_load_monitor_b64:
1552 case Intrinsic::amdgcn_flat_load_monitor_b128:
1553 case Intrinsic::amdgcn_global_load_monitor_b32:
1554 case Intrinsic::amdgcn_global_load_monitor_b64:
1555 case Intrinsic::amdgcn_global_load_monitor_b128:
1556 case Intrinsic::amdgcn_cluster_load_b32:
1557 case Intrinsic::amdgcn_cluster_load_b64:
1558 case Intrinsic::amdgcn_cluster_load_b128:
1559 case Intrinsic::amdgcn_ds_load_tr6_b96:
1560 case Intrinsic::amdgcn_ds_load_tr4_b64:
1561 case Intrinsic::amdgcn_ds_load_tr8_b64:
1562 case Intrinsic::amdgcn_ds_load_tr16_b128:
1563 case Intrinsic::amdgcn_global_load_tr6_b96:
1564 case Intrinsic::amdgcn_global_load_tr4_b64:
1565 case Intrinsic::amdgcn_global_load_tr_b64:
1566 case Intrinsic::amdgcn_global_load_tr_b128:
1567 case Intrinsic::amdgcn_ds_read_tr4_b64:
1568 case Intrinsic::amdgcn_ds_read_tr6_b96:
1569 case Intrinsic::amdgcn_ds_read_tr8_b64:
1570 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1578 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1579 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1580 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1588 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1589 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1590 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1598 case Intrinsic::amdgcn_ds_gws_init:
1599 case Intrinsic::amdgcn_ds_gws_barrier:
1600 case Intrinsic::amdgcn_ds_gws_sema_v:
1601 case Intrinsic::amdgcn_ds_gws_sema_br:
1602 case Intrinsic::amdgcn_ds_gws_sema_p:
1603 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1613 Info.memVT = MVT::i32;
    Info.align = Align(4);
1617 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1623 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1624 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1625 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1626 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1627 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1628 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1629 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1630 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1637 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1638 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1639 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1640 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1647 case Intrinsic::amdgcn_load_to_lds:
1648 case Intrinsic::amdgcn_global_load_lds: {
1656 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1657 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1658 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1659 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1669 Info.memVT = MVT::i32;
    Info.align = Align(4);
1676 case Intrinsic::amdgcn_s_prefetch_data:
1677 case Intrinsic::amdgcn_flat_prefetch:
1678 case Intrinsic::amdgcn_global_prefetch: {
1693 case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();

                                            Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
1712 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1713 case Intrinsic::amdgcn_cluster_load_b128:
1714 case Intrinsic::amdgcn_cluster_load_b64:
1715 case Intrinsic::amdgcn_cluster_load_b32:
1716 case Intrinsic::amdgcn_ds_append:
1717 case Intrinsic::amdgcn_ds_consume:
1718 case Intrinsic::amdgcn_ds_load_tr8_b64:
1719 case Intrinsic::amdgcn_ds_load_tr16_b128:
1720 case Intrinsic::amdgcn_ds_load_tr4_b64:
1721 case Intrinsic::amdgcn_ds_load_tr6_b96:
1722 case Intrinsic::amdgcn_ds_read_tr4_b64:
1723 case Intrinsic::amdgcn_ds_read_tr6_b96:
1724 case Intrinsic::amdgcn_ds_read_tr8_b64:
1725 case Intrinsic::amdgcn_ds_read_tr16_b64:
1726 case Intrinsic::amdgcn_ds_ordered_add:
1727 case Intrinsic::amdgcn_ds_ordered_swap:
1728 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1729 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1730 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1731 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1732 case Intrinsic::amdgcn_flat_load_monitor_b128:
1733 case Intrinsic::amdgcn_flat_load_monitor_b32:
1734 case Intrinsic::amdgcn_flat_load_monitor_b64:
1735 case Intrinsic::amdgcn_global_atomic_csub:
1736 case Intrinsic::amdgcn_global_atomic_fmax_num:
1737 case Intrinsic::amdgcn_global_atomic_fmin_num:
1738 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1739 case Intrinsic::amdgcn_global_load_monitor_b128:
1740 case Intrinsic::amdgcn_global_load_monitor_b32:
1741 case Intrinsic::amdgcn_global_load_monitor_b64:
1742 case Intrinsic::amdgcn_global_load_tr_b64:
1743 case Intrinsic::amdgcn_global_load_tr_b128:
1744 case Intrinsic::amdgcn_global_load_tr4_b64:
1745 case Intrinsic::amdgcn_global_load_tr6_b96:
1746 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1747 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1748 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1749 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
    Ptr = II->getArgOperand(0);
1752 case Intrinsic::amdgcn_load_to_lds:
1753 case Intrinsic::amdgcn_global_load_lds:
1754 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1755 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1756 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1757 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1758 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1759 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1760 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1761 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
    Ptr = II->getArgOperand(1);

  AccessTy = II->getType();
                                                  unsigned AddrSpace) const {
  if (!Subtarget->hasFlatInstOffsets()) {

  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
                                  AM.BaseOffs, AddrSpace, FlatVariant));
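// In other words, with flat instruction offsets a flat addressing mode is only
// legal when there is no scaled index and any constant offset fits the
// encodable FLAT immediate range for this address space and flat variant.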
1791 if (Subtarget->hasFlatGlobalInsts())
1794 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1807 return isLegalMUBUFAddressingMode(AM);
bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {

  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1833 if (AM.HasBaseReg) {
1865 return isLegalMUBUFAddressingMode(AM);
1867 if (!Subtarget->hasScalarSubwordLoads()) {
    if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1920 return Subtarget->enableFlatScratch()
1922 : isLegalMUBUFAddressingMode(AM);
    unsigned Size, unsigned AddrSpace, Align Alignment,

    if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))

    Align RequiredAlignment(
    if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
        Alignment < RequiredAlignment)

      if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))

      RequiredAlignment = Align(4);
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 64
                    : (Alignment < Align(4))         ? 32

      if (!Subtarget->hasDS96AndDS128())

      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 96
                    : (Alignment < Align(4))         ? 32

      if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())

      RequiredAlignment = Align(8);

      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 128
                    : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||
           Subtarget->hasUnalignedDSAccessEnabled();
    bool AlignedBy4 = Alignment >= Align(4);
    if (Subtarget->hasUnalignedScratchAccessEnabled()) {
        *IsFast = AlignedBy4 ? Size : 1;

      *IsFast = AlignedBy4;

    return Alignment >= Align(4) ||
           Subtarget->hasUnalignedBufferAccessEnabled();

  if (!Subtarget->hasRelaxedBufferOOBMode() &&

  return Size >= 32 && Alignment >= Align(4);
    unsigned *IsFast) const {
                                            Alignment, Flags, IsFast);

    const AttributeList &FuncAttributes) const {

  if (Op.size() >= 16 &&

  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
                                           unsigned DestAS) const {
      Subtarget->hasGloballyAddressableScratch()) {

                                               unsigned Index) const {

  if (Subtarget->has16BitInsts() && VT == MVT::i16) {

  auto [InputPtrReg, RC, ArgTy] =
      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                          const SDLoc &SL) const {

                                          const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(

    int64_t OffsetDiff = Offset - AlignDownOffset;

    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);

    ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
                                              const SDLoc &SL) const {

  return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);

      ExtType, SL, VA.getLocVT(), Chain, FIN,

  SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
  if (ConvertedVal == ArgValue)
    return ConvertedVal;
SDValue SITargetLowering::lowerWorkGroupId(

  if (!Subtarget->hasClusters())
    return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);

  SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
  SDLoc SL(ClusterIdXYZ);
  SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);

  SDValue ClusterWorkGroupIdXYZ =
      getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);

    return ClusterIdXYZ;

  using namespace AMDGPU::Hwreg;
      DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);

SDValue SITargetLowering::getPreloadedValue(
  const ArgDescriptor *Reg = nullptr;
2503 const TargetRegisterClass *RC;
2507 const ArgDescriptor WorkGroupIDX =
2515 const ArgDescriptor WorkGroupIDZ =
2517 const ArgDescriptor ClusterWorkGroupIDX =
2519 const ArgDescriptor ClusterWorkGroupIDY =
2521 const ArgDescriptor ClusterWorkGroupIDZ =
2523 const ArgDescriptor ClusterWorkGroupMaxIDX =
2525 const ArgDescriptor ClusterWorkGroupMaxIDY =
2527 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2529 const ArgDescriptor ClusterWorkGroupMaxFlatID =
  auto LoadConstant = [&](unsigned N) {
  if (Subtarget->hasArchitectedSGPRs() &&

    Reg = &WorkGroupIDX;
    RC = &AMDGPU::SReg_32RegClass;

    Reg = &WorkGroupIDY;
    RC = &AMDGPU::SReg_32RegClass;

    Reg = &WorkGroupIDZ;
    RC = &AMDGPU::SReg_32RegClass;

    if (HasFixedDims && ClusterDims.getDims()[0] == 1)
      return LoadConstant(0);
    Reg = &ClusterWorkGroupIDX;
    RC = &AMDGPU::SReg_32RegClass;

    if (HasFixedDims && ClusterDims.getDims()[1] == 1)
      return LoadConstant(0);
    Reg = &ClusterWorkGroupIDY;
    RC = &AMDGPU::SReg_32RegClass;

    if (HasFixedDims && ClusterDims.getDims()[2] == 1)
      return LoadConstant(0);
    Reg = &ClusterWorkGroupIDZ;
    RC = &AMDGPU::SReg_32RegClass;

      return LoadConstant(ClusterDims.getDims()[0] - 1);
    Reg = &ClusterWorkGroupMaxIDX;
    RC = &AMDGPU::SReg_32RegClass;

      return LoadConstant(ClusterDims.getDims()[1] - 1);
    Reg = &ClusterWorkGroupMaxIDY;
    RC = &AMDGPU::SReg_32RegClass;

      return LoadConstant(ClusterDims.getDims()[2] - 1);
    Reg = &ClusterWorkGroupMaxIDZ;
    RC = &AMDGPU::SReg_32RegClass;

    Reg = &ClusterWorkGroupMaxFlatID;
    RC = &AMDGPU::SReg_32RegClass;
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
           "vector type argument should have been split");

      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);

             "unexpected vector split in ps argument type");

      Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);
  if (Info.hasWorkItemIDX()) {
        (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;

  if (Info.hasWorkItemIDY()) {
    assert(Info.hasWorkItemIDX());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDY(
      unsigned Reg = AMDGPU::VGPR1;

  if (Info.hasWorkItemIDZ()) {
    assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDZ(
      unsigned Reg = AMDGPU::VGPR2;

  if (RegIdx == ArgVGPRs.size()) {
  unsigned Reg = ArgVGPRs[RegIdx];

                              unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())

  unsigned Reg = ArgSGPRs[RegIdx];

  const unsigned Mask = 0x3ff;
  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);

  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);

  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;

  auto &ArgInfo = Info.getArgInfo();
  if (Info.hasImplicitArgPtr())

  if (Info.hasWorkGroupIDX())

  if (Info.hasWorkGroupIDY())

  if (Info.hasWorkGroupIDZ())

  if (Info.hasLDSKernelId())

    Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);

    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);

    Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
  unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();

  bool InPreloadSequence = true;

  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    unsigned ArgIdx = Arg.getArgNo();
    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];

      unsigned ArgOffset = ArgLoc.getLocMemOffset();

      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
                      Subtarget->getAlignmentForImplicitArgPtr()) -
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;

        ArgOffset += ImplicitArgOffset;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;

        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);

          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;

  if (Info.hasLDSKernelId()) {
    Register Reg = Info.addLDSKernelId();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
                                           bool IsShader) const {
  bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
  if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {

    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();

    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      Register Reg = Info.addReservedUserSGPR();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      Register Reg = Info.addWorkGroupIDX();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDY()) {
      Register Reg = Info.addWorkGroupIDY();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDZ()) {
      Register Reg = Info.addWorkGroupIDZ();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {
    Register Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {

    unsigned PrivateSegmentWaveByteOffsetReg;

      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);

      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

  assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
         Info.getNumPreloadedSGPRs() >= 16);
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {

      Info.setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);

      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);

    if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);

    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)

  bool IsError = false;
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));

      !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
      !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
  if (!Subtarget->enableFlatScratch())

      !Subtarget->hasArchitectedSGPRs())
    assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
           !Info->hasWorkGroupIDZ());

  bool IsWholeWaveFunc = Info->isWholeWaveFunction();

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {

      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    if (Subtarget->isAmdPalOS()) {

      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
  } else if (IsKernel) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());

    Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),

    if (IsKernel && Subtarget->hasKernargPreload())

  } else if (!IsGraphics) {

    if (!Subtarget->enableFlatScratch())

    Info->setNumWaveDispatchSGPRs(
    Info->setNumWaveDispatchVGPRs(
  } else if (Info->getNumKernargPreloadedSGPRs()) {
    Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());

  if (IsWholeWaveFunc) {
                                  {MVT::i1, MVT::Other}, Chain);

  for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
    if (IsEntryFunc && VA.isMemLoc()) {

      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {

        int64_t OffsetDiff = Offset - AlignDownOffset;

            Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];

          ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;

          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
                                 TRI->getRegSizeInBits(*RC)));

            for (auto Reg : PreloadRegs) {
                                               PreloadRegs.size()),

          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

            "hidden argument in kernel signature was not preloaded",

            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);

    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

      if (AMDGPU::VGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::VGPR_32RegClass;
      else if (AMDGPU::SGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::SGPR_32RegClass;

      Val = convertABITypeToValueType(DAG, Val, VA, DL);

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain
                                    const Type *RetTy) const {

  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);

  unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
  unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
    if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  Info->setIfReturnsVoid(Outs.empty());
  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {

    SDValue Arg = OutVals[RealRVLocIdx];

                        ReadFirstLane, Arg);

  if (!Info->isEntryFunction()) {

      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))
  auto &ArgUsageInfo =
  CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

    const auto [OutgoingArg, ArgRC, ArgTy] =

    const auto [IncomingArg, IncomingArgRC, Ty] =
    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;

      InputReg = getImplicitArgPtr(DAG, DL);

      std::optional<uint32_t> Id =
      if (Id.has_value()) {

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))

      unsigned SpecialArgOffset =
  auto [OutgoingArg, ArgRC, Ty] =

    std::tie(OutgoingArg, ArgRC, Ty) =

    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

    if (Subtarget->getMaxWorkitemID(F, 0) != 0) {

      NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {

      NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {

                                 : IncomingArgY ? *IncomingArgY

  if (OutgoingArg->isRegister()) {

    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);

  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {

    if (!CCVA.isRegLoc())

    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
        dbgs() << "Cannot tail call due to divergent outgoing argument in "
enum ChainCallArgIdx {

  bool UsesDynamicVGPRs = false;
  if (IsChainCallConv) {

    auto RequestedExecIt =
          return Arg.OrigArgIndex == 2;
    assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");

    size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();

    CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());

           "Haven't popped all the special args");

        CLI.Args[ChainCallArgIdx::Exec];
    if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))

            ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));

        ChainCallSpecialArgs.push_back(Arg.Node);

    PushNodeOrTargetConstant(RequestedExecArg);

    if (FlagsValue.isZero()) {
      if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
                           "no additional args allowed if flags == 0");

      if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {

      if (!Subtarget->isWave32()) {
            CLI, InVals, "dynamic VGPR mode is only supported for wave32");

      UsesDynamicVGPRs = true;
      std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
                    CLI.Args.end(), PushNodeOrTargetConstant);
  bool IsSibCall = false;

                        "unsupported call to variadic function ");

                      "unsupported required tail call to function ");

                                           Outs, OutVals, Ins, DAG);

               "site marked musttail or on llvm.amdgcn.cs.chain");

  if (!TailCallOpt && IsTailCall)

  auto *TRI = Subtarget->getRegisterInfo();

  if (!IsSibCall || IsChainCallConv) {
    if (!Subtarget->enableFlatScratch()) {

      RegsToPass.emplace_back(IsChainCallConv
                                  ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                  : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  const unsigned NumSpecialInputs = RegsToPass.size();

  MVT PtrVT = MVT::i32;

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {

      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
      int32_t Offset = LocMemOffset;

      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
                            ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
                                Outs[i].Flags.getNonZeroByValAlign(),
                                nullptr, std::nullopt, DstInfo,

            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

    TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops({Chain});

    Ops.push_back(Callee);

    Ops.push_back(Callee);

    if (IsChainCallConv)

  for (auto &[Reg, Val] : RegsToPass)

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

                                     MVT::Glue, GlueOps),

    Ops.push_back(InGlue);

  if (Info->isWholeWaveFunction())

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
  EVT VT = Op.getValueType();

         "Stack grows upwards for AMDGPU");

  Chain = BaseAddr.getValue(1);

  if (Alignment > StackAlign) {
        << Subtarget->getWavefrontSizeLog2();
    uint64_t StackAlignMask = ScaledAlignment - 1;

  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");

      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));

      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
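  // Scratch is allocated per lane but the stack pointer holds per-wave byte
  // offsets, so the dynamic allocation size/alignment is scaled by the
  // wavefront size (the << getWavefrontSizeLog2() shifts above).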
  if (Op.getValueType() != MVT::i32)

  assert(Op.getValueType() == MVT::i32);
                       Op.getOperand(0), IntrinID, GetRoundBothImm);

  SDValue RoundModeTimesNumBits =

                                  TableEntry, EnumOffset);

      static_cast<uint32_t>(ConstMode->getZExtValue()),

  if (UseReducedTable) {

    SDValue RoundModeTimesNumBits =

  SDValue RoundModeTimesNumBits =

    NewMode = TruncTable;

                          ReadFirstLaneID, NewMode);

                              IntrinID, RoundBothImm, NewMode);

  if (Op->isDivergent() &&
      (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))

  if (Subtarget->hasSafeSmemPrefetch())

  if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();

  EVT DstVT = Op.getValueType();

  return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);

  if (Op.getValueType() != MVT::i64)

                            Op.getOperand(0), IntrinID, ModeHwRegImm);
                            Op.getOperand(0), IntrinID, TrapHwRegImm);

  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);

  if (Op.getOperand(1).getValueType() != MVT::i64)

                              ReadFirstLaneID, NewModeReg);
                              ReadFirstLaneID, NewTrapReg);

  unsigned ModeHwReg =
  unsigned TrapHwReg =

                                  IntrinID, ModeHwRegImm, NewModeReg);
                                  IntrinID, TrapHwRegImm, NewTrapReg);
          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (!Subtarget->hasFlatScrRegister() &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
                           "\" for subtarget."));

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:

  case AMDGPU::FLAT_SCR:

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
static std::pair<MachineBasicBlock *, MachineBasicBlock *>

  auto Next = std::next(I);

  MBB.addSuccessor(LoopBB);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

  Src->setIsKill(false);

  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
                                 unsigned InitReg, unsigned ResultReg,
                                 unsigned PhiReg,
                                 unsigned InitSaveExecReg, int Offset,
                                 bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
      MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
    SGPRIdxReg = CurrentIdxReg;

    SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)

    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
                          unsigned InitResultReg, unsigned PhiReg, int Offset,
                          bool UseGPRIdxMode, Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);

                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);

  LoopBB->removeSuccessor(RemainderBB);

  LoopBB->addSuccessor(LandingPad);

static std::pair<unsigned, int>

  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

    return std::pair(AMDGPU::sub0, Offset);
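// computeIndirectRegAndOffset() maps a constant vector index to a
// (sub-register index, residual offset) pair; indices that cannot be resolved
// statically fall back to AMDGPU::sub0 with the offset left for the
// dynamic-index expansion.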
  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {

          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {

    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(VecRC);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)

  MI.eraseFromParent();
  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
  if (ST.hasScalarAddSub64()) {
    unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  MI.eraseFromParent();
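// Identity (neutral) elements for the wave-reduction pseudos expanded below:
// umin -> UINT32_MAX, smin -> INT32_MAX, umax -> 0, smax -> INT32_MIN,
// add/sub/or/xor -> 0, and -> all-ones.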
  case AMDGPU::S_MIN_U32:
    return std::numeric_limits<uint32_t>::max();
  case AMDGPU::S_MIN_I32:
    return std::numeric_limits<int32_t>::max();
  case AMDGPU::S_MAX_U32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_MAX_I32:
    return std::numeric_limits<int32_t>::min();
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_AND_B32:
    return std::numeric_limits<uint32_t>::max();
      "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
  case AMDGPU::V_CMP_LT_U64_e64:
    return std::numeric_limits<uint64_t>::max();
  case AMDGPU::V_CMP_LT_I64_e64:
    return std::numeric_limits<int64_t>::max();
  case AMDGPU::V_CMP_GT_U64_e64:
    return std::numeric_limits<uint64_t>::min();
  case AMDGPU::V_CMP_GT_I64_e64:
    return std::numeric_limits<int64_t>::min();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
    return std::numeric_limits<uint64_t>::min();
  case AMDGPU::S_AND_B64:
    return std::numeric_limits<uint64_t>::max();
      "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
  return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
         Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
         Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
         Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
         Opc == AMDGPU::S_XOR_B32;

  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
    case AMDGPU::S_MIN_U32:
    case AMDGPU::S_MIN_I32:
    case AMDGPU::S_MAX_U32:
    case AMDGPU::S_MAX_I32:
    case AMDGPU::S_AND_B32:
    case AMDGPU::S_OR_B32: {

    case AMDGPU::V_CMP_LT_U64_e64:
    case AMDGPU::V_CMP_LT_I64_e64:
    case AMDGPU::V_CMP_GT_U64_e64:
    case AMDGPU::V_CMP_GT_I64_e64:
    case AMDGPU::S_AND_B64:
    case AMDGPU::S_OR_B64: {

    case AMDGPU::S_XOR_B32:
    case AMDGPU::S_XOR_B64:
    case AMDGPU::S_ADD_I32:
    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::S_SUB_I32:
    case AMDGPU::S_SUB_U64_PSEUDO: {

      Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

      bool IsWave32 = ST.isWave32();
      unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
      unsigned BitCountOpc =
          IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
      auto NewAccumulator =

      case AMDGPU::S_XOR_B32:
      case AMDGPU::S_XOR_B64: {
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

            .addReg(NewAccumulator->getOperand(0).getReg())

        if (Opc == AMDGPU::S_XOR_B32) {

              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

              TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);

              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);

          BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)

      case AMDGPU::S_SUB_I32: {
        Register NegatedVal = MRI.createVirtualRegister(DstRegClass);

            .addReg(NewAccumulator->getOperand(0).getReg());

      case AMDGPU::S_ADD_I32: {
            .addReg(NewAccumulator->getOperand(0).getReg());
      case AMDGPU::S_ADD_U64_PSEUDO:
      case AMDGPU::S_SUB_U64_PSEUDO: {
        Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

            TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);

            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);

        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
              .addReg(NewAccumulator->getOperand(0).getReg())

        Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
                                 : NewAccumulator->getOperand(0).getReg();

        Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;

        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {

        BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
  Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
  Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
  Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
  Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
  Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
  Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

  bool IsWave32 = ST.isWave32();
  unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)

    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)

  I = ComputeLoop->begin();

      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)

      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)

  I = ComputeLoop->end();

  unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;

        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),

        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

        TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);

        MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
        MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);

    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),

    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),

    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                             TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
    case AMDGPU::S_OR_B64:
    case AMDGPU::S_AND_B64:
    case AMDGPU::S_XOR_B64: {
          .addReg(LaneValue->getOperand(0).getReg())

    case AMDGPU::V_CMP_GT_I64_e64:
    case AMDGPU::V_CMP_GT_U64_e64:
    case AMDGPU::V_CMP_LT_I64_e64:
    case AMDGPU::V_CMP_LT_U64_e64: {
      Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
          MRI.createVirtualRegister(WaveMaskRegClass);

          TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
      Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);

                           VregClass, AMDGPU::sub0, VSubRegClass);
                           VregClass, AMDGPU::sub1, VSubRegClass);
      BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),

          .addReg(LaneValue->getOperand(0).getReg())
          .addReg(AccumulatorVReg);

      unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
      BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)

      NewAccumulator = BuildMI(*ComputeLoop, I, DL,
                               TII->get(AMDGPU::S_CSELECT_B64), DstReg)
                           .addReg(LaneValue->getOperand(0).getReg())

    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::S_SUB_U64_PSEUDO: {
          .addReg(LaneValue->getOperand(0).getReg());

  unsigned BITSETOpc =
      IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
  BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)

  ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);

  unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
      .addReg(NewActiveBitsReg)
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))

  MI.eraseFromParent();
  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {

    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       : AMDGPU::S_SUB_U32;

        Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;

    MI.eraseFromParent();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {

  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {
    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);

    if (ST.hasAddSubU64Insts()) {
                   TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
                                  : AMDGPU::V_SUB_U64_e64),

      TII->legalizeOperands(*I);
      MI.eraseFromParent();

    if (IsAdd && ST.hasLshlAddU64Inst()) {
      TII->legalizeOperands(*Add);
      MI.eraseFromParent();
6008 const auto *CarryRC =
TRI->getWaveMaskRegClass();
6010 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6011 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6013 Register CarryReg =
MRI.createVirtualRegister(CarryRC);
6014 Register DeadCarryReg =
MRI.createVirtualRegister(CarryRC);
6018 : &AMDGPU::VReg_64RegClass;
6021 : &AMDGPU::VReg_64RegClass;
6024 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6026 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6029 MI,
MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6031 MI,
MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6034 MI,
MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6036 MI,
MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6039 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6046 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6060 TII->legalizeOperands(*LoHalf);
6061 TII->legalizeOperands(*HiHalf);
6062 MI.eraseFromParent();
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {
    Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
    Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
    Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
    if (ST.isWave64()) {
      if (ST.hasScalarCompareEq64()) {
          TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
          MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
          MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
    unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;
        ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    MI.eraseFromParent();
  case AMDGPU::SI_INIT_M0: {
            TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
    MI.eraseFromParent();
  case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
            TII->get(AMDGPU::S_CMP_EQ_U32))
  case AMDGPU::GET_GROUPSTATICSIZE: {
        .add(MI.getOperand(0))
    MI.eraseFromParent();
  case AMDGPU::GET_SHADERCYCLESHILO: {
    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .add(MI.getOperand(0))
    MI.eraseFromParent();
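  // The SI_INDIRECT_SRC/DST pseudos below implement dynamically indexed
  // vector element reads and writes, with one pseudo per supported vector
  // width.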
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    Register SrcCond = MI.getOperand(3).getReg();
    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getWaveMaskRegClass();
    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
                             : &AMDGPU::VReg_64RegClass;
                             : &AMDGPU::VReg_64RegClass;
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
    MI.eraseFromParent();
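    // V_CNDMASK_B64_PSEUDO above is expanded as per-half 32-bit selects on
    // the sub0 and sub1 pieces, both keyed off the same copied condition
    // mask.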
  case AMDGPU::SI_BR_UNDEF: {
        .add(MI.getOperand(0));
    MI.eraseFromParent();
  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {
  case AMDGPU::SI_CALL_ISEL: {
    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
    MI.eraseFromParent();
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {
    unsigned Opc = MI.getOpcode();
    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      NeedClampOperand = true;
    if (TII->isVOP3(*I)) {
    I.add(MI.getOperand(1)).add(MI.getOperand(2));
    if (NeedClampOperand)
    TII->legalizeOperands(*I);
    MI.eraseFromParent();
  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBBREV_U32_e32:
    TII->legalizeOperands(MI);
  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_BARRIER:
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
  case AMDGPU::S_SETREG_B32: {
    const unsigned SetMask = WidthMask << Offset;
    unsigned SetDenormOp = 0;
    unsigned SetRoundOp = 0;
        SetRoundOp = AMDGPU::S_ROUND_MODE;
        SetDenormOp = AMDGPU::S_DENORM_MODE;
        SetRoundOp = AMDGPU::S_ROUND_MODE;
        SetDenormOp = AMDGPU::S_DENORM_MODE;
    if (SetRoundOp || SetDenormOp) {
      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
        unsigned ImmVal = Def->getOperand(1).getImm();
          MI.eraseFromParent();
    MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64:
    MI.setDesc(TII->get(AMDGPU::COPY));
  case AMDGPU::ENDPGM_TRAP: {
    MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
    MI.eraseFromParent();
  case AMDGPU::SIMULATED_TRAP: {
    assert(Subtarget->hasPrivEnabledTrap2NopBug());
    TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
    MI.eraseFromParent();
  case AMDGPU::SI_TCRETURN_GFX_WholeWave:
  case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
    assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
    Register OriginalExec = Setup->getOperand(0).getReg();
    MI.getOperand(0).setReg(OriginalExec);
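// Target hooks queried by the DAG combiner and legalizer: preferred types for
// small integers, and whether FMA/FMAD is profitable for a given type on this
// subtarget.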
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
  return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
  if (!Subtarget->hasMadMacF32Insts())
    return Subtarget->hasFastFMAF32();
    return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
  return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
  switch (Ty.getScalarSizeInBits()) {
  if (Ty.getScalarSizeInBits() == 16)
  if (Ty.getScalarSizeInBits() == 32)
    return Subtarget->hasMadMacF32Insts() &&
  EVT VT = N->getValueType(0);
    return Subtarget->hasMadMacF32Insts() &&
  if (VT == MVT::f16) {
    return Subtarget->hasMadF16() &&
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
         VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
         VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
         VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
         VT == MVT::v32bf16);
  [[maybe_unused]] EVT VT = Op.getValueType();
  assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
          VT == MVT::v16i32) &&
         "Unexpected ValueType.");
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
         VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
         VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
         VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
         VT == MVT::v32bf16);
      DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
      DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
         VT == MVT::v32bf16);
                               : std::pair(Op0, Op0);
      DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
      DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
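// LowerOperation(): top-level dispatch for custom-lowered operations; each
// case forwards to the dedicated lowering routine for that opcode.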
  switch (Op.getOpcode()) {
    return LowerBRCOND(Op, DAG);
    return LowerRETURNADDR(Op, DAG);
    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    EVT VT = Op.getValueType();
      return lowerFSQRTF32(Op, DAG);
      return lowerFSQRTF64(Op, DAG);
    return LowerTrig(Op, DAG);
    return LowerSELECT(Op, DAG);
    return LowerFDIV(Op, DAG);
    return LowerFFREXP(Op, DAG);
  case ISD::ATOMIC_CMP_SWAP:
    return LowerATOMIC_CMP_SWAP(Op, DAG);
    return LowerSTORE(Op, DAG);
    return LowerGlobalAddress(MFI, Op, DAG);
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
    return LowerINTRINSIC_VOID(Op, DAG);
  case ISD::ADDRSPACECAST:
    return lowerADDRSPACECAST(Op, DAG);
    return lowerINSERT_SUBVECTOR(Op, DAG);
    return lowerINSERT_VECTOR_ELT(Op, DAG);
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    return lowerVECTOR_SHUFFLE(Op, DAG);
    return lowerSCALAR_TO_VECTOR(Op, DAG);
    return lowerBUILD_VECTOR(Op, DAG);
    return lowerFP_ROUND(Op, DAG);
    return lowerTRAP(Op, DAG);
  case ISD::DEBUGTRAP:
    return lowerDEBUGTRAP(Op, DAG);
    return lowerFMINNUM_FMAXNUM(Op, DAG);
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
    return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
    return lowerFMINIMUM_FMAXIMUM(Op, DAG);
    return lowerFLDEXP(Op, DAG);
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
    return lowerFCOPYSIGN(Op, DAG);
    return lowerMUL(Op, DAG);
    return lowerXMULO(Op, DAG);
    return lowerXMUL_LOHI(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
  case ISD::STACKSAVE:
  case ISD::SET_ROUNDING:
  case ISD::FP_EXTEND:
  case ISD::GET_FPENV:
  case ISD::SET_FPENV:
  EVT FittingLoadVT = LoadVT;
    return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
  return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
                                              bool IsIntrinsic) const {
  bool Unpacked = Subtarget->hasUnpackedD16VMem();
  EVT LoadVT = M->getValueType(0);
  EVT EquivLoadVT = LoadVT;
  SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
                   M->getMemoryVT(), M->getMemOperand());
  EVT LoadVT = M->getValueType(0);
  assert(M->getNumValues() == 2 || M->getNumValues() == 3);
  bool IsTFE = M->getNumValues() == 3;
    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
    return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
                               M->getMemOperand(), DAG);
  SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
                              M->getMemOperand(), DAG);
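// Lowering for the amdgcn compare/ballot intrinsics (promoting i16/f16
// operands to 32 bits first) and for the lane intrinsics (readlane,
// writelane, permlane*, update.dpp, set.inactive), which split wide values
// into 32-bit pieces and emit one lane operation per piece.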
  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);
  EVT CmpVT = LHS.getValueType();
  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
    unsigned PromoteOp =
  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);
  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
    Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
    Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
  EVT VT = N->getValueType(0);
                       Src.getOperand(1), Src.getOperand(2));
    Exec = AMDGPU::EXEC_LO;
    Exec = AMDGPU::EXEC;
  EVT VT = N->getValueType(0);
  unsigned IID = N->getConstantOperandVal(0);
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
      ST->hasDPALU_DPP() &&
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16:
    case Intrinsic::amdgcn_update_dpp:
    case Intrinsic::amdgcn_writelane:
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
    case Intrinsic::amdgcn_mov_dpp8:
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
    if (SDNode *GL = N->getGluedNode()) {
      assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
      GL = GL->getOperand(0).getNode();
      Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_mov_dpp8 ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = N->getOperand(2);
    if (IID == Intrinsic::amdgcn_writelane ||
        IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
      Src2 = N->getOperand(3);
  if (ValSize == SplitSize) {
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    if (IID == Intrinsic::amdgcn_writelane) {
    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
    return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
  if (ValSize % SplitSize != 0)
    EVT VT = N->getValueType(0);
    unsigned NumOperands = N->getNumOperands();
    SDNode *GL = N->getGluedNode();
    for (unsigned i = 0; i != NE; ++i) {
      for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
        SDValue Operand = N->getOperand(j);
            DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
  if (SplitSize == 32) {
    return unrollLaneOp(LaneOp.getNode());
  unsigned SubVecNumElt =
  SDValue Src0SubVec, Src1SubVec, Src2SubVec;
  for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
    if (IID == Intrinsic::amdgcn_writelane)
        IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
            ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
            : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
    EltIdx += SubVecNumElt;
  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
  if (IID == Intrinsic::amdgcn_writelane)
  SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
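// ReplaceNodeResults(): provide replacement values for results with illegal
// types, e.g. the packed cvt_pk* conversions and sub-dword s_buffer_load.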
  switch (N->getOpcode()) {
    unsigned IID = N->getConstantOperandVal(0);
    case Intrinsic::amdgcn_make_buffer_rsrc:
      Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
    case Intrinsic::amdgcn_cvt_pkrtz: {
      Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {
      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
      EVT VT = N->getValueType(0);
      Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
    case Intrinsic::amdgcn_s_buffer_load: {
      if (!Subtarget->hasScalarSubwordLoads())
      EVT VT = Op.getValueType();
      assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
      if (!Offset->isDivergent()) {
      LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
    case Intrinsic::amdgcn_dead: {
      for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
    for (unsigned I = 0; I < Res.getNumOperands(); I++) {
      Results.push_back(Res.getOperand(I));
    Results.push_back(Res.getValue(1));
    EVT VT = N->getValueType(0);
    EVT SelectVT = NewVT;
    if (NewVT.bitsLT(MVT::i32)) {
      SelectVT = MVT::i32;
    if (NewVT != SelectVT)
    if (N->getValueType(0) != MVT::v2f16)
    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
    if (N->getValueType(0) != MVT::v2f16)
    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
    if (N->getValueType(0) != MVT::f16)
    if (U.get() != Value)
    if (U.getUser()->getOpcode() == Opcode)
unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else:
  case Intrinsic::amdgcn_loop:
  case Intrinsic::amdgcn_end_cf:
  if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
  SDNode *Intr = BRCOND.getOperand(1).getNode();
    Intr = LHS.getNode();
  assert(BR && "brcond missing unconditional branch user");
  unsigned CFNode = isCFIntrinsic(Intr);
    Ops.push_back(Target);
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
  MVT VT = Op.getSimpleValueType();
  if (Op.getConstantOperandVal(0) != 0)
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  if (Info->isEntryFunction())
  return Op.getValueType().bitsLE(VT)
  EVT DstVT = Op.getValueType();
  unsigned Opc = Op.getOpcode();
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();
  assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
  return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
  if (DstVT == MVT::f16) {
    if (!Subtarget->has16BitInsts()) {
        return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
      if (Op->getFlags().hasApproximateFuncs()) {
        return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
         "custom lower FP_ROUND for f16 or bf16");
  assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
  EVT VT = Op.getValueType();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  bool IsIEEEMode = Info->getMode().IEEE;
  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
  EVT VT = Op.getValueType();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  bool IsIEEEMode = Info->getMode().IEEE;
  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
  EVT VT = Op.getValueType();
  assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
         !Subtarget->hasMinimum3Maximum3F16() &&
         Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
         "should not need to widen f16 minimum/maximum to v2f16");
      DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
  EVT VT = Op.getValueType();
  EVT ExpVT = Exp.getValueType();
  if (ExpVT == MVT::i16)
                       {Op.getOperand(0), Op.getOperand(1), TruncExp});
  return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
  switch (Op->getOpcode()) {
                                              DAGCombinerInfo &DCI) const {
  const unsigned Opc = Op.getOpcode();
                  : Op->getOperand(0).getValueType();
  if (DCI.isBeforeLegalizeOps() ||
  auto &DAG = DCI.DAG;
    LHS = Op->getOperand(1);
    RHS = Op->getOperand(2);
    LHS = Op->getOperand(0);
    RHS = Op->getOperand(1);
  if (MagVT == SignVT)
  SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
  SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
  EVT VT = Op.getValueType();
  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
  if (Op->isDivergent())
  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
  if (Op0SignBits >= 33 && Op1SignBits >= 33)
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
  EVT VT = Op.getValueType();
    const APInt &C = RHSC->getAPIntValue();
    if (C.isPowerOf2()) {
      bool UseArithShift = isSigned && !C.isMinSignedValue();
  if (Op->isDivergent()) {
    if (Subtarget->hasSMulHi()) {
  if (!Subtarget->isTrapHandlerEnabled() ||
    return lowerTrapEndpgm(Op, DAG);
  return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
                                            : lowerTrapHsaQueuePtr(Op, DAG);
8051SITargetLowering::loadImplicitKernelArgument(
SelectionDAG &DAG,
MVT VT,
8053 ImplicitParameter Param)
const {
8073 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
8076 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8079 if (UserSGPR == AMDGPU::NoRegister) {
8105 if (Subtarget->hasPrivEnabledTrap2NopBug())
8118 if (!Subtarget->isTrapHandlerEnabled() ||
8122 "debugtrap handler not supported",
8133SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
8135 if (Subtarget->hasApertureRegs()) {
8137 ? AMDGPU::SRC_SHARED_BASE
8138 : AMDGPU::SRC_PRIVATE_BASE;
8139 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8140 !Subtarget->hasGloballyAddressableScratch()) &&
8141 "Cannot use src_private_base with globally addressable scratch!");
8162 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
8166 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8168 if (UserSGPR == AMDGPU::NoRegister) {
8202 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8213 const AMDGPUTargetMachine &TM =
8216 unsigned DestAS, SrcAS;
8218 bool IsNonNull =
false;
8220 SrcAS = ASC->getSrcAddressSpace();
8221 Src = ASC->getOperand(0);
8222 DestAS = ASC->getDestAddressSpace();
8225 Op.getConstantOperandVal(0) ==
8226 Intrinsic::amdgcn_addrspacecast_nonnull);
8227 Src =
Op->getOperand(1);
8228 SrcAS =
Op->getConstantOperandVal(2);
8229 DestAS =
Op->getConstantOperandVal(3);
8242 Subtarget->hasGloballyAddressableScratch()) {
8247 AMDGPU::S_MOV_B32, SL, MVT::i32,
8248 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8256 unsigned NullVal = TM.getNullPointerValue(DestAS);
8271 Subtarget->hasGloballyAddressableScratch()) {
8280 if (Subtarget->isWave64())
8286 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8289 CvtPtr = DAG.
getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8294 AMDGPU::S_MOV_B64, SL, MVT::i64,
8295 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8297 CvtPtr = DAG.
getNode(
ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8299 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8301 CvtPtr = DAG.
getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8307 unsigned NullVal = TM.getNullPointerValue(SrcAS);
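// Vector element insert/extract lowering: 64-bit-or-smaller vectors are
// manipulated as packed integers (bitfield insert and shifts after a
// bitcast), while 128/256/512-bit vectors are first split into smaller parts.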
8319 Op.getValueType() == MVT::i64) {
8320 const SIMachineFunctionInfo *
Info =
8324 return DAG.
getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8328 Src.getValueType() == MVT::i64)
8348 EVT InsVT =
Ins.getValueType();
8356 assert(InsNumElts % 2 == 0 &&
"expect legal vector types");
8361 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8363 MVT::i32, InsNumElts / 2);
8365 Vec = DAG.
getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8366 Ins = DAG.
getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8368 for (
unsigned I = 0;
I != InsNumElts / 2; ++
I) {
8370 if (InsNumElts == 2) {
8380 return DAG.
getNode(ISD::BITCAST, SL, VecVT, Vec);
8383 for (
unsigned I = 0;
I != InsNumElts; ++
I) {
8406 if (NumElts == 4 && EltSize == 16 && KIdx) {
8414 SDValue LoVec = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8415 SDValue HiVec = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8417 unsigned Idx = KIdx->getZExtValue();
8418 bool InsertLo = Idx < 2;
8421 DAG.
getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8422 DAG.
getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8424 InsHalf = DAG.
getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8428 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8441 assert(VecSize <= 64 &&
"Expected target vector size to be <= 64 bits");
8469 return DAG.
getNode(ISD::BITCAST, SL, VecVT, BFI);
8476 EVT ResultVT =
Op.getValueType();
8489 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
8492 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8496 if (VecSize == 128) {
8504 }
else if (VecSize == 256) {
8507 for (
unsigned P = 0;
P < 4; ++
P) {
8513 Parts[0], Parts[1]));
8515 Parts[2], Parts[3]));
8521 for (
unsigned P = 0;
P < 8; ++
P) {
8528 Parts[0], Parts[1], Parts[2], Parts[3]));
8531 Parts[4], Parts[5], Parts[6], Parts[7]));
8551 Src = DAG.
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8566 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8568 return DAG.
getNode(ISD::BITCAST, SL, ResultVT, Result);
8576 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8581 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8582 !(Mask[Elt + 1] & 1);
8588 EVT ResultVT =
Op.getValueType();
8591 const int NewSrcNumElts = 2;
8593 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
8609 const bool ShouldUseConsecutiveExtract = EltVT.
getSizeInBits() == 16;
8631 if (ShouldUseConsecutiveExtract &&
8634 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8635 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8647 if (Idx0 >= SrcNumElts) {
8652 if (Idx1 >= SrcNumElts) {
8657 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8658 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8666 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8667 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8672 if (SubVec0 != SubVec1) {
8673 NewMaskIdx1 += NewSrcNumElts;
8680 {NewMaskIdx0, NewMaskIdx1});
8685 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8686 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8687 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8688 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8707 EVT ResultVT =
Op.getValueType();
8723 EVT VT =
Op.getValueType();
8725 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8726 assert(!Subtarget->hasVOP3PInsts() &&
"this should be legal");
8735 return DAG.
getNode(ISD::BITCAST, SL, VT, ExtLo);
8744 return DAG.
getNode(ISD::BITCAST, SL, VT, ShlHi);
8751 return DAG.
getNode(ISD::BITCAST, SL, VT,
Or);
8760 for (
unsigned P = 0;
P < NumParts; ++
P) {
8762 PartVT, SL, {
Op.getOperand(
P * 2),
Op.getOperand(
P * 2 + 1)});
8768 return DAG.
getNode(ISD::BITCAST, SL, VT, Blend);
  if (!Subtarget->isAmdHsaOS())
  EVT PtrVT = Op.getValueType();
  const GlobalValue *GV = GSD->getGlobal();
    assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
  if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
    if (Subtarget->has64BitLiterals()) {
  MachinePointerInfo PtrInfo =
  SDValue Param = lowerKernargMemParameter(
      "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
      "intrinsic not supported on subtarget", DL.getDebugLoc()));
  unsigned NumElts = Elts.size();
  if (NumElts <= 12) {
  for (unsigned i = 0; i < Elts.size(); ++i) {
  for (unsigned i = Elts.size(); i < NumElts; ++i)
  EVT SrcVT = Src.getValueType();
                               bool Unpacked, bool IsD16, int DMaskPop,
                               int NumVDataDwords, bool IsAtomicPacked16Bit,
  EVT ReqRetVT = ResultTypes[0];
  int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
                          ? (ReqRetNumElts + 1) / 2
  int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
  if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
  if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
                       NumDataDwords - MaskPopDwords);
  EVT LegalReqRetVT = ReqRetVT;
  if (!Data.getValueType().isInteger())
        Data.getValueType().changeTypeToInteger(), Data);
  if (Result->getNumValues() == 1)
                         SDValue *LWE, bool &IsTexFail) {
                                      unsigned DimIdx, unsigned EndIdx,
                                      unsigned NumGradients) {
  for (unsigned I = DimIdx; I < EndIdx; I++) {
    if (((I + 1) >= EndIdx) ||
        ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
                                         I == DimIdx + NumGradients - 1))) {
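// lowerImage(): builds the MIMG operand list for an image intrinsic (dmask,
// packed or NSA vaddr operands, rsrc, sampler, and the unorm/tfe/lwe/a16/d16
// modifiers), then selects the opcode for the subtarget's MIMG encoding.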
9134 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9148 int NumVDataDwords = 0;
9149 bool AdjustRetType =
false;
9150 bool IsAtomicPacked16Bit =
false;
9153 const unsigned ArgOffset = WithChain ? 2 : 1;
9156 unsigned DMaskLanes = 0;
9158 if (BaseOpcode->Atomic) {
9159 VData =
Op.getOperand(2);
9161 IsAtomicPacked16Bit =
9162 (Intr->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9163 Intr->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
9166 if (BaseOpcode->AtomicX2) {
9173 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9174 DMask = Is64Bit ? 0xf : 0x3;
9175 NumVDataDwords = Is64Bit ? 4 : 2;
9177 DMask = Is64Bit ? 0x3 : 0x1;
9178 NumVDataDwords = Is64Bit ? 2 : 1;
9181 DMask =
Op->getConstantOperandVal(ArgOffset + Intr->
DMaskIndex);
9184 if (BaseOpcode->Store) {
9185 VData =
Op.getOperand(2);
9189 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9193 VData = handleD16VData(VData, DAG,
true);
9196 NumVDataDwords = (VData.
getValueType().getSizeInBits() + 31) / 32;
9197 }
else if (!BaseOpcode->NoReturn) {
9202 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9210 (!LoadVT.
isVector() && DMaskLanes > 1))
9216 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9217 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9218 NumVDataDwords = (DMaskLanes + 1) / 2;
9220 NumVDataDwords = DMaskLanes;
9222 AdjustRetType =
true;
9226 unsigned VAddrEnd = ArgOffset + Intr->
VAddrEnd;
9233 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9234 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9236 VAddrVT =
Op.getOperand(ArgOffset + Intr->
CoordStart).getSimpleValueType();
9238 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9239 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9243 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
9249 {
Op.getOperand(ArgOffset +
I), DAG.
getPOISON(MVT::f16)});
9253 "Bias needs to be converted to 16 bit in A16 mode");
9258 if (BaseOpcode->Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
9262 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
9263 "require 16 bit args for both gradients and addresses");
9268 if (!
ST->hasA16()) {
9269 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
9270 "support 16 bit addresses\n");
9280 if (BaseOpcode->Gradients && IsG16 &&
ST->hasG16()) {
9282 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9284 IntrOpcode = G16MappingInfo->
G16;
9307 for (
unsigned I = ArgOffset + Intr->
CoordStart;
I < VAddrEnd;
I++)
9325 const unsigned NSAMaxSize =
ST->getNSAMaxSize(BaseOpcode->Sampler);
9326 const bool HasPartialNSAEncoding =
ST->hasPartialNSAEncoding();
9327 const bool UseNSA =
ST->hasNSAEncoding() &&
9328 VAddrs.
size() >=
ST->getNSAThreshold(MF) &&
9329 (VAddrs.
size() <= NSAMaxSize || HasPartialNSAEncoding);
9330 const bool UsePartialNSA =
9331 UseNSA && HasPartialNSAEncoding && VAddrs.
size() > NSAMaxSize;
9334 if (UsePartialNSA) {
9336 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9337 }
else if (!UseNSA) {
9344 if (!BaseOpcode->Sampler) {
9347 uint64_t UnormConst =
9348 Op.getConstantOperandVal(ArgOffset + Intr->
UnormIndex);
9350 Unorm = UnormConst ? True : False;
9356 bool IsTexFail =
false;
9357 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9368 NumVDataDwords += 1;
9369 AdjustRetType =
true;
9374 if (AdjustRetType) {
9377 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9386 MVT::i32, NumVDataDwords)
9389 ResultTypes[0] = NewVT;
9390 if (ResultTypes.size() == 3) {
9394 ResultTypes.erase(&ResultTypes[1]);
9399 if (BaseOpcode->Atomic)
9406 if (BaseOpcode->Store || BaseOpcode->Atomic)
9407 Ops.push_back(VData);
9408 if (UsePartialNSA) {
9410 Ops.push_back(VAddr);
9414 Ops.push_back(VAddr);
9417 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9419 Ops.push_back(Rsrc);
9420 if (BaseOpcode->Sampler) {
9424 Ops.push_back(Samp);
9429 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9430 Ops.push_back(Unorm);
9432 Ops.push_back(IsA16 &&
9433 ST->hasFeature(AMDGPU::FeatureR128A16)
9437 Ops.push_back(IsA16 ? True : False);
9439 if (!Subtarget->hasGFX90AInsts())
9444 "TFE is not supported on this GPU",
DL.getDebugLoc()));
9447 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9450 Ops.push_back(DimInfo->
DA ? True : False);
9451 if (BaseOpcode->HasD16)
9452 Ops.push_back(IsD16 ? True : False);
9454 Ops.push_back(
Op.getOperand(0));
9456 int NumVAddrDwords =
9462 NumVDataDwords, NumVAddrDwords);
9463 }
else if (IsGFX11Plus) {
9465 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9466 : AMDGPU::MIMGEncGfx11Default,
9467 NumVDataDwords, NumVAddrDwords);
9468 }
else if (IsGFX10Plus) {
9470 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9471 : AMDGPU::MIMGEncGfx10Default,
9472 NumVDataDwords, NumVAddrDwords);
9474 if (Subtarget->hasGFX90AInsts()) {
9476 NumVDataDwords, NumVAddrDwords);
9480 "requested image instruction is not supported on this GPU",
9485 for (EVT VT : OrigResultTypes) {
9486 if (VT == MVT::Other)
9487 RetValues[Idx++] =
Op.getOperand(0);
9498 NumVDataDwords, NumVAddrDwords);
9501 NumVDataDwords, NumVAddrDwords);
9508 MachineMemOperand *MemRef = MemOp->getMemOperand();
9512 if (BaseOpcode->AtomicX2) {
9517 if (BaseOpcode->NoReturn)
9520 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9521 NumVDataDwords, IsAtomicPacked16Bit,
DL);
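// lowerSBuffer(): uniform offsets take the scalar s_buffer_load path, with
// wide results split into multiple loads; divergent offsets and sub-dword
// types fall back to VMEM buffer loads.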
        MachinePointerInfo(),
  if (!Offset->isDivergent()) {
    if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
        !Subtarget->hasScalarDwordx3Loads()) {
  if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
    return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
  unsigned NumLoads = 1;
  if (NumElts == 8 || NumElts == 16) {
    NumLoads = NumElts / 4;
  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
      NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
  uint64_t InstOffset = Ops[5]->getAsZExtVal();
  for (unsigned i = 0; i < NumLoads; ++i) {
  if (NumElts == 8 || NumElts == 16)
  if (!Subtarget->hasArchitectedSGPRs())
                                        unsigned Width) const {
  using namespace AMDGPU::Hwreg;
      AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
  auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  EVT VT = Op.getValueType();
  unsigned IntrinsicID = Op.getConstantOperandVal(0);
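// LowerINTRINSIC_WO_CHAIN(): dispatch on the intrinsic ID. Most cases either
// map an amdgcn.* intrinsic onto a target node or return a preloaded
// SGPR/implicit kernel argument value.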
9684 switch (IntrinsicID) {
9685 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9688 return getPreloadedValue(DAG, *MFI, VT,
9691 case Intrinsic::amdgcn_dispatch_ptr:
9692 case Intrinsic::amdgcn_queue_ptr: {
9693 if (!Subtarget->isAmdHsaOrMesa(MF.
getFunction())) {
9695 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
9700 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9703 return getPreloadedValue(DAG, *MFI, VT, RegID);
9705 case Intrinsic::amdgcn_implicitarg_ptr: {
9707 return getImplicitArgPtr(DAG,
DL);
9708 return getPreloadedValue(DAG, *MFI, VT,
9711 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9717 return getPreloadedValue(DAG, *MFI, VT,
9720 case Intrinsic::amdgcn_dispatch_id: {
9723 case Intrinsic::amdgcn_rcp:
9725 case Intrinsic::amdgcn_rsq:
9727 case Intrinsic::amdgcn_rsq_legacy:
9731 case Intrinsic::amdgcn_rcp_legacy:
9735 case Intrinsic::amdgcn_rsq_clamp: {
9746 return DAG.
getNode(ISD::FMAXNUM,
DL, VT, Tmp,
9749 case Intrinsic::r600_read_ngroups_x:
9750 if (Subtarget->isAmdHsaOS())
9753 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9756 case Intrinsic::r600_read_ngroups_y:
9757 if (Subtarget->isAmdHsaOS())
9760 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9763 case Intrinsic::r600_read_ngroups_z:
9764 if (Subtarget->isAmdHsaOS())
9767 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9770 case Intrinsic::r600_read_local_size_x:
9771 if (Subtarget->isAmdHsaOS())
9774 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9776 case Intrinsic::r600_read_local_size_y:
9777 if (Subtarget->isAmdHsaOS())
9780 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9782 case Intrinsic::r600_read_local_size_z:
9783 if (Subtarget->isAmdHsaOS())
9786 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9788 case Intrinsic::amdgcn_workgroup_id_x:
9789 return lowerWorkGroupId(DAG, *MFI, VT,
9793 case Intrinsic::amdgcn_workgroup_id_y:
9794 return lowerWorkGroupId(DAG, *MFI, VT,
9798 case Intrinsic::amdgcn_workgroup_id_z:
9799 return lowerWorkGroupId(DAG, *MFI, VT,
9803 case Intrinsic::amdgcn_cluster_id_x:
9804 return Subtarget->hasClusters()
9805 ? getPreloadedValue(DAG, *MFI, VT,
9807 : DAG.getPOISON(VT);
9808 case Intrinsic::amdgcn_cluster_id_y:
9809 return Subtarget->hasClusters()
9810 ? getPreloadedValue(DAG, *MFI, VT,
9813 case Intrinsic::amdgcn_cluster_id_z:
9814 return Subtarget->hasClusters()
9815 ? getPreloadedValue(DAG, *MFI, VT,
9818 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9819 return Subtarget->hasClusters()
9820 ? getPreloadedValue(
9824 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9825 return Subtarget->hasClusters()
9826 ? getPreloadedValue(
9830 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9831 return Subtarget->hasClusters()
9832 ? getPreloadedValue(
9836 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9837 return Subtarget->hasClusters()
9840 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9841 return Subtarget->hasClusters()
9842 ? getPreloadedValue(
9846 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9847 return Subtarget->hasClusters()
9848 ? getPreloadedValue(
9852 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9853 return Subtarget->hasClusters()
9854 ? getPreloadedValue(
9858 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
9859 return Subtarget->hasClusters()
9860 ? getPreloadedValue(
9864 case Intrinsic::amdgcn_wave_id:
9865 return lowerWaveID(DAG,
Op);
9866 case Intrinsic::amdgcn_lds_kernel_id: {
9868 return getLDSKernelId(DAG,
DL);
9869 return getPreloadedValue(DAG, *MFI, VT,
9872 case Intrinsic::amdgcn_workitem_id_x:
9873 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
9874 case Intrinsic::amdgcn_workitem_id_y:
9875 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
9876 case Intrinsic::amdgcn_workitem_id_z:
9877 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
9878 case Intrinsic::amdgcn_wavefrontsize:
9880 SDLoc(
Op), MVT::i32);
9881 case Intrinsic::amdgcn_s_buffer_load: {
9882 unsigned CPol =
Op.getConstantOperandVal(3);
9889 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
9890 Op.getOperand(3), DAG);
9892 case Intrinsic::amdgcn_fdiv_fast:
9893 return lowerFDIV_FAST(
Op, DAG);
9894 case Intrinsic::amdgcn_sin:
9897 case Intrinsic::amdgcn_cos:
9900 case Intrinsic::amdgcn_mul_u24:
9903 case Intrinsic::amdgcn_mul_i24:
9907 case Intrinsic::amdgcn_log_clamp: {
9913 case Intrinsic::amdgcn_fract:
9916 case Intrinsic::amdgcn_class:
9919 case Intrinsic::amdgcn_div_fmas:
9921 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
9923 case Intrinsic::amdgcn_div_fixup:
9925 Op.getOperand(2),
Op.getOperand(3));
9927 case Intrinsic::amdgcn_div_scale: {
9940 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
9943 Denominator, Numerator);
9945 case Intrinsic::amdgcn_icmp: {
9947 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
9948 Op.getConstantOperandVal(2) == 0 &&
9953 case Intrinsic::amdgcn_fcmp: {
9956 case Intrinsic::amdgcn_ballot:
9958 case Intrinsic::amdgcn_fmed3:
9960 Op.getOperand(2),
Op.getOperand(3));
9961 case Intrinsic::amdgcn_fdot2:
9963 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
9964 case Intrinsic::amdgcn_fmul_legacy:
9967 case Intrinsic::amdgcn_sffbh:
9969 case Intrinsic::amdgcn_sbfe:
9971 Op.getOperand(2),
Op.getOperand(3));
9972 case Intrinsic::amdgcn_ubfe:
9974 Op.getOperand(2),
Op.getOperand(3));
9975 case Intrinsic::amdgcn_cvt_pkrtz:
9976 case Intrinsic::amdgcn_cvt_pknorm_i16:
9977 case Intrinsic::amdgcn_cvt_pknorm_u16:
9978 case Intrinsic::amdgcn_cvt_pk_i16:
9979 case Intrinsic::amdgcn_cvt_pk_u16: {
9981 EVT VT =
Op.getValueType();
9984 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
9986 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
9988 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
9990 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
9996 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
9999 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
10000 return DAG.
getNode(ISD::BITCAST,
DL, VT, Node);
10002 case Intrinsic::amdgcn_fmad_ftz:
10004 Op.getOperand(2),
Op.getOperand(3));
10006 case Intrinsic::amdgcn_if_break:
10008 Op->getOperand(1),
Op->getOperand(2)),
10011 case Intrinsic::amdgcn_groupstaticsize: {
10017 const GlobalValue *GV =
10023 case Intrinsic::amdgcn_is_shared:
10024 case Intrinsic::amdgcn_is_private: {
10027 DAG.
getNode(ISD::BITCAST,
DL, MVT::v2i32,
Op.getOperand(1));
10031 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10035 Subtarget->hasGloballyAddressableScratch()) {
10038 AMDGPU::S_MOV_B32,
DL, MVT::i32,
10039 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10048 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10051 case Intrinsic::amdgcn_perm:
10053 Op.getOperand(2),
Op.getOperand(3));
10054 case Intrinsic::amdgcn_reloc_constant: {
10064 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10065 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10066 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10067 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10068 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10069 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10070 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10071 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10072 if (
Op.getOperand(4).getValueType() == MVT::i32)
10078 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10079 Op.getOperand(3), IndexKeyi32);
10081 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10082 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10083 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10084 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10085 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10086 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10087 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10088 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10089 if (
Op.getOperand(4).getValueType() == MVT::i64)
10095 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10096 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10097 Op.getOperand(6)});
10099 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10100 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10101 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10102 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10103 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10104 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10105 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10108 if (
Op.getOperand(6).getValueType() == IndexKeyTy)
10114 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10115 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10116 IndexKey, Op.getOperand(7),
10117 Op.getOperand(8)});
10119 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10120 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10121 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10122 if (
Op.getOperand(6).getValueType() == MVT::i32)
10128 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10129 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10130 IndexKeyi32, Op.getOperand(7)});
10132 case Intrinsic::amdgcn_addrspacecast_nonnull:
10133 return lowerADDRSPACECAST(
Op, DAG);
10134 case Intrinsic::amdgcn_readlane:
10135 case Intrinsic::amdgcn_readfirstlane:
10136 case Intrinsic::amdgcn_writelane:
10137 case Intrinsic::amdgcn_permlane16:
10138 case Intrinsic::amdgcn_permlanex16:
10139 case Intrinsic::amdgcn_permlane64:
10140 case Intrinsic::amdgcn_set_inactive:
10141 case Intrinsic::amdgcn_set_inactive_chain_arg:
10142 case Intrinsic::amdgcn_mov_dpp8:
10143 case Intrinsic::amdgcn_update_dpp:
10145 case Intrinsic::amdgcn_dead: {
10147 for (
const EVT ValTy :
Op.getNode()->values())
10152 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10154 return lowerImage(
Op, ImageDimIntr, DAG,
false);
10165 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10171 unsigned NewOpcode)
const {
10175 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10176 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10194 M->getMemOperand());
10199 unsigned NewOpcode)
const {
10203 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10204 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10222 M->getMemOperand());
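// LowerINTRINSIC_W_CHAIN(): chained intrinsics, primarily the raw/struct
// buffer loads and atomics, ds_ordered_count, the BVH intersect_ray
// intrinsics, and barrier state queries.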
10227 unsigned IntrID =
Op.getConstantOperandVal(1);
10231 case Intrinsic::amdgcn_ds_ordered_add:
10232 case Intrinsic::amdgcn_ds_ordered_swap: {
10237 unsigned IndexOperand =
M->getConstantOperandVal(7);
10238 unsigned WaveRelease =
M->getConstantOperandVal(8);
10239 unsigned WaveDone =
M->getConstantOperandVal(9);
10241 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10242 IndexOperand &= ~0x3f;
10243 unsigned CountDw = 0;
10246 CountDw = (IndexOperand >> 24) & 0xf;
10247 IndexOperand &= ~(0xf << 24);
10249 if (CountDw < 1 || CountDw > 4) {
10252 Fn,
"ds_ordered_count: dword count must be between 1 and 4",
10253 DL.getDebugLoc()));
10258 if (IndexOperand) {
10261 Fn,
"ds_ordered_count: bad index operand",
DL.getDebugLoc()));
10264 if (WaveDone && !WaveRelease) {
10268 Fn,
"ds_ordered_count: wave_done requires wave_release",
10269 DL.getDebugLoc()));
10272 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10273 unsigned ShaderType =
10275 unsigned Offset0 = OrderedCountIndex << 2;
10276 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10279 Offset1 |= (CountDw - 1) << 6;
10282 Offset1 |= ShaderType << 2;
10284 unsigned Offset = Offset0 | (Offset1 << 8);
10291 M->getVTList(),
Ops,
M->getMemoryVT(),
10292 M->getMemOperand());
10294 case Intrinsic::amdgcn_raw_buffer_load:
10295 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10296 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10297 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10298 case Intrinsic::amdgcn_raw_buffer_load_format:
10299 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10300 const bool IsFormat =
10301 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10302 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10304 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10305 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10319 return lowerIntrinsicLoad(M, IsFormat, DAG,
Ops);
10321 case Intrinsic::amdgcn_struct_buffer_load:
10322 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10323 case Intrinsic::amdgcn_struct_buffer_load_format:
10324 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10325 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10326 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10327 const bool IsFormat =
10328 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10329 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10331 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10332 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10347 case Intrinsic::amdgcn_raw_tbuffer_load:
10348 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10350 EVT LoadVT =
Op.getValueType();
10351 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10352 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10371 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10374 case Intrinsic::amdgcn_struct_tbuffer_load:
10375 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10377 EVT LoadVT =
Op.getValueType();
10378 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10379 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10398 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10401 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10402 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10404 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10405 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10406 return lowerStructBufferAtomicIntrin(
Op, DAG,
10408 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10409 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10411 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10412 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10413 return lowerStructBufferAtomicIntrin(
Op, DAG,
10415 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10416 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10418 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10419 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10420 return lowerStructBufferAtomicIntrin(
Op, DAG,
10422 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10423 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10425 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10426 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10428 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10429 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10431 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10432 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10434 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10435 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10437 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10438 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10440 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10441 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10443 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10444 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10446 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10447 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10449 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10450 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10452 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10453 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10455 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10456 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10458 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10459 return lowerRawBufferAtomicIntrin(
Op, DAG,
10461 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10462 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10463 return lowerStructBufferAtomicIntrin(
Op, DAG,
10465 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10466 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10468 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10469 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10471 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10472 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10473 return lowerStructBufferAtomicIntrin(
Op, DAG,
10475 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10476 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10477 return lowerStructBufferAtomicIntrin(
Op, DAG,
10479 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10480 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10481 return lowerStructBufferAtomicIntrin(
Op, DAG,
10483 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10484 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10485 return lowerStructBufferAtomicIntrin(
Op, DAG,
10487 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10488 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10490 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10491 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10493 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10494 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10496 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10497 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10499 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10500 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10502 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10503 return lowerStructBufferAtomicIntrin(
Op, DAG,
10506 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10507 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10508 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
10509 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10523 EVT VT =
Op.getValueType();
10527 Op->getVTList(),
Ops, VT,
10528 M->getMemOperand());
10530 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10531 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10532 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
10533 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(6), DAG);
10547 EVT VT =
Op.getValueType();
10551 Op->getVTList(),
Ops, VT,
10552 M->getMemOperand());
10554 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10555 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10557 SDValue NodePtr =
M->getOperand(2);
10558 SDValue RayExtent =
M->getOperand(3);
10559 SDValue InstanceMask =
M->getOperand(4);
10560 SDValue RayOrigin =
M->getOperand(5);
10561 SDValue RayDir =
M->getOperand(6);
10563 SDValue TDescr =
M->getOperand(8);
10568 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10573 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10574 const unsigned NumVDataDwords = 10;
10575 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10577 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10578 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10579 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10583 Ops.push_back(NodePtr);
10586 {DAG.getBitcast(MVT::i32, RayExtent),
10587 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10588 Ops.push_back(RayOrigin);
10589 Ops.push_back(RayDir);
10590 Ops.push_back(Offsets);
10591 Ops.push_back(TDescr);
10592 Ops.push_back(
M->getChain());
10595 MachineMemOperand *MemRef =
M->getMemOperand();
10599 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10601 SDValue NodePtr =
M->getOperand(2);
10602 SDValue RayExtent =
M->getOperand(3);
10603 SDValue RayOrigin =
M->getOperand(4);
10604 SDValue RayDir =
M->getOperand(5);
10605 SDValue RayInvDir =
M->getOperand(6);
10606 SDValue TDescr =
M->getOperand(7);
10613 if (!Subtarget->hasGFX10_AEncoding()) {
10623 const unsigned NumVDataDwords = 4;
10624 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10625 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10626 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10629 const unsigned BaseOpcodes[2][2] = {
10630 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10631 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10632 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10636 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10637 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10638 : AMDGPU::MIMGEncGfx10NSA,
10639 NumVDataDwords, NumVAddrDwords);
10643 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10644 : AMDGPU::MIMGEncGfx10Default,
10645 NumVDataDwords, NumVAddrDwords);
10651 auto packLanes = [&DAG, &
Ops, &
DL](
SDValue Op,
bool IsAligned) {
10654 if (Lanes[0].getValueSizeInBits() == 32) {
10655 for (
unsigned I = 0;
I < 3; ++
I)
10662 Ops.push_back(Lanes[2]);
10674 if (UseNSA && IsGFX11Plus) {
10675 Ops.push_back(NodePtr);
10677 Ops.push_back(RayOrigin);
10682 for (
unsigned I = 0;
I < 3; ++
I) {
10685 {DirLanes[I], InvDirLanes[I]})));
10689 Ops.push_back(RayDir);
10690 Ops.push_back(RayInvDir);
10697 Ops.push_back(NodePtr);
10700 packLanes(RayOrigin,
true);
10701 packLanes(RayDir,
true);
10702 packLanes(RayInvDir,
false);
10707 if (NumVAddrDwords > 12) {
10709 Ops.append(16 -
Ops.size(), Undef);
10715 Ops.push_back(MergedOps);
10718 Ops.push_back(TDescr);
10720 Ops.push_back(
M->getChain());
10723 MachineMemOperand *MemRef =
M->getMemOperand();
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
    // ...
    unsigned Opcode = 0;
    switch (IntrID) {
    case Intrinsic::amdgcn_global_atomic_fmin_num:
    case Intrinsic::amdgcn_flat_atomic_fmin_num: {
      Opcode = ISD::ATOMIC_LOAD_FMIN;
      break;
    }
    case Intrinsic::amdgcn_global_atomic_fmax_num:
    case Intrinsic::amdgcn_flat_atomic_fmax_num: {
      Opcode = ISD::ATOMIC_LOAD_FMAX;
      break;
    }
    }
    return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
                         Ops, M->getMemOperand());
  }
  case Intrinsic::amdgcn_s_get_barrier_state:
  case Intrinsic::amdgcn_s_get_named_barrier_state: {
    // ...
    if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
      BarID = (BarID >> 4) & 0x3F;
    Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
    // ...
    Ops.push_back(Chain);
    // ...
    Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
    if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
      // ...
    }
    Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
    // ...
  }
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
    // ...
    EVT VT = Op->getValueType(0);
    // ...
  }
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return lowerImage(Op, ImageDimIntr, DAG, true);

    return SDValue();
  }
}
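// Builds a target memory intrinsic node whose result is widened to a legal
// dword-multiple type (plus an extra status dword for TFE) and then trimmed
// back to the requested value type.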
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
                                              SDVTList VTList,
                                              ArrayRef<SDValue> Ops, EVT MemVT,
                                              MachineMemOperand *MMO,
                                              SelectionDAG &DAG) const {
  // ...
  EVT VT = VTList.VTs[0];
  // ...
  bool IsTFE = VTList.NumVTs == 3;
  if (IsTFE) {
    unsigned NumOpDWords = NumValueDWords + 1;
    // ...
    SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
    MachineMemOperand *OpDWordsMMO = /* ... */;
    SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
                                     OpDWordsVT, OpDWordsMMO, DAG);
    // ... (NumValueDWords == 1 ? extract the single data dword
    //                          : extract a subvector of data dwords)
  }

  if (!Subtarget->hasDwordx3LoadStores() &&
      (VT == MVT::v3i32 || VT == MVT::v3f32)) {
    // ...
    SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
    SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
                                         WidenedMemVT, WidenedMMO);
    // ...
  }
  // ...
}
10859 bool ImageStore)
const {
10869 if (Subtarget->hasUnpackedD16VMem()) {
10883 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10894 for (
unsigned I = 0;
I < Elts.
size() / 2;
I += 1) {
10900 if ((NumElements % 2) == 1) {
10902 unsigned I = Elts.
size() / 2;
10918 if (NumElements == 3) {
10928 return DAG.
getNode(ISD::BITCAST,
DL, WidenedStoreVT, ZExt);
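// Lowering for side-effecting intrinsics (INTRINSIC_VOID): exports, buffer and
// tbuffer stores, LDS loads and the various barrier intrinsics.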
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                              SelectionDAG &DAG) const {
  unsigned IntrinsicID = Op.getConstantOperandVal(1);
  // ...
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp_compr: {
    if (!Subtarget->hasCompressedExport()) {
      DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
          DAG.getMachineFunction().getFunction(),
          "intrinsic not supported on subtarget", DL.getDebugLoc()));
    }
    // ...
    SDValue Ops[] = {
        // ...
        DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
        DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
        // ...
    };

    unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
    // ...
  }
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
    // ...
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
    // ...
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
    // ...
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
    // ...
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
    // ...
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    // ...
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
    // ...
    if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
      return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
    // ...
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    // ...
    auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
    // ...
    EVT VDataType = VData.getValueType().getScalarType();
    if (!IsD16 && !VDataVT.isVector() && VDataType.getSizeInBits() < 32)
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
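  // Buffer-to-LDS loads are selected here straight to MUBUF LDS opcodes; the
  // opcode ladder below picks the BOTHEN/IDXEN/OFFEN/OFFSET form from the
  // presence of vindex/voffset operands.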
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    if (!Subtarget->hasVMemToLDSLoad())
      return SDValue();
    // ...
    bool HasVIndex =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
    unsigned OpOffset = HasVIndex ? 1 : 0;
    SDValue VOffset = Op.getOperand(5 + OpOffset);
    // ...
    unsigned Size = Op->getConstantOperandVal(4);
    switch (Size) {
    // ...
    case 1:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
      break;
    case 2:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
      break;
    case 4:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
      break;
    case 12:
      if (!Subtarget->hasLDSLoadB96_B128())
        return SDValue();
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
      break;
    case 16:
      if (!Subtarget->hasLDSLoadB96_B128())
        return SDValue();
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
      break;
    }
    // ...
    if (HasVIndex && HasVOffset)
      Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
                                       {Op.getOperand(5), VOffset}));
    else if (HasVIndex)
      Ops.push_back(Op.getOperand(5));
    else if (HasVOffset)
      Ops.push_back(VOffset);

    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    Ops.push_back(Rsrc);
    Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
    Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
    unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
    // ...
    MachineMemOperand *LoadMMO = M->getMemOperand();
    // ...
    MachinePointerInfo StorePtrI = LoadPtrI;
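  // Global/flat loads that target LDS: pick the GLOBAL_LOAD_LDS_* opcode from
  // the transfer size and try to split the address into an SGPR base plus a
  // 32-bit VGPR offset.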
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds: {
    if (!Subtarget->hasVMemToLDSLoad())
      return SDValue();
    // ...
    unsigned Size = Op->getConstantOperandVal(4);
    switch (Size) {
    // ...
    case 1:
      Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
      break;
    case 4:
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
      break;
    case 12:
      if (!Subtarget->hasLDSLoadB96_B128())
        return SDValue();
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
      break;
    case 16:
      if (!Subtarget->hasLDSLoadB96_B128())
        return SDValue();
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
      break;
    }
    // ...
    if (LHS->isDivergent()) {
      // ...
    }
    if (/* ... && */ RHS.getOperand(0).getValueType() == MVT::i32) {
      // ...
      VOffset = RHS.getOperand(0);
    }
    // ...
    Ops.push_back(Addr);
    // ...
    Ops.push_back(VOffset);
    Ops.push_back(Op.getOperand(5)); // Offset
    Ops.push_back(Op.getOperand(6)); // CPol
    // ...
    MachineMemOperand *LoadMMO = M->getMemOperand();
    // ...
    LoadPtrI.Offset = Op->getConstantOperandVal(5);
    MachinePointerInfo StorePtrI = LoadPtrI;
    // ...
  }
11337 case Intrinsic::amdgcn_end_cf:
11339 Op->getOperand(2), Chain),
11341 case Intrinsic::amdgcn_s_barrier_init:
11342 case Intrinsic::amdgcn_s_barrier_signal_var: {
11349 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11350 ? AMDGPU::S_BARRIER_INIT_M0
11351 : AMDGPU::S_BARRIER_SIGNAL_M0;
11366 constexpr unsigned ShAmt = 16;
11373 Ops.push_back(
copyToM0(DAG, Chain,
DL, M0Val).getValue(0));
11378 case Intrinsic::amdgcn_s_barrier_join: {
11387 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11390 unsigned BarID = (BarVal >> 4) & 0x3F;
11393 Ops.push_back(Chain);
11395 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11405 Ops.push_back(
copyToM0(DAG, Chain,
DL, M0Val).getValue(0));
11411 case Intrinsic::amdgcn_s_prefetch_data: {
11414 return Op.getOperand(0);
11417 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11419 Chain, bufferRsrcPtrToVector(
Op.getOperand(2), DAG),
11426 Op->getVTList(),
Ops,
M->getMemoryVT(),
11427 M->getMemOperand());
11429 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11430 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11431 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11440 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11442 return lowerImage(
Op, ImageDimIntr, DAG,
true);
11458 return PtrVT == MVT::i64;
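// Buffer offset handling: a combined byte offset is split into the parts the
// MUBUF encoding can represent, a limited-width immediate field plus an SGPR
// soffset. As an illustrative example only: a combined offset of 4100 would
// typically end up as imm offset 4 with soffset 4096 so the immediate stays in
// range.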
std::pair<SDValue, SDValue>
SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
  // ...
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    // ...
  }
  // ...
  auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
  // ...
}

void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        SelectionDAG &DAG, SDValue *Offsets,
                                        Align Alignment) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  SDLoc DL(CombinedOffset);
  if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
    uint32_t Imm = C->getZExtValue();
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
      // ...
    }
  }
  // ...
  {
    uint32_t SOffset, ImmOffset;
    if (/* ... && */
        TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
      // ...
    }
  }
  SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
                            ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
                            : DAG.getConstant(0, DL, MVT::i32);
11567SDValue SITargetLowering::bufferRsrcPtrToVector(
SDValue MaybePointer,
11570 return MaybePointer;
11584 SDValue NumRecords =
Op->getOperand(3);
11590 if (Subtarget->has45BitNumRecordsBufferResource()) {
11609 SDValue ExtShiftedStrideVec =
11612 DAG.
getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
11619 DAG.
getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
11621 DAG.
getNode(
ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
11623 DAG.
getNode(
ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
11628 auto [LowHalf, HighHalf] =
11629 DAG.
SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11639 NumRecords, Flags);
11642 SDValue RsrcPtr = DAG.
getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
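// Sub-dword buffer accesses: byte and short buffer loads are emitted through
// 32-bit buffer load nodes and truncated or bitcast back to the requested
// type; the corresponding stores extend the data to 32 bits first.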
11651 bool IsTFE)
const {
11660 SDVTList VTs = DAG.
getVTList(MVT::v2i32, MVT::Other);
11675 SDVTList ResList = DAG.
getVTList(MVT::i32, MVT::Other);
11679 LoadVal = DAG.
getNode(ISD::BITCAST,
DL, LoadVT, LoadVal);
11689 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11693 Ops[1] = BufferStoreExt;
11698 M->getMemOperand());
11723 DAGCombinerInfo &DCI)
const {
11724 SelectionDAG &DAG = DCI.DAG;
11739 if ((MemVT.
isSimple() && !DCI.isAfterLegalizeDAG()) ||
11746 "unexpected vector extload");
11759 "unexpected fp extload");
11777 DCI.AddToWorklist(Cvt.
getNode());
11782 DCI.AddToWorklist(Cvt.
getNode());
11785 Cvt = DAG.
getNode(ISD::BITCAST, SL, VT, Cvt);
11793 if (
Info.isEntryFunction())
11794 return Info.getUserSGPRInfo().hasFlatScratchInit();
11802 EVT MemVT =
Load->getMemoryVT();
11803 MachineMemOperand *MMO =
Load->getMemOperand();
11815 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11843 assert(
Op.getValueType().getVectorElementType() == MVT::i32 &&
11844 "Custom lowering for non-i32 vectors hasn't been implemented.");
11847 unsigned AS =
Load->getAddressSpace();
11854 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
11858 !Subtarget->hasMultiDwordFlatScratchAddressing())
11868 Subtarget->getScalarizeGlobalBehavior() &&
Load->isSimple() &&
11871 Alignment >=
Align(4) && NumElements < 32) {
11873 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11885 if (NumElements > 4)
11888 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11898 switch (Subtarget->getMaxPrivateElementSize()) {
11904 if (NumElements > 2)
11909 if (NumElements > 4)
11912 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11921 auto Flags =
Load->getMemOperand()->getFlags();
11923 Load->getAlign(), Flags, &
Fast) &&
11932 MemVT, *
Load->getMemOperand())) {
11941 EVT VT =
Op.getValueType();
11968 return DAG.
getNode(ISD::BITCAST,
DL, VT, Res);
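// Fast (approximate) FDIV paths: with approximate/arcp fast-math flags a
// divide can be folded to v_rcp_*-based sequences; exact 1.0 and -1.0
// numerators become a plain (possibly negated) reciprocal.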
11978 EVT VT =
Op.getValueType();
11979 const SDNodeFlags
Flags =
Op->getFlags();
11981 bool AllowInaccurateRcp =
Flags.hasApproximateFuncs();
11987 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11990 if (CLHS->isExactlyValue(1.0)) {
12007 if (CLHS->isExactlyValue(-1.0)) {
12016 if (!AllowInaccurateRcp &&
12017 ((VT != MVT::f16 && VT != MVT::bf16) || !
Flags.hasAllowReciprocal()))
12031 EVT VT =
Op.getValueType();
12032 const SDNodeFlags
Flags =
Op->getFlags();
12034 bool AllowInaccurateDiv =
Flags.hasApproximateFuncs();
12035 if (!AllowInaccurateDiv)
12056 return DAG.
getNode(Opcode, SL, VT,
A,
B, Flags);
12070 return DAG.
getNode(Opcode, SL, VTList,
12079 return DAG.
getNode(Opcode, SL, VT, {
A,
B,
C}, Flags);
12093 return DAG.
getNode(Opcode, SL, VTList,
12099 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
12100 return FastLowered;
12103 EVT VT =
Op.getValueType();
12110 if (VT == MVT::bf16) {
12133 unsigned FMADOpCode =
12135 SDValue NegRHSExt = DAG.
getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12140 SDValue Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12142 Quot = DAG.
getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot,
Op->getFlags());
12143 Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12149 Tmp = DAG.
getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12159 SDNodeFlags
Flags =
Op->getFlags();
12166 const APFloat K0Val(0x1p+96f);
12169 const APFloat K1Val(0x1p-32f);
12196 assert(ST->hasDenormModeInst() &&
"Requires S_DENORM_MODE");
12197 uint32_t DPDenormModeDefault =
Info->getMode().fpDenormModeDPValue();
12198 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
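// Full-precision f32 division: v_div_scale / v_rcp / iterative FMA refinement
// / v_div_fmas / v_div_fixup, with the FP32 denormal mode enabled around the
// core sequence when the function's mode would otherwise flush denormals.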
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  // ...
  SDNodeFlags Flags = Op->getFlags();
  Flags.setNoFPExcept(true);
12219 SDVTList ScaleVT = DAG.
getVTList(MVT::f32, MVT::i1);
12230 DAG.
getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12232 using namespace AMDGPU::Hwreg;
12233 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12237 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
12238 const DenormalMode DenormMode =
Info->getMode().FP32Denormals;
12241 const bool HasDynamicDenormals =
12247 if (!PreservesDenormals) {
12252 SDVTList BindParamVTs = DAG.
getVTList(MVT::Other, MVT::Glue);
12255 if (HasDynamicDenormals) {
12259 SavedDenormMode =
SDValue(GetReg, 0);
12265 SDNode *EnableDenorm;
12266 if (Subtarget->hasDenormModeInst()) {
12267 const SDValue EnableDenormValue =
12274 const SDValue EnableDenormValue =
12276 EnableDenorm = DAG.
getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12277 {EnableDenormValue,
BitField, Glue});
12287 ApproxRcp, One, NegDivScale0, Flags);
12290 ApproxRcp, Fma0, Flags);
12296 NumeratorScaled,
Mul, Flags);
12302 NumeratorScaled, Fma3, Flags);
12304 if (!PreservesDenormals) {
12305 SDNode *DisableDenorm;
12306 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12310 SDVTList BindParamVTs = DAG.
getVTList(MVT::Other, MVT::Glue);
12316 assert(HasDynamicDenormals == (
bool)SavedDenormMode);
12317 const SDValue DisableDenormValue =
12318 HasDynamicDenormals
12323 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12334 {Fma4, Fma1, Fma3, Scale},
Flags);
  if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
    return FastLowered;
  // ...
  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
  // ...
  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
  // ...
  if (!Subtarget->hasUsableDivScaleConditionOutput()) {
    // ...
    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
    // ...
  }
  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFDIV32(Op, DAG);

  if (VT == MVT::f64)
    return LowerFDIV64(Op, DAG);

  if (VT == MVT::f16 || VT == MVT::bf16)
    return LowerFDIV16(Op, DAG);
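// FFREXP is lowered to the frexp_mant / frexp_exp instruction pair; the
// exponent result is produced in i16 form for f16 inputs and i32 otherwise.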
12427 EVT ResultExpVT =
Op->getValueType(1);
12428 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12438 if (Subtarget->hasFractBug()) {
12456 EVT VT =
Store->getMemoryVT();
12458 if (VT == MVT::i1) {
12462 Store->getBasePtr(), MVT::i1,
Store->getMemOperand());
12466 Store->getValue().getValueType().getScalarType() == MVT::i32);
12468 unsigned AS =
Store->getAddressSpace();
12476 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
12480 !Subtarget->hasMultiDwordFlatScratchAddressing())
12487 if (NumElements > 4)
12490 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12494 VT, *
Store->getMemOperand()))
12500 switch (Subtarget->getMaxPrivateElementSize()) {
12504 if (NumElements > 2)
12508 if (NumElements > 4 ||
12509 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12517 auto Flags =
Store->getMemOperand()->getFlags();
12536 assert(!Subtarget->has16BitInsts());
12537 SDNodeFlags
Flags =
Op->getFlags();
12539 DAG.
getNode(ISD::FP_EXTEND, SL, MVT::f32,
Op.getOperand(0), Flags);
12551 SDNodeFlags
Flags =
Op->getFlags();
12552 MVT VT =
Op.getValueType().getSimpleVT();
12582 SDValue SqrtSNextDown = DAG.
getNode(ISD::BITCAST,
DL, VT, SqrtSNextDownInt);
12585 DAG.
getNode(ISD::FNEG,
DL, VT, SqrtSNextDown, Flags);
12594 SDValue NegSqrtSNextUp = DAG.
getNode(ISD::FNEG,
DL, VT, SqrtSNextUp, Flags);
12660 SDNodeFlags
Flags =
Op->getFlags();
12706 SqrtRet = DAG.
getNode(ISD::FLDEXP,
DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12723 EVT VT =
Op.getValueType();
12733 if (Subtarget->hasTrigReducedRange()) {
12740 switch (
Op.getOpcode()) {
12767 EVT VT =
Op.getValueType();
12775 Op->getVTList(),
Ops, VT,
12784SITargetLowering::performUCharToFloatCombine(
SDNode *
N,
12785 DAGCombinerInfo &DCI)
const {
12786 EVT VT =
N->getValueType(0);
12788 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12791 SelectionDAG &DAG = DCI.DAG;
12795 EVT SrcVT = Src.getValueType();
12801 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12804 DCI.AddToWorklist(Cvt.
getNode());
12807 if (ScalarVT != MVT::f32) {
12819 DAGCombinerInfo &DCI)
const {
12826 if (SignOp.
getOpcode() == ISD::FP_EXTEND ||
12830 SelectionDAG &DAG = DCI.DAG;
12849 for (
unsigned I = 0;
I != NumElts; ++
I) {
12873 if (NewElts.
size() == 1)
12895 for (
unsigned I = 0;
I != NumElts; ++
I) {
12930SDValue SITargetLowering::performSHLPtrCombine(
SDNode *
N,
unsigned AddrSpace,
12932 DAGCombinerInfo &DCI)
const {
12949 SelectionDAG &DAG = DCI.DAG;
12962 AM.BaseOffs =
Offset.getSExtValue();
12967 EVT VT =
N->getValueType(0);
12973 Flags.setNoUnsignedWrap(
12974 N->getFlags().hasNoUnsignedWrap() &&
12986 switch (
N->getOpcode()) {
12997 DAGCombinerInfo &DCI)
const {
12998 SelectionDAG &DAG = DCI.DAG;
13005 SDValue NewPtr = performSHLPtrCombine(
Ptr.getNode(),
N->getAddressSpace(),
13006 N->getMemoryVT(), DCI);
13010 NewOps[PtrIdx] = NewPtr;
13019 return (
Opc ==
ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13020 (
Opc ==
ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13029SDValue SITargetLowering::splitBinaryBitConstantOp(
13033 uint32_t ValLo =
Lo_32(Val);
13034 uint32_t ValHi =
Hi_32(Val);
13041 if (Subtarget->has64BitLiterals() && CRHS->
hasOneUse() &&
13055 if (V.getValueType() != MVT::i1)
13057 switch (V.getOpcode()) {
13074 return V.getResNo() == 1;
13076 unsigned IntrinsicID = V.getConstantOperandVal(0);
13077 switch (IntrinsicID) {
13078 case Intrinsic::amdgcn_is_shared:
13079 case Intrinsic::amdgcn_is_private:
13096 if (!(
C & 0x000000ff))
13097 ZeroByteMask |= 0x000000ff;
13098 if (!(
C & 0x0000ff00))
13099 ZeroByteMask |= 0x0000ff00;
13100 if (!(
C & 0x00ff0000))
13101 ZeroByteMask |= 0x00ff0000;
13102 if (!(
C & 0xff000000))
13103 ZeroByteMask |= 0xff000000;
13104 uint32_t NonZeroByteMask = ~ZeroByteMask;
13105 if ((NonZeroByteMask &
C) != NonZeroByteMask)
13118 assert(V.getValueSizeInBits() == 32);
13120 if (V.getNumOperands() != 2)
13129 switch (V.getOpcode()) {
13134 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13139 return (0x03020100 & ~ConstMask) | ConstMask;
13146 return uint32_t((0x030201000c0c0c0cull <<
C) >> 32);
13152 return uint32_t(0x0c0c0c0c03020100ull >>
C);
SDValue SITargetLowering::performAndCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  // ...
  if (VT == MVT::i64 && CRHS) {
    if (SDValue Split =
            splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
      return Split;
  }

  if (CRHS && VT == MVT::i32) {
    // ...
    unsigned Shift = CShift->getZExtValue();
    // ...
    unsigned Offset = NB + Shift;
    if ((Offset & (Bits - 1)) == 0) {
      // ...
    }
    // ...
    Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13227 if (
Y.getOpcode() != ISD::FABS ||
Y.getOperand(0) !=
X ||
13232 if (
X !=
LHS.getOperand(1))
13236 const ConstantFPSDNode *C1 =
13270 (
RHS.getOperand(0) ==
LHS.getOperand(0) &&
13271 LHS.getOperand(0) ==
LHS.getOperand(1))) {
13273 unsigned NewMask = LCC ==
ISD::SETO ?
Mask->getZExtValue() & ~OrdMask
13274 :
Mask->getZExtValue() & OrdMask;
13295 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13298 if (LHSMask != ~0u && RHSMask != ~0u) {
13301 if (LHSMask > RHSMask) {
13308 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13309 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13312 if (!(LHSUsedLanes & RHSUsedLanes) &&
13315 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13321 uint32_t
Mask = LHSMask & RHSMask;
13322 for (
unsigned I = 0;
I < 32;
I += 8) {
13323 uint32_t ByteSel = 0xff <<
I;
13324 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13325 Mask &= (0x0c <<
I) & 0xffffffff;
13330 uint32_t Sel =
Mask | (LHSUsedLanes & 0x04040404);
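// Byte-provider analysis used by the v_perm_b32 combines: walk the DAG to
// find, for each byte of a 32-bit value, which source value and byte index it
// ultimately comes from.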
13383static const std::optional<ByteProvider<SDValue>>
13385 unsigned Depth = 0) {
13388 return std::nullopt;
13390 if (
Op.getValueSizeInBits() < 8)
13391 return std::nullopt;
13393 if (
Op.getValueType().isVector())
13396 switch (
Op->getOpcode()) {
13408 NarrowVT = VTSign->getVT();
13411 return std::nullopt;
13414 if (SrcIndex >= NarrowByteWidth)
13415 return std::nullopt;
13423 return std::nullopt;
13425 uint64_t BitShift = ShiftOp->getZExtValue();
13427 if (BitShift % 8 != 0)
13428 return std::nullopt;
13430 SrcIndex += BitShift / 8;
13448static const std::optional<ByteProvider<SDValue>>
13450 unsigned StartingIndex = 0) {
13454 return std::nullopt;
13456 unsigned BitWidth =
Op.getScalarValueSizeInBits();
13458 return std::nullopt;
13460 return std::nullopt;
13462 bool IsVec =
Op.getValueType().isVector();
13463 switch (
Op.getOpcode()) {
13466 return std::nullopt;
13471 return std::nullopt;
13475 return std::nullopt;
13478 if (!
LHS->isConstantZero() && !
RHS->isConstantZero())
13479 return std::nullopt;
13480 if (!
LHS ||
LHS->isConstantZero())
13482 if (!
RHS ||
RHS->isConstantZero())
13484 return std::nullopt;
13489 return std::nullopt;
13493 return std::nullopt;
13495 uint32_t BitMask = BitMaskOp->getZExtValue();
13497 uint32_t IndexMask = 0xFF << (Index * 8);
13499 if ((IndexMask & BitMask) != IndexMask) {
13502 if (IndexMask & BitMask)
13503 return std::nullopt;
13512 return std::nullopt;
13516 if (!ShiftOp ||
Op.getValueType().isVector())
13517 return std::nullopt;
13519 uint64_t BitsProvided =
Op.getValueSizeInBits();
13520 if (BitsProvided % 8 != 0)
13521 return std::nullopt;
13523 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13525 return std::nullopt;
13527 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13528 uint64_t ByteShift = BitShift / 8;
13530 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13531 uint64_t BytesProvided = BitsProvided / 8;
13532 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13533 NewIndex %= BytesProvided;
13540 return std::nullopt;
13544 return std::nullopt;
13546 uint64_t BitShift = ShiftOp->getZExtValue();
13548 return std::nullopt;
13550 auto BitsProvided =
Op.getScalarValueSizeInBits();
13551 if (BitsProvided % 8 != 0)
13552 return std::nullopt;
13554 uint64_t BytesProvided = BitsProvided / 8;
13555 uint64_t ByteShift = BitShift / 8;
13560 return BytesProvided - ByteShift > Index
13568 return std::nullopt;
13572 return std::nullopt;
13574 uint64_t BitShift = ShiftOp->getZExtValue();
13575 if (BitShift % 8 != 0)
13576 return std::nullopt;
13577 uint64_t ByteShift = BitShift / 8;
13583 return Index < ByteShift
13586 Depth + 1, StartingIndex);
13595 return std::nullopt;
13603 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13605 if (NarrowBitWidth % 8 != 0)
13606 return std::nullopt;
13607 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13609 if (Index >= NarrowByteWidth)
13611 ? std::optional<ByteProvider<SDValue>>(
13619 return std::nullopt;
13623 if (NarrowByteWidth >= Index) {
13628 return std::nullopt;
13635 return std::nullopt;
13641 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13642 if (NarrowBitWidth % 8 != 0)
13643 return std::nullopt;
13644 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13649 if (Index >= NarrowByteWidth) {
13651 ? std::optional<ByteProvider<SDValue>>(
13656 if (NarrowByteWidth > Index) {
13660 return std::nullopt;
13665 return std::nullopt;
13668 Depth + 1, StartingIndex);
13674 return std::nullopt;
13675 auto VecIdx = IdxOp->getZExtValue();
13676 auto ScalarSize =
Op.getScalarValueSizeInBits();
13677 if (ScalarSize < 32)
13678 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13680 StartingIndex, Index);
13685 return std::nullopt;
13689 return std::nullopt;
13692 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13693 if (IdxMask > 0x07 && IdxMask != 0x0c)
13694 return std::nullopt;
13696 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13697 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13699 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
13705 return std::nullopt;
13720 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
13727 auto MemVT = L->getMemoryVT();
13730 return L->getMemoryVT().getSizeInBits() == 16;
13740 int Low8 = Mask & 0xff;
13741 int Hi8 = (Mask & 0xff00) >> 8;
13743 assert(Low8 < 8 && Hi8 < 8);
13745 bool IsConsecutive = (Hi8 - Low8 == 1);
13750 bool Is16Aligned = !(Low8 % 2);
13752 return IsConsecutive && Is16Aligned;
13760 int Low16 = PermMask & 0xffff;
13761 int Hi16 = (PermMask & 0xffff0000) >> 16;
13771 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13773 if (!OtherOpIs16Bit)
13781 unsigned DWordOffset) {
13786 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13791 if (Src.getValueType().isVector()) {
13792 auto ScalarTySize = Src.getScalarValueSizeInBits();
13793 auto ScalarTy = Src.getValueType().getScalarType();
13794 if (ScalarTySize == 32) {
13798 if (ScalarTySize > 32) {
13801 DAG.
getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13802 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13809 assert(ScalarTySize < 32);
13810 auto NumElements =
TypeSize / ScalarTySize;
13811 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13812 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13813 auto NumElementsIn32 = 32 / ScalarTySize;
13814 auto NumAvailElements = DWordOffset < Trunc32Elements
13816 : NumElements - NormalizedTrunc;
13829 auto ShiftVal = 32 * DWordOffset;
13837 [[maybe_unused]]
EVT VT =
N->getValueType(0);
13842 for (
int i = 0; i < 4; i++) {
13844 std::optional<ByteProvider<SDValue>>
P =
13847 if (!
P ||
P->isConstantZero())
13852 if (PermNodes.
size() != 4)
13855 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13856 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13858 for (
size_t i = 0; i < PermNodes.
size(); i++) {
13859 auto PermOp = PermNodes[i];
13862 int SrcByteAdjust = 4;
13866 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13867 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13869 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13870 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13874 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13875 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13878 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13880 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13883 SDValue Op = *PermNodes[FirstSrc.first].Src;
13885 assert(
Op.getValueSizeInBits() == 32);
13889 int Low16 = PermMask & 0xffff;
13890 int Hi16 = (PermMask & 0xffff0000) >> 16;
13892 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13893 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13896 if (WellFormedLow && WellFormedHi)
13900 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src :
Op;
13909 assert(
Op.getValueType().isByteSized() &&
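// OR combines: merge compatible v_perm byte selects, fold an OR of AMDGPU
// fp-class tests of the same value into a single test, and split i64 ORs with
// constant operands into independent 32-bit halves.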
13927 DAGCombinerInfo &DCI)
const {
13928 SelectionDAG &DAG = DCI.DAG;
13932 EVT VT =
N->getValueType(0);
13933 if (VT == MVT::i1) {
13938 if (Src !=
RHS.getOperand(0))
13943 if (!CLHS || !CRHS)
13947 static const uint32_t MaxMask = 0x3ff;
13967 Sel |=
LHS.getConstantOperandVal(2);
13976 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13980 auto usesCombinedOperand = [](SDNode *OrUse) {
13982 if (OrUse->getOpcode() != ISD::BITCAST ||
13983 !OrUse->getValueType(0).isVector())
13987 for (
auto *VUser : OrUse->users()) {
13988 if (!VUser->getValueType(0).isVector())
13995 if (VUser->getOpcode() == VectorwiseOp)
14001 if (!
any_of(
N->users(), usesCombinedOperand))
14007 if (LHSMask != ~0u && RHSMask != ~0u) {
14010 if (LHSMask > RHSMask) {
14017 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14018 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14021 if (!(LHSUsedLanes & RHSUsedLanes) &&
14024 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14026 LHSMask &= ~RHSUsedLanes;
14027 RHSMask &= ~LHSUsedLanes;
14029 LHSMask |= LHSUsedLanes & 0x04040404;
14031 uint32_t Sel = LHSMask | RHSMask;
14039 if (LHSMask == ~0u || RHSMask == ~0u) {
14080 return IdentitySrc;
14086 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14101 if (SrcVT == MVT::i32) {
14106 DCI.AddToWorklist(LowOr.
getNode());
14107 DCI.AddToWorklist(HiBits.getNode());
14111 return DAG.
getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14118 N->getOperand(0), CRHS))
14126 DAGCombinerInfo &DCI)
const {
14127 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
14134 SelectionDAG &DAG = DCI.DAG;
14136 EVT VT =
N->getValueType(0);
14137 if (CRHS && VT == MVT::i64) {
14139 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::XOR,
LHS, CRHS))
14146 unsigned Opc =
LHS.getOpcode();
14170 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32,
LHS->getOperand(1));
14172 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32,
LHS->getOperand(2));
14176 LHS->getOperand(0), FNegLHS, FNegRHS);
14177 return DAG.
getNode(ISD::BITCAST,
DL, VT, NewSelect);
14185 DAGCombinerInfo &DCI)
const {
14186 if (!Subtarget->has16BitInsts() ||
14190 EVT VT =
N->getValueType(0);
14191 if (VT != MVT::i32)
14195 if (Src.getValueType() != MVT::i16)
14202SITargetLowering::performSignExtendInRegCombine(
SDNode *
N,
14203 DAGCombinerInfo &DCI)
const {
14210 VTSign->getVT() == MVT::i8) ||
14212 VTSign->getVT() == MVT::i16))) {
14213 assert(Subtarget->hasScalarSubwordLoads() &&
14214 "s_buffer_load_{u8, i8} are supported "
14215 "in GFX12 (or newer) architectures.");
14216 EVT VT = Src.getValueType();
14221 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14228 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14229 Opc,
DL, ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
14234 VTSign->getVT() == MVT::i8) ||
14236 VTSign->getVT() == MVT::i16)) &&
14245 Src.getOperand(6), Src.getOperand(7)};
14248 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14252 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14253 Opc, SDLoc(
N), ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
14254 return DCI.DAG.getMergeValues(
14255 {BufferLoadSignExt, BufferLoadSignExt.
getValue(1)}, SDLoc(
N));
14261 DAGCombinerInfo &DCI)
const {
14262 SelectionDAG &DAG = DCI.DAG;
14269 if (
N->getOperand(0).isUndef())
14276 DAGCombinerInfo &DCI)
const {
14277 EVT VT =
N->getValueType(0);
14292 if ((VT == MVT::f16 && N0.
getOpcode() == ISD::FSQRT) &&
14302 unsigned MaxDepth)
const {
14303 unsigned Opcode =
Op.getOpcode();
14308 const auto &
F = CFP->getValueAPF();
14309 if (
F.isNaN() &&
F.isSignaling())
14311 if (!
F.isDenormal())
14337 case ISD::FP_EXTEND:
14338 case ISD::FP16_TO_FP:
14339 case ISD::FP_TO_FP16:
14340 case ISD::BF16_TO_FP:
14341 case ISD::FP_TO_BF16:
14374 if (
Op.getValueType() == MVT::i32) {
14380 if (RHS->getZExtValue() == 0xffff0000) {
14390 return Op.getValueType().getScalarType() != MVT::f16;
14394 case ISD::FMINNUM_IEEE:
14395 case ISD::FMAXNUM_IEEE:
14396 case ISD::FMINIMUM:
14397 case ISD::FMAXIMUM:
14398 case ISD::FMINIMUMNUM:
14399 case ISD::FMAXIMUMNUM:
14411 if (Subtarget->supportsMinMaxDenormModes() ||
14421 for (
unsigned I = 0, E =
Op.getNumOperands();
I != E; ++
I) {
14433 for (
unsigned i = 0, e =
Op.getNumOperands(); i != e; ++i) {
14460 if (
Op.getValueType() == MVT::i16) {
14463 TruncSrc.
getOpcode() == ISD::BITCAST &&
14471 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
14473 switch (IntrinsicID) {
14474 case Intrinsic::amdgcn_cvt_pkrtz:
14475 case Intrinsic::amdgcn_cubeid:
14476 case Intrinsic::amdgcn_frexp_mant:
14477 case Intrinsic::amdgcn_fdot2:
14478 case Intrinsic::amdgcn_rcp:
14479 case Intrinsic::amdgcn_rsq:
14480 case Intrinsic::amdgcn_rsq_clamp:
14481 case Intrinsic::amdgcn_rcp_legacy:
14482 case Intrinsic::amdgcn_rsq_legacy:
14483 case Intrinsic::amdgcn_trig_preop:
14484 case Intrinsic::amdgcn_tanh:
14485 case Intrinsic::amdgcn_log:
14486 case Intrinsic::amdgcn_exp2:
14487 case Intrinsic::amdgcn_sqrt:
14505 unsigned MaxDepth)
const {
14508 unsigned Opcode =
MI->getOpcode();
14510 if (Opcode == AMDGPU::G_FCANONICALIZE)
14513 std::optional<FPValueAndVReg> FCR;
14516 if (FCR->Value.isSignaling())
14518 if (!FCR->Value.isDenormal())
14529 case AMDGPU::G_FADD:
14530 case AMDGPU::G_FSUB:
14531 case AMDGPU::G_FMUL:
14532 case AMDGPU::G_FCEIL:
14533 case AMDGPU::G_FFLOOR:
14534 case AMDGPU::G_FRINT:
14535 case AMDGPU::G_FNEARBYINT:
14536 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14537 case AMDGPU::G_INTRINSIC_TRUNC:
14538 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14539 case AMDGPU::G_FMA:
14540 case AMDGPU::G_FMAD:
14541 case AMDGPU::G_FSQRT:
14542 case AMDGPU::G_FDIV:
14543 case AMDGPU::G_FREM:
14544 case AMDGPU::G_FPOW:
14545 case AMDGPU::G_FPEXT:
14546 case AMDGPU::G_FLOG:
14547 case AMDGPU::G_FLOG2:
14548 case AMDGPU::G_FLOG10:
14549 case AMDGPU::G_FPTRUNC:
14550 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14551 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14552 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14553 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14554 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14556 case AMDGPU::G_FNEG:
14557 case AMDGPU::G_FABS:
14558 case AMDGPU::G_FCOPYSIGN:
14560 case AMDGPU::G_FMINNUM:
14561 case AMDGPU::G_FMAXNUM:
14562 case AMDGPU::G_FMINNUM_IEEE:
14563 case AMDGPU::G_FMAXNUM_IEEE:
14564 case AMDGPU::G_FMINIMUM:
14565 case AMDGPU::G_FMAXIMUM:
14566 case AMDGPU::G_FMINIMUMNUM:
14567 case AMDGPU::G_FMAXIMUMNUM: {
14568 if (Subtarget->supportsMinMaxDenormModes() ||
14575 case AMDGPU::G_BUILD_VECTOR:
14580 case AMDGPU::G_INTRINSIC:
14581 case AMDGPU::G_INTRINSIC_CONVERGENT:
14583 case Intrinsic::amdgcn_fmul_legacy:
14584 case Intrinsic::amdgcn_fmad_ftz:
14585 case Intrinsic::amdgcn_sqrt:
14586 case Intrinsic::amdgcn_fmed3:
14587 case Intrinsic::amdgcn_sin:
14588 case Intrinsic::amdgcn_cos:
14589 case Intrinsic::amdgcn_log:
14590 case Intrinsic::amdgcn_exp2:
14591 case Intrinsic::amdgcn_log_clamp:
14592 case Intrinsic::amdgcn_rcp:
14593 case Intrinsic::amdgcn_rcp_legacy:
14594 case Intrinsic::amdgcn_rsq:
14595 case Intrinsic::amdgcn_rsq_clamp:
14596 case Intrinsic::amdgcn_rsq_legacy:
14597 case Intrinsic::amdgcn_div_scale:
14598 case Intrinsic::amdgcn_div_fmas:
14599 case Intrinsic::amdgcn_div_fixup:
14600 case Intrinsic::amdgcn_fract:
14601 case Intrinsic::amdgcn_cvt_pkrtz:
14602 case Intrinsic::amdgcn_cubeid:
14603 case Intrinsic::amdgcn_cubema:
14604 case Intrinsic::amdgcn_cubesc:
14605 case Intrinsic::amdgcn_cubetc:
14606 case Intrinsic::amdgcn_frexp_mant:
14607 case Intrinsic::amdgcn_fdot2:
14608 case Intrinsic::amdgcn_trig_preop:
14609 case Intrinsic::amdgcn_tanh:
14628 if (
C.isDenormal()) {
14642 if (
C.isSignaling()) {
14665SITargetLowering::performFCanonicalizeCombine(
SDNode *
N,
14666 DAGCombinerInfo &DCI)
const {
14667 SelectionDAG &DAG = DCI.DAG;
14669 EVT VT =
N->getValueType(0);
14678 EVT VT =
N->getValueType(0);
14679 return getCanonicalConstantFP(DAG, SDLoc(
N), VT, CFP->getValueAPF());
14695 EVT EltVT =
Lo.getValueType();
14698 for (
unsigned I = 0;
I != 2; ++
I) {
14702 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14703 }
else if (
Op.isUndef()) {
14737 case ISD::FMAXNUM_IEEE:
14738 case ISD::FMAXIMUMNUM:
14740 case ISD::FMAXIMUM:
14747 case ISD::FMINNUM_IEEE:
14748 case ISD::FMINIMUMNUM:
14750 case ISD::FMINIMUM:
14776 if (!MinK || !MaxK)
14789 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14790 return DAG.
getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14849 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
14855 if (
Info->getMode().DX10Clamp) {
14864 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14892 case ISD::FMINNUM_IEEE:
14893 case ISD::FMAXNUM_IEEE:
14894 case ISD::FMINIMUMNUM:
14895 case ISD::FMAXIMUMNUM:
14898 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.
hasMin3Max3_16()) ||
14900 case ISD::FMINIMUM:
14901 case ISD::FMAXIMUM:
14909 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.
hasMin3Max3_16());
14918 DAGCombinerInfo &DCI)
const {
14919 SelectionDAG &DAG = DCI.DAG;
14951 if (
SDValue Med3 = performIntMed3ImmCombine(
14956 if (
SDValue Med3 = performIntMed3ImmCombine(
14962 if (
SDValue Med3 = performIntMed3ImmCombine(
14967 if (
SDValue Med3 = performIntMed3ImmCombine(
14977 if (((
Opc == ISD::FMINNUM && Op0.
getOpcode() == ISD::FMAXNUM) ||
14978 (
Opc == ISD::FMINNUM_IEEE && Op0.
getOpcode() == ISD::FMAXNUM_IEEE) ||
14979 (
Opc == ISD::FMINIMUMNUM && Op0.
getOpcode() == ISD::FMAXIMUMNUM) ||
14982 (VT == MVT::f32 || VT == MVT::f64 ||
14983 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14984 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14985 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14986 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14988 if (
SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(
N), Op0, Op1))
14995 const SDNodeFlags
Flags =
N->getFlags();
14996 if ((
Opc == ISD::FMINIMUM ||
Opc == ISD::FMAXIMUM) &&
14997 !Subtarget->hasIEEEMinimumMaximumInsts() &&
Flags.hasNoNaNs()) {
14999 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
15000 return DAG.
getNode(NewOpc, SDLoc(
N), VT, Op0, Op1, Flags);
15010 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15011 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15020 DAGCombinerInfo &DCI)
const {
15021 EVT VT =
N->getValueType(0);
15025 SelectionDAG &DAG = DCI.DAG;
15040 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
15044 if (
Info->getMode().DX10Clamp) {
15064 DAGCombinerInfo &DCI)
const {
15068 return DCI.DAG.getUNDEF(
N->getValueType(0));
15076 bool IsDivergentIdx,
15081 unsigned VecSize = EltSize * NumElem;
15084 if (VecSize <= 64 && EltSize < 32)
15093 if (IsDivergentIdx)
15097 unsigned NumInsts = NumElem +
15098 ((EltSize + 31) / 32) * NumElem ;
15102 if (Subtarget->useVGPRIndexMode())
15103 return NumInsts <= 16;
15107 if (Subtarget->hasMovrel())
15108 return NumInsts <= 15;
15114 SDValue Idx =
N->getOperand(
N->getNumOperands() - 1);
15129SITargetLowering::performExtractVectorEltCombine(
SDNode *
N,
15130 DAGCombinerInfo &DCI)
const {
15136 EVT ResVT =
N->getValueType(0);
15160 if (!
C ||
C->getZExtValue() != 0x1f)
15176 if (Vec.
hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15195 case ISD::FMAXNUM_IEEE:
15196 case ISD::FMINNUM_IEEE:
15197 case ISD::FMAXIMUM:
15198 case ISD::FMINIMUM: {
15204 DCI.AddToWorklist(Elt0.
getNode());
15205 DCI.AddToWorklist(Elt1.
getNode());
15227 if (!DCI.isBeforeLegalize())
15235 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15238 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15239 unsigned EltIdx = BitIndex / 32;
15240 unsigned LeftoverBitIdx = BitIndex % 32;
15244 DCI.AddToWorklist(Cast.
getNode());
15248 DCI.AddToWorklist(Elt.
getNode());
15251 DCI.AddToWorklist(Srl.
getNode());
15255 DCI.AddToWorklist(Trunc.
getNode());
15257 if (VecEltVT == ResVT) {
15258 return DAG.
getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15269SITargetLowering::performInsertVectorEltCombine(
SDNode *
N,
15270 DAGCombinerInfo &DCI)
const {
15281 SelectionDAG &DAG = DCI.DAG;
15300 if (Src.getOpcode() == ISD::FP_EXTEND &&
15301 Src.getOperand(0).getValueType() == MVT::f16) {
15302 return Src.getOperand(0);
15306 APFloat Val = CFP->getValueAPF();
15307 bool LosesInfo =
true;
15317 DAGCombinerInfo &DCI)
const {
15318 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15319 "combine only useful on gfx8");
15321 SDValue TruncSrc =
N->getOperand(0);
15322 EVT VT =
N->getValueType(0);
15323 if (VT != MVT::f16)
15330 SelectionDAG &DAG = DCI.DAG;
15358 return DAG.
getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15361unsigned SITargetLowering::getFusedOpcode(
const SelectionDAG &DAG,
15363 const SDNode *N1)
const {
15368 if (((VT == MVT::f32 &&
15370 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15390 EVT VT =
N->getValueType(0);
15391 if (VT != MVT::i32 && VT != MVT::i64)
15397 unsigned Opc =
N->getOpcode();
15452 if (!Const ||
Hi_32(Const->getZExtValue()) !=
uint32_t(-1))
15471 DAGCombinerInfo &DCI)
const {
15474 SelectionDAG &DAG = DCI.DAG;
15475 EVT VT =
N->getValueType(0);
15485 if (!
N->isDivergent() && Subtarget->hasSMulHi())
15489 if (NumBits <= 32 || NumBits > 64)
15500 if (!Subtarget->hasFullRate64Ops()) {
15501 unsigned NumUsers = 0;
15502 for (SDNode *User :
LHS->
users()) {
15505 if (!
User->isAnyAdd())
15529 bool MulSignedLo =
false;
15530 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15539 if (VT != MVT::i64) {
15562 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15564 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15565 auto [AccumLo, AccumHi] = DAG.
SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15567 if (!MulLHSUnsigned32) {
15574 if (!MulRHSUnsigned32) {
15585 if (VT != MVT::i64)
15591SITargetLowering::foldAddSub64WithZeroLowBitsTo32(
SDNode *
N,
15592 DAGCombinerInfo &DCI)
const {
15602 SelectionDAG &DAG = DCI.DAG;
15617 unsigned Opcode =
N->getOpcode();
15618 if (Opcode == ISD::PTRADD)
15621 DAG.
getNode(Opcode, SL, MVT::i32,
Hi, ConstHi32,
N->getFlags());
15632static std::optional<ByteProvider<SDValue>>
15635 if (!Byte0 || Byte0->isConstantZero()) {
15636 return std::nullopt;
15639 if (Byte1 && !Byte1->isConstantZero()) {
15640 return std::nullopt;
15646 unsigned FirstCs =
First & 0x0c0c0c0c;
15647 unsigned SecondCs = Second & 0x0c0c0c0c;
15648 unsigned FirstNoCs =
First & ~0x0c0c0c0c;
15649 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15651 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15652 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15653 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15654 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15656 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15680 for (
int BPI = 0; BPI < 2; BPI++) {
15683 BPP = {Src1, Src0};
15685 unsigned ZeroMask = 0x0c0c0c0c;
15686 unsigned FMask = 0xFF << (8 * (3 - Step));
15688 unsigned FirstMask =
15689 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15690 unsigned SecondMask =
15691 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15695 int FirstGroup = -1;
15696 for (
int I = 0;
I < 2;
I++) {
15698 auto MatchesFirst = [&BPP](
DotSrc &IterElt) {
15699 return IterElt.SrcOp == *BPP.first.Src &&
15700 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15704 if (Match != Srcs.
end()) {
15705 Match->PermMask =
addPermMasks(FirstMask, Match->PermMask);
15710 if (FirstGroup != -1) {
15712 auto MatchesSecond = [&BPP](
DotSrc &IterElt) {
15713 return IterElt.SrcOp == *BPP.second.Src &&
15714 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15717 if (Match != Srcs.
end()) {
15718 Match->PermMask =
addPermMasks(SecondMask, Match->PermMask);
15720 Srcs.
push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15728 unsigned ZeroMask = 0x0c0c0c0c;
15729 unsigned FMask = 0xFF << (8 * (3 - Step));
15733 ((Src0.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15737 ((Src1.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15746 if (Srcs.
size() == 1) {
15747 auto *Elt = Srcs.
begin();
15751 if (Elt->PermMask == 0x3020100)
15758 auto *FirstElt = Srcs.
begin();
15759 auto *SecondElt = std::next(FirstElt);
15766 auto FirstMask = FirstElt->PermMask;
15767 auto SecondMask = SecondElt->PermMask;
15769 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15770 unsigned FirstPlusFour = FirstMask | 0x04040404;
15773 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15785 FirstElt = std::next(SecondElt);
15786 if (FirstElt == Srcs.
end())
15789 SecondElt = std::next(FirstElt);
15792 if (SecondElt == Srcs.
end()) {
15798 DAG.
getConstant(FirstElt->PermMask, SL, MVT::i32)));
15804 return Perms.
size() == 2
15810 for (
auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15811 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15812 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15813 EntryMask += ZeroMask;
15818 auto Opcode =
Op.getOpcode();
15824static std::optional<bool>
15835 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15838 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15840 assert(!(S0IsUnsigned && S0IsSigned));
15841 assert(!(S1IsUnsigned && S1IsSigned));
15849 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15855 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15856 return std::nullopt;
15868 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15869 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15874 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15880 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15881 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15882 return std::nullopt;
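// ADD combines: try the mad_u64_u32 / mad_i64_i32 folds, reassociate to keep
// uniform values on the SALU, and recognize byte-wise multiply-accumulate
// chains that can become dot4 intrinsics on subtargets with DOT instructions.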
15888 DAGCombinerInfo &DCI)
const {
15889 SelectionDAG &DAG = DCI.DAG;
15890 EVT VT =
N->getValueType(0);
15896 if (Subtarget->hasMad64_32()) {
15897 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
15902 if (
SDValue V = reassociateScalarOps(
N, DAG)) {
15906 if (VT == MVT::i64) {
15907 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
15912 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
15914 std::optional<bool> IsSigned;
15920 int ChainLength = 0;
15921 for (
int I = 0;
I < 4;
I++) {
15925 auto Src0 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15928 auto Src1 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15933 TempNode->getOperand(MulIdx), *Src0, *Src1,
15934 TempNode->getOperand(MulIdx)->getOperand(0),
15935 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15939 IsSigned = *IterIsSigned;
15940 if (*IterIsSigned != *IsSigned)
15943 auto AddIdx = 1 - MulIdx;
15946 if (
I == 2 &&
isMul(TempNode->getOperand(AddIdx))) {
15947 Src2s.
push_back(TempNode->getOperand(AddIdx));
15957 TempNode->getOperand(AddIdx), *Src0, *Src1,
15958 TempNode->getOperand(AddIdx)->getOperand(0),
15959 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15963 if (*IterIsSigned != *IsSigned)
15967 ChainLength =
I + 2;
15971 TempNode = TempNode->getOperand(AddIdx);
15973 ChainLength =
I + 1;
15974 if (TempNode->getNumOperands() < 2)
15976 LHS = TempNode->getOperand(0);
15977 RHS = TempNode->getOperand(1);
15980 if (ChainLength < 2)
15986 if (ChainLength < 4) {
15996 bool UseOriginalSrc =
false;
15997 if (ChainLength == 4 && Src0s.
size() == 1 && Src1s.
size() == 1 &&
15998 Src0s.
begin()->PermMask == Src1s.
begin()->PermMask &&
15999 Src0s.
begin()->SrcOp.getValueSizeInBits() >= 32 &&
16000 Src1s.
begin()->SrcOp.getValueSizeInBits() >= 32) {
16001 SmallVector<unsigned, 4> SrcBytes;
16002 auto Src0Mask = Src0s.
begin()->PermMask;
16003 SrcBytes.
push_back(Src0Mask & 0xFF000000);
16004 bool UniqueEntries =
true;
16005 for (
auto I = 1;
I < 4;
I++) {
16006 auto NextByte = Src0Mask & (0xFF << ((3 -
I) * 8));
16009 UniqueEntries =
false;
16015 if (UniqueEntries) {
16016 UseOriginalSrc =
true;
16018 auto *FirstElt = Src0s.
begin();
16022 auto *SecondElt = Src1s.
begin();
16024 SecondElt->DWordOffset);
16033 if (!UseOriginalSrc) {
16040 DAG.
getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16043 : Intrinsic::amdgcn_udot4,
16053 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16058 unsigned Opc =
LHS.getOpcode();
16070 auto Cond =
RHS.getOperand(0);
16075 SDVTList VTList = DAG.
getVTList(MVT::i32, MVT::i1);
16092 DAGCombinerInfo &DCI)
const {
16093 SelectionDAG &DAG = DCI.DAG;
16095 EVT VT =
N->getValueType(0);
16108 SDNodeFlags ShlFlags = N1->
getFlags();
16112 SDNodeFlags NewShlFlags =
16117 DCI.AddToWorklist(Inner.
getNode());
16124 if (Subtarget->hasMad64_32()) {
16125 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
16134 if (VT == MVT::i64) {
16135 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16148 if (!YIsConstant && !ZIsConstant && !
X->isDivergent() &&
16149 Y->isDivergent() !=
Z->isDivergent()) {
16158 if (
Y->isDivergent())
16161 SDNodeFlags ReassocFlags =
16164 DCI.AddToWorklist(UniformInner.
getNode());
16172 DAGCombinerInfo &DCI)
const {
16173 SelectionDAG &DAG = DCI.DAG;
16174 EVT VT =
N->getValueType(0);
16176 if (VT == MVT::i64) {
16177 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16181 if (VT != MVT::i32)
16190 unsigned Opc =
RHS.getOpcode();
16197 auto Cond =
RHS.getOperand(0);
16202 SDVTList VTList = DAG.
getVTList(MVT::i32, MVT::i1);
16220SITargetLowering::performAddCarrySubCarryCombine(
SDNode *
N,
16221 DAGCombinerInfo &DCI)
const {
16223 if (
N->getValueType(0) != MVT::i32)
16229 SelectionDAG &DAG = DCI.DAG;
16234 unsigned LHSOpc =
LHS.getOpcode();
16235 unsigned Opc =
N->getOpcode();
16239 return DAG.
getNode(
Opc, SDLoc(
N),
N->getVTList(), Args);
16245 DAGCombinerInfo &DCI)
const {
16249 SelectionDAG &DAG = DCI.DAG;
16250 EVT VT =
N->getValueType(0);
16262 if (
A ==
LHS.getOperand(1)) {
16263 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
16264 if (FusedOp != 0) {
16266 return DAG.
getNode(FusedOp, SL, VT,
A, Two,
RHS);
16274 if (
A ==
RHS.getOperand(1)) {
16275 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
16276 if (FusedOp != 0) {
16278 return DAG.
getNode(FusedOp, SL, VT,
A, Two,
LHS);
16287 DAGCombinerInfo &DCI)
const {
16291 SelectionDAG &DAG = DCI.DAG;
16293 EVT VT =
N->getValueType(0);
16306 if (
A ==
LHS.getOperand(1)) {
16307 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
16308 if (FusedOp != 0) {
16312 return DAG.
getNode(FusedOp, SL, VT,
A, Two, NegRHS);
16321 if (
A ==
RHS.getOperand(1)) {
16322 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
16323 if (FusedOp != 0) {
16325 return DAG.
getNode(FusedOp, SL, VT,
A, NegTwo,
LHS);
16334 DAGCombinerInfo &DCI)
const {
16335 SelectionDAG &DAG = DCI.DAG;
16337 EVT VT =
N->getValueType(0);
16338 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
16344 SDNodeFlags
Flags =
N->getFlags();
16345 SDNodeFlags RHSFlags =
RHS->getFlags();
16351 bool IsNegative =
false;
16352 if (CLHS->isExactlyValue(1.0) ||
16353 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16356 if (
RHS.getOpcode() == ISD::FSQRT) {
16360 return IsNegative ? DAG.
getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16369 DAGCombinerInfo &DCI)
const {
16370 SelectionDAG &DAG = DCI.DAG;
16371 EVT VT =
N->getValueType(0);
16375 if (!
N->isDivergent() &&
getSubtarget()->hasSALUFloatInsts() &&
16376 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16391 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16396 const ConstantFPSDNode *FalseNode =
16406 if (ScalarVT == MVT::f32 &&
16412 if (TrueNodeExpVal == INT_MIN)
16415 if (FalseNodeExpVal == INT_MIN)
16428 return DAG.
getNode(ISD::FLDEXP, SL, VT,
LHS, SelectNode,
N->getFlags());
16435 DAGCombinerInfo &DCI)
const {
16436 SelectionDAG &DAG = DCI.DAG;
16437 EVT VT =
N->getValueType(0);
16440 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16458 (
N->getFlags().hasAllowContract() &&
16459 FMA->getFlags().hasAllowContract())) {
16474 if (FMAOp1.
getOpcode() != ISD::FP_EXTEND ||
16493 if (Vec1 == Vec2 || Vec3 == Vec4)
16499 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16508 DAGCombinerInfo &DCI)
const {
16509 SelectionDAG &DAG = DCI.DAG;
16514 EVT VT =
LHS.getValueType();
16543 return LHS.getOperand(0);
16551 LHS.getConstantOperandVal(1) !=
LHS.getConstantOperandVal(2) &&
16558 const APInt &CT =
LHS.getConstantOperandAPInt(1);
16559 const APInt &CF =
LHS.getConstantOperandAPInt(2);
16567 return LHS.getOperand(0);
16599 DAG.
getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
16604 {Op0Hi, Op1Hi, CarryInHi});
16614 DCI.CombineTo(
LHS.getNode(), Result);
16618 if (VT != MVT::f32 && VT != MVT::f64 &&
16619 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16627 LHS.getOpcode() == ISD::FABS) {
16634 const unsigned IsInfMask =
16636 const unsigned IsFiniteMask =
16650SITargetLowering::performCvtF32UByteNCombine(
SDNode *
N,
16651 DAGCombinerInfo &DCI)
const {
16652 SelectionDAG &DAG = DCI.DAG;
16673 unsigned ShiftOffset = 8 *
Offset;
16675 ShiftOffset -=
C->getZExtValue();
16677 ShiftOffset +=
C->getZExtValue();
16679 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16681 MVT::f32, Shifted);
16692 DCI.AddToWorklist(
N);
16699 return DAG.
getNode(
N->getOpcode(), SL, MVT::f32, DemandedSrc);
16705 DAGCombinerInfo &DCI)
const {
16710 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16714 (
F.isNaN() && MF.
getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16715 return DCI.DAG.getConstantFP(Zero, SDLoc(
N),
N->getValueType(0));
16718 APFloat One(
F.getSemantics(),
"1.0");
16720 return DCI.DAG.getConstantFP(One, SDLoc(
N),
N->getValueType(0));
16726 DAGCombinerInfo &DCI)
const {
16747 bool isFloatingPoint =
LHS.getValueType().isFloatingPoint();
16748 bool isInteger =
LHS.getValueType().isInteger();
16751 if (!isFloatingPoint && !isInteger)
16756 if (!isEquality && !isNonEquality)
16773 if (isFloatingPoint) {
16775 if (!Val.
isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16786 if (!(isEquality && TrueVal == ConstVal) &&
16787 !(isNonEquality && FalseVal == ConstVal))
16794 SelectLHS, SelectRHS);
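// Main DAG combine dispatcher: uniform operations are first promoted to i32
// where profitable, then each opcode is routed to its dedicated combine
// helper.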
16799 switch (
N->getOpcode()) {
16815 if (
auto Res = promoteUniformOpToI32(
SDValue(
N, 0), DCI))
16825 switch (
N->getOpcode()) {
16827 return performAddCombine(
N, DCI);
16829 return performPtrAddCombine(
N, DCI);
16831 return performSubCombine(
N, DCI);
16834 return performAddCarrySubCarryCombine(
N, DCI);
16836 return performFAddCombine(
N, DCI);
16838 return performFSubCombine(
N, DCI);
16840 return performFDivCombine(
N, DCI);
16842 return performFMulCombine(
N, DCI);
16844 return performSetCCCombine(
N, DCI);
16846 if (
auto Res = performSelectCombine(
N, DCI))
16851 case ISD::FMAXNUM_IEEE:
16852 case ISD::FMINNUM_IEEE:
16853 case ISD::FMAXIMUM:
16854 case ISD::FMINIMUM:
16855 case ISD::FMAXIMUMNUM:
16856 case ISD::FMINIMUMNUM:
16863 return performMinMaxCombine(N, DCI);
16865 return performFMACombine(N, DCI);
16867 return performAndCombine(N, DCI);
16869 return performOrCombine(N, DCI);
16872 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
16873 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16879 return performXorCombine(N, DCI);
16881 return performZeroExtendCombine(N, DCI);
16883 return performSignExtendInRegCombine(N, DCI);
16885 return performClassCombine(N, DCI);
16887 return performFCanonicalizeCombine(N, DCI);
16889 return performRcpCombine(N, DCI);
16904 return performUCharToFloatCombine(N, DCI);
16906 return performFCopySignCombine(N, DCI);
16911 return performCvtF32UByteNCombine(N, DCI);
16913 return performFMed3Combine(N, DCI);
16915 return performCvtPkRTZCombine(N, DCI);
16917 return performClampCombine(N, DCI);
16920 EVT VT = N->getValueType(0);
16923 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16926 EVT EltVT = Src.getValueType();
16927 if (EltVT != MVT::i16)
16928 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
16931 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
16937 return performExtractVectorEltCombine(N, DCI);
16939 return performInsertVectorEltCombine(N, DCI);
16941 return performFPRoundCombine(N, DCI);
16950 return performMemSDNodeCombine(MemNode, DCI);
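// adjustWritemask (below) shrinks the dmask of an image (MIMG) load so that only
// the components actually extracted through EXTRACT_SUBREG users are requested,
// then rewrites the node to the equivalent MIMG opcode with fewer result channels.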
16981 unsigned Opcode = Node->getMachineOpcode();
16984 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16985 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
16988 SDNode *Users[5] = {nullptr};
16990 unsigned DmaskIdx =
16991 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16992 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
16993 unsigned NewDmask = 0;
16994 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16995 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16996 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
16997 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
16998 unsigned TFCLane = 0;
16999 bool HasChain = Node->getNumValues() > 1;
17001 if (OldDmask == 0) {
17009 TFCLane = OldBitsSet;
17013 for (SDUse &Use : Node->uses()) {
17016 if (Use.getResNo() != 0)
17019 SDNode *User = Use.getUser();
17022 if (!User->isMachineOpcode() ||
17023 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17035 if (UsesTFC && Lane == TFCLane) {
17040 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17042 Dmask &= ~(1 << Comp);
17050 NewDmask |= 1 << Comp;
17055 bool NoChannels = !NewDmask;
17062 if (OldBitsSet == 1)
17068 if (NewDmask == OldDmask)
17077 unsigned NewChannels = BitsSet + UsesTFC;
17081 assert(NewOpcode != -1 &&
17082 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17083 "failed to find equivalent MIMG op");
17091 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17093 MVT ResultVT = NewChannels == 1
17096 : NewChannels == 5 ? 8
17098 SDVTList NewVTList =
17101 MachineSDNode *NewNode =
17110 if (NewChannels == 1) {
17120 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17125 if (i || !NoChannels)
17130 if (NewUser != User) {
17140 Idx = AMDGPU::sub1;
17143 Idx = AMDGPU::sub2;
17146 Idx = AMDGPU::sub3;
17149 Idx = AMDGPU::sub4;
17160 Op = Op.getOperand(0);
17181 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17185 Node->getOperand(0), SL, VReg, SrcVal,
17191 return ToResultReg.getNode();
17196 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17198 Ops.push_back(Node->getOperand(i));
17204 Node->getOperand(i).getValueType(),
17205 Node->getOperand(i)),
17217 unsigned Opcode = Node->getMachineOpcode();
17219 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17220 !TII->isGather4(Opcode) &&
17222 return adjustWritemask(Node, DAG);
17225 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17231 case AMDGPU::V_DIV_SCALE_F32_e64:
17232 case AMDGPU::V_DIV_SCALE_F64_e64: {
17242 (Src0 == Src1 || Src0 == Src2))
17298 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17299 unsigned InitIdx = 0;
17301 if (TII->isImage(MI)) {
17309 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17310 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17311 unsigned D16Val = D16 ? D16->getImm() : 0;
17313 if (!TFEVal && !LWEVal)
17324 assert(MO_Dmask && "Expected dmask operand in instruction");
17326 unsigned dmask = MO_Dmask->getImm();
17331 bool Packed = !Subtarget->hasUnpackedD16VMem();
17333 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17339 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17340 if (DstSize < InitIdx)
17343 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17351 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17352 unsigned NewDst = 0;
17357 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17358 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17361 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17362 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17382 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17394 if (TII->isVOP3(MI.getOpcode())) {
17396 TII->legalizeOperandsVOP3(MRI, MI);
17398 if (TII->isMAI(MI)) {
17403 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17404 AMDGPU::OpName::scale_src0);
17405 if (Src0Idx != -1) {
17406 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17407 AMDGPU::OpName::scale_src1);
17408 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17409 TII->usesConstantBus(MRI, MI, Src1Idx))
17410 TII->legalizeOpWithMove(MI, Src1Idx);
17417 if (TII->isImage(MI))
17418 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
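// Inline-assembly constraint handling: single-letter constraints ('s', 'v', 'a')
// and the two-letter "VA" constraint are mapped to SGPR, VGPR, AGPR, or AV
// register classes of the requested bit width; unsupported combinations fall
// back to std::pair(0U, nullptr).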
17492std::pair<unsigned, const TargetRegisterClass *>
17499 if (Constraint.size() == 1) {
17503 if (VT == MVT::Other)
17506 switch (Constraint[0]) {
17513 RC = &AMDGPU::SReg_32RegClass;
17516 RC = &AMDGPU::SGPR_64RegClass;
17521 return std::pair(0U, nullptr);
17528 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17529 : &AMDGPU::VGPR_32_Lo256RegClass;
17532 RC = Subtarget->has1024AddressableVGPRs()
17533 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17536 return std::pair(0U, nullptr);
17541 if (!Subtarget->hasMAIInsts())
17545 RC = &AMDGPU::AGPR_32RegClass;
17550 return std::pair(0U, nullptr);
17555 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17559 RC = &AMDGPU::AV_32RegClass;
17562 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17564 return std::pair(0U, nullptr);
17573 return std::pair(0U, RC);
17576 if (Kind != '\0') {
17578 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17579 } else if (Kind == 's') {
17580 RC = &AMDGPU::SGPR_32RegClass;
17581 } else if (Kind == 'a') {
17582 RC = &AMDGPU::AGPR_32RegClass;
17588 return std::pair(0U, nullptr);
17594 return std::pair(0U, nullptr);
17598 RC = TRI->getVGPRClassForBitWidth(Width);
17600 RC = TRI->getSGPRClassForBitWidth(Width);
17602 RC = TRI->getAGPRClassForBitWidth(Width);
17604 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17609 return std::pair(0U, nullptr);
17611 return std::pair(Reg, RC);
17617 return std::pair(0U, nullptr);
17618 if (Idx < RC->getNumRegs())
17620 return std::pair(0U, nullptr);
17626 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17632 if (Constraint.size() == 1) {
17633 switch (Constraint[0]) {
17643 } else if (Constraint == "DA" || Constraint == "DB") {
17651 if (Constraint.size() == 1) {
17652 switch (Constraint[0]) {
17660 } else if (Constraint.size() == 2) {
17661 if (Constraint == "VA")
17679 std::vector<SDValue> &Ops,
17694 unsigned Size = Op.getScalarValueSizeInBits();
17698 if (Size == 16 && !Subtarget->has16BitInsts())
17702 Val = C->getSExtValue();
17706 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17710 if (Size != 16 || Op.getNumOperands() != 2)
17712 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17715 Val = C->getSExtValue();
17719 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17729 if (Constraint.size() == 1) {
17730 switch (Constraint[0]) {
17745 } else if (Constraint.size() == 2) {
17746 if (Constraint == "DA") {
17747 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17748 int64_t LoBits = static_cast<int32_t>(Val);
17752 if (Constraint == "DB") {
17760 unsigned MaxSize) const {
17761 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17762 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17764 MVT VT = Op.getSimpleValueType();
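// Maps an unaligned VGPR/AGPR register class ID to its *_Align2 counterpart.
// Used below when the subtarget requires even-aligned vector register tuples
// (ST.needsAlignedVGPRs()).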
17789 switch (UnalignedClassID) {
17790 case AMDGPU::VReg_64RegClassID:
17791 return AMDGPU::VReg_64_Align2RegClassID;
17792 case AMDGPU::VReg_96RegClassID:
17793 return AMDGPU::VReg_96_Align2RegClassID;
17794 case AMDGPU::VReg_128RegClassID:
17795 return AMDGPU::VReg_128_Align2RegClassID;
17796 case AMDGPU::VReg_160RegClassID:
17797 return AMDGPU::VReg_160_Align2RegClassID;
17798 case AMDGPU::VReg_192RegClassID:
17799 return AMDGPU::VReg_192_Align2RegClassID;
17800 case AMDGPU::VReg_224RegClassID:
17801 return AMDGPU::VReg_224_Align2RegClassID;
17802 case AMDGPU::VReg_256RegClassID:
17803 return AMDGPU::VReg_256_Align2RegClassID;
17804 case AMDGPU::VReg_288RegClassID:
17805 return AMDGPU::VReg_288_Align2RegClassID;
17806 case AMDGPU::VReg_320RegClassID:
17807 return AMDGPU::VReg_320_Align2RegClassID;
17808 case AMDGPU::VReg_352RegClassID:
17809 return AMDGPU::VReg_352_Align2RegClassID;
17810 case AMDGPU::VReg_384RegClassID:
17811 return AMDGPU::VReg_384_Align2RegClassID;
17812 case AMDGPU::VReg_512RegClassID:
17813 return AMDGPU::VReg_512_Align2RegClassID;
17814 case AMDGPU::VReg_1024RegClassID:
17815 return AMDGPU::VReg_1024_Align2RegClassID;
17816 case AMDGPU::AReg_64RegClassID:
17817 return AMDGPU::AReg_64_Align2RegClassID;
17818 case AMDGPU::AReg_96RegClassID:
17819 return AMDGPU::AReg_96_Align2RegClassID;
17820 case AMDGPU::AReg_128RegClassID:
17821 return AMDGPU::AReg_128_Align2RegClassID;
17822 case AMDGPU::AReg_160RegClassID:
17823 return AMDGPU::AReg_160_Align2RegClassID;
17824 case AMDGPU::AReg_192RegClassID:
17825 return AMDGPU::AReg_192_Align2RegClassID;
17826 case AMDGPU::AReg_256RegClassID:
17827 return AMDGPU::AReg_256_Align2RegClassID;
17828 case AMDGPU::AReg_512RegClassID:
17829 return AMDGPU::AReg_512_Align2RegClassID;
17830 case AMDGPU::AReg_1024RegClassID:
17831 return AMDGPU::AReg_1024_Align2RegClassID;
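// Function-level finalization: pick an SGPR for EXEC copies in entry functions,
// replace the SP/FP/private-resource placeholder registers with the registers
// chosen for this function, fix implicit operands for wave32, and, when the
// subtarget needs aligned VGPRs, upgrade virtual registers to the aligned
// register classes selected above.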
17847 if (Info->isEntryFunction()) {
17854 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17856 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17857 : TRI->getAlignedHighSGPRForRC(MF, 2,
17858 &AMDGPU::SGPR_64RegClass);
17859 Info->setSGPRForEXECCopy(SReg);
17861 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17862 Info->getStackPtrOffsetReg()));
17863 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17864 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17868 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17869 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17871 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17872 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17874 Info->limitOccupancy(MF);
17876 if (ST.isWave32() && !MF.empty()) {
17877 for (auto &MBB : MF) {
17878 for (auto &MI : MBB) {
17879 TII->fixImplicitOperands(MI);
17889 if (ST.needsAlignedVGPRs()) {
17890 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17896 if (NewClassID != -1)
17897 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17906 const APInt &DemandedElts,
17908 unsigned Depth) const {
17910 unsigned Opc = Op.getOpcode();
17913 unsigned IID = Op.getConstantOperandVal(0);
17915 case Intrinsic::amdgcn_mbcnt_lo:
17916 case Intrinsic::amdgcn_mbcnt_hi: {
17922 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17932 Op, Known, DemandedElts, DAG, Depth);
17948 unsigned MaxValue =
17955 unsigned BFEWidth, bool SExt, unsigned Depth) {
17959 unsigned Src1Cst = 0;
17960 if (Src1.isImm()) {
17961 Src1Cst = Src1.getImm();
17962 } else if (Src1.isReg()) {
17966 Src1Cst = Cst->Value.getZExtValue();
17977 if (Width >= BFEWidth)
17986 Known = Known.sext(BFEWidth);
17988 Known = Known.zext(BFEWidth);
17994 unsigned Depth) const {
17997 switch (MI->getOpcode()) {
17998 case AMDGPU::S_BFE_I32:
18001 case AMDGPU::S_BFE_U32:
18004 case AMDGPU::S_BFE_I64:
18007 case AMDGPU::S_BFE_U64:
18010 case AMDGPU::G_INTRINSIC:
18011 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18014 case Intrinsic::amdgcn_workitem_id_x:
18017 case Intrinsic::amdgcn_workitem_id_y:
18020 case Intrinsic::amdgcn_workitem_id_z:
18023 case Intrinsic::amdgcn_mbcnt_lo:
18024 case Intrinsic::amdgcn_mbcnt_hi: {
18036 case Intrinsic::amdgcn_groupstaticsize: {
18047 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18050 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18053 case AMDGPU::G_AMDGPU_SMED3:
18054 case AMDGPU::G_AMDGPU_UMED3: {
18055 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18082 unsigned Depth) const {
18089 AttributeList Attrs =
18091 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18118 if (Header->getAlignment() != PrefAlign)
18119 return Header->getAlignment();
18121 unsigned LoopSize = 0;
18126 LoopSize += MBB->getAlignment().value() / 2;
18129 LoopSize += TII->getInstSizeInBytes(MI);
18130 if (LoopSize > 192)
18135 if (LoopSize <= 64)
18138 if (LoopSize <= 128)
18139 return CacheLineAlign;
18145 auto I = Exit->getFirstNonDebugInstr();
18146 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18147 return CacheLineAlign;
18156 if (PreTerm == Pre->begin() ||
18157 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18161 auto ExitHead = Exit->getFirstNonDebugInstr();
18162 if (ExitHead == Exit->end() ||
18163 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18168 return CacheLineAlign;
18176 N = N->getOperand(0).getNode();
18177 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18186 switch (N->getOpcode()) {
18194 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18195 return !TRI->isSGPRReg(MRI, Reg);
18201 return !TRI->isSGPRReg(MRI, Reg);
18205 unsigned AS = L->getAddressSpace();
18209 case ISD::CALLSEQ_END:
18238 return A->readMem() && A->writeMem();
18259 switch (Ty.getScalarSizeInBits()) {
18271 const APInt &DemandedElts,
18274 unsigned Depth) const {
18279 if (Info->getMode().DX10Clamp)
18291 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18311 << "Hardware instruction generated for atomic "
18313 << " operation at memory scope " << MemScope;
18318 Type *EltTy = VT->getElementType();
18319 return VT->getNumElements() == 2 &&
18339 unsigned BW = IT->getBitWidth();
18340 return BW == 32 || BW == 64;
18354 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18355 return BW == 32 || BW == 64;
18358 if (Ty->isFloatTy() || Ty->isDoubleTy())
18362 return VT->getNumElements() == 2 &&
18363 VT->getElementType()->getPrimitiveSizeInBits() == 16;
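// The checks below decide whether a floating-point atomicrmw can remain a native
// hardware atomic: they consider the memory scope, the
// "amdgpu.no.fine.grained.memory" metadata, and per-subtarget features such as
// hasAtomicFaddInsts() and the packed f16/bf16 atomic-add instructions.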
18373 bool HasSystemScope) {
18380 if (HasSystemScope) {
18389 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18402 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18428 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18441 bool HasSystemScope =
18467 if (Subtarget->hasEmulatedSystemScopeAtomics())
18483 if (!HasSystemScope &&
18484 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18496 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18504 ConstVal && ConstVal->isNullValue())
18542 if (Ty->isFloatTy()) {
18547 if (Ty->isDoubleTy()) {
18568 if (Ty->isFloatTy() &&
18569 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18582 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18586 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18590 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18595 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18600 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18604 if (Ty->isFloatTy()) {
18607 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18610 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18615 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18623 if (Subtarget->hasFlatAtomicFaddF32Inst())
18632 if (Subtarget->hasLDSFPAtomicAddF32()) {
18633 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18635 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18663 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18665 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18669 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18671 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18724 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18725 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18726 : &AMDGPU::SReg_32RegClass;
18727 if (!TRI->isSGPRClass(RC) && !isDivergent)
18728 return TRI->getEquivalentSGPRClass(RC);
18729 if (TRI->isSGPRClass(RC) && isDivergent)
18730 return TRI->getEquivalentVGPRClass(RC);
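// hasCFUser walks the users of a value to see whether it feeds the wave-mask
// control-flow intrinsics (amdgcn.if, amdgcn.else, amdgcn.if.break, amdgcn.loop,
// amdgcn.end.cf); only values whose integer width matches the wavefront size are
// considered.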
18742 unsigned WaveSize) {
18747 if (!IT || IT->getBitWidth() != WaveSize)
18752 if (!Visited.insert(V).second)
18754 bool Result = false;
18755 for (const auto *U : V->users()) {
18757 if (V == U->getOperand(1)) {
18762 case Intrinsic::amdgcn_if_break:
18763 case Intrinsic::amdgcn_if:
18764 case Intrinsic::amdgcn_else:
18769 if (V == U->getOperand(0)) {
18774 case Intrinsic::amdgcn_end_cf:
18775 case Intrinsic::amdgcn_loop:
18781 Result = hasCFUser(U, Visited, WaveSize);
18790 const Value *V) const {
18792 if (CI->isInlineAsm()) {
18801 for (auto &TC : TargetConstraints) {
18815 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18843 return MRI.hasOneNonDBGUse(N0);
18850 if (I.getMetadata("amdgpu.noclobber"))
18852 if (I.getMetadata("amdgpu.last.use"))
18916 Alignment = RMW->getAlign();
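// Expansion of a flat atomicrmw that may touch LDS or scratch: branch on
// amdgcn.is.shared / amdgcn.is.private, perform the operation through the
// matching address-space cast in each block (a plain load/op/store for the
// private case), and merge the results in a PHI block when the return value
// is used.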
18929 bool FullFlatEmulation =
18931 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18932 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
18933 RMW->getType()->isDoubleTy()));
18936 bool ReturnValueIsUsed = !AI->use_empty();
18945 if (FullFlatEmulation) {
18956 std::prev(BB->end())->eraseFromParent();
18957 Builder.SetInsertPoint(BB);
18959 Value *LoadedShared = nullptr;
18960 if (FullFlatEmulation) {
18961 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
18962 {Addr}, nullptr, "is.shared");
18963 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
18964 Builder.SetInsertPoint(SharedBB);
18965 Value *CastToLocal = Builder.CreateAddrSpaceCast(
18971 LoadedShared = Clone;
18973 Builder.CreateBr(PhiBB);
18974 Builder.SetInsertPoint(CheckPrivateBB);
18977 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
18978 {Addr}, nullptr, "is.private");
18979 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
18981 Builder.SetInsertPoint(PrivateBB);
18983 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
18986 Value *LoadedPrivate;
18988 LoadedPrivate = Builder.CreateAlignedLoad(
18989 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
18992 LoadedPrivate, RMW->getValOperand());
18994 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
18996 auto [ResultLoad, Equal] =
19002 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19005 Builder.CreateBr(PhiBB);
19007 Builder.SetInsertPoint(GlobalBB);
19011 if (FullFlatEmulation) {
19012 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19021 if (!FullFlatEmulation) {
19026 MDNode *RangeNotPrivate =
19029 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19033 Builder.CreateBr(PhiBB);
19035 Builder.SetInsertPoint(PhiBB);
19037 if (ReturnValueIsUsed) {
19040 if (FullFlatEmulation)
19047 Builder.CreateBr(ExitBB);
19051 unsigned PtrOpIdx) {
19052 Value *PtrOp = I->getOperand(PtrOpIdx);
19059 I->setOperand(PtrOpIdx, ASCast);
19071 ConstVal && ConstVal->isNullValue()) {
19101 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19109 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19124 LoadInst *LI = Builder.CreateAlignedLoad(
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
iv Induction Variable Users
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned AndSaveExecOpc
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
const Function * getParent() const
Return the enclosing method, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
A parsed version of the target data layout string in and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
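A hedged sketch of the MachineFunction accessors above as they typically appear in argument lowering (the helper is illustrative; PhysReg/RC are whatever the calling convention assigned):
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
// Sketch: expose an incoming physical register as a virtual register.
static llvm::Register importLiveIn(llvm::MachineFunction &MF,
                                   llvm::MCRegister PhysReg,
                                   const llvm::TargetRegisterClass *RC) {
  const llvm::DataLayout &DL = MF.getDataLayout();
  (void)DL; // e.g. consulted for pointer sizes when building frame objects
  // Records PhysReg as a function live-in and returns a matching vreg.
  return MF.addLiveIn(PhysReg, RC);
}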
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
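These builder methods are normally chained off BuildMI; a hedged sketch (the opcode and operand shape are placeholders, not a real SI instruction):
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
// Sketch: emit "DstReg = <Opcode> SrcReg, 0" before iterator I.
static void emitCopyLike(llvm::MachineBasicBlock &MBB,
                         llvm::MachineBasicBlock::iterator I,
                         const llvm::DebugLoc &DL,
                         const llvm::TargetInstrInfo &TII, unsigned Opcode,
                         llvm::Register DstReg, llvm::Register SrcReg) {
  llvm::BuildMI(MBB, I, DL, TII.get(Opcode), DstReg)
      .addReg(SrcReg)
      .addImm(0); // e.g. an offset or modifier operand, opcode-dependent
}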
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
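A hedged sketch of allocating a MachineMemOperand with the flags above (the LLT header path has moved between LLVM versions, so treat that include as approximate):
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h" // LLT; older trees keep this elsewhere
// Sketch: an MMO describing a dereferenceable, invariant 32-bit load.
static llvm::MachineMemOperand *
makeInvariantLoadMMO(llvm::MachineFunction &MF,
                     llvm::MachinePointerInfo PtrInfo) {
  auto Flags = llvm::MachineMemOperand::MOLoad |
               llvm::MachineMemOperand::MODereferenceable |
               llvm::MachineMemOperand::MOInvariant;
  return MF.getMachineMemOperand(PtrInfo, Flags, llvm::LLT::scalar(32),
                                 llvm::Align(4));
}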
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
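A hedged sketch combining the MachineOperand and MachineRegisterInfo calls above (the helper name is illustrative):
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
// Sketch: retarget every register operand of MI from From to To.
static void replaceRegInInstr(llvm::MachineInstr &MI,
                              llvm::MachineRegisterInfo &MRI,
                              llvm::Register From, llvm::Register To) {
  for (llvm::MachineOperand &MO : MI.operands())
    if (MO.isReg() && MO.getReg() == From)
      MO.setReg(To);
  // Any stale kill flags on To would now be unsound.
  MRI.clearKillFlags(To);
}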
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
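A hedged sketch of the SDValue/SDNode accessors above in the shape of a typical DAG-combine predicate (the pattern itself is invented):
#include <cstdint>
#include "llvm/CodeGen/SelectionDAGNodes.h"
// Sketch: match a single-use (add x, C) and report the constant.
static bool isSingleUseAddOfConstant(llvm::SDValue V, uint64_t &ImmOut) {
  if (V.getOpcode() != llvm::ISD::ADD || !V.hasOneUse())
    return false;
  if (auto *C = llvm::dyn_cast<llvm::ConstantSDNode>(V.getOperand(1))) {
    ImmOut = C->getZExtValue();
    return true;
  }
  return false;
}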
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if N can be combined with another node to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0-terminated array of rounding control registers that can be attached to strict FP calls.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store using a target-specific method.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load using a target-specific method.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific method.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
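A hedged sketch stringing a few of the SelectionDAG builders above together; for brevity it hard-codes MVT::i1 as the setcc result type, where real lowering code would ask the TargetLowering:
#include "llvm/CodeGen/SelectionDAG.h"
// Sketch: integer max(x, 0) built from setcc + select.
static llvm::SDValue clampToZero(llvm::SelectionDAG &DAG, const llvm::SDLoc &DL,
                                 llvm::SDValue X, llvm::EVT VT) {
  llvm::SDValue Zero = DAG.getConstant(0, DL, VT);
  llvm::SDValue IsNeg =
      DAG.getSetCC(DL, llvm::MVT::i1, X, Zero, llvm::ISD::SETLT);
  return DAG.getSelect(DL, VT, IsNeg, Zero, X);
}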
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
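A hedged sketch of the idiom these two containers are built for: small, stack-backed storage plus cheap deduplication.
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Value.h"
// Sketch: copy In to Out, dropping duplicates.
static void collectUnique(const llvm::SmallVectorImpl<llvm::Value *> &In,
                          llvm::SmallVectorImpl<llvm::Value *> &Out) {
  llvm::SmallPtrSet<llvm::Value *, 8> Seen;
  for (llvm::Value *V : In)
    if (Seen.insert(V).second) // second is false when V was already present
      Out.push_back(V);
}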
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
constexpr bool empty() const
empty - Check if the string is empty.
constexpr size_t size() const
size - Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
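A hedged sketch of StringSwitch as a literal-cased lookup; the names and return codes here are invented (e.g. for classifying an inline-asm constraint string):
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
// Sketch: map a constraint-like name to a small code, 0 if unknown.
static unsigned classifyName(llvm::StringRef Name) {
  return llvm::StringSwitch<unsigned>(Name)
      .Case("sgpr", 1)
      .Case("vgpr", 2)
      .Default(0);
}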
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparisons with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isVoidTy() const
Return true if this is 'void'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
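A hedged sketch of the usual replace-all-uses flow with the Value accessors above:
#include "llvm/IR/Value.h"
// Sketch: redirect Old's uses to New when the types line up.
static void replaceValue(llvm::Value *Old, llvm::Value *New) {
  if (Old == New || Old->getType() != New->getType())
    return;
  New->takeName(Old);           // keep the old name for readability
  Old->replaceAllUsesWith(New); // every use of Old now points at New
}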
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
constexpr bool isZero() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ TC_RETURN_GFX_WholeWave
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ SSUBO
Same for subtraction.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ SMULO
Same for multiplication.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector result.
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic function with no side effects.
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero or sign extended from a narrower type.
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W); results that would overflow are clamped to the maximum representable value.
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target intrinsic function with side effects that returns a result.
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified, possibly variable, elements.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer operands.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out, when considering SETFALSE (something that never exists dynamically) as 0.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
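Taken together, the ISD opcodes above are the vocabulary a lowering hook uses when it rebuilds an operation as SelectionDAG nodes. The following is a minimal sketch only, assuming DAG, DL and the two operands come from the surrounding lowering code; buildSMinSketch is an illustrative name, not an LLVM API.

// Sketch: build (select (setcc a, b, setlt), a, b), i.e. a signed integer
// minimum, out of the generic ISD nodes documented above.
#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

static SDValue buildSMinSketch(SelectionDAG &DAG, const SDLoc &DL,
                               SDValue A, SDValue B) {
  EVT VT = A.getValueType();
  // SETCC compares the two operands and yields a boolean value.
  SDValue Cond = DAG.getSetCC(DL, MVT::i1, A, B, ISD::SETLT);
  // SELECT picks A when the condition is true, otherwise B.
  return DAG.getNode(ISD::SELECT, DL, VT, Cond, A, B);
}

Hard-coding MVT::i1 for the compare result is a simplification; real lowering code would ask TargetLowering::getSetCCResultType for the proper type.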
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
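As a small illustration of the Intrinsic:: helpers listed above, the sketch below checks whether a module already carries a declaration of a given intrinsic whose type matches the intrinsic tables. hasMatchingDecl is a hypothetical helper; only the two Intrinsic:: calls are real API, with the signatures shown above.

#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Returns true if the module already declares the intrinsic and the
// declaration's type matches what the intrinsic tables expect.
static bool hasMatchingDecl(const Module &M, Intrinsic::ID ID) {
  Function *F = Intrinsic::getDeclarationIfExists(&M, ID);
  if (!F)
    return false;
  FunctionType *Expected = Intrinsic::getType(M.getContext(), ID);
  return F->getFunctionType() == Expected;
}

For an overloaded intrinsic the Tys argument of Intrinsic::getType would also need to be supplied; a non-overloaded ID such as Intrinsic::amdgcn_workitem_id_x works with the default.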
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
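The m_* entries above are combinators from the IR-level PatternMatch library (mi_match and the MIPatternMatch variants play the same role on MachineIR). A minimal IR-level sketch follows; matchShlPlusOne is a made-up helper name.

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Recognizes (X << 1) + 1 and hands back X; purely illustrative.
static bool matchShlPlusOne(Value *V, Value *&X) {
  // m_Shl/m_Add match the corresponding binary operators, m_One matches a
  // constant 1 (or a splat of 1), and m_Value captures the shifted operand.
  return match(V, m_Add(m_Shl(m_Value(X), m_One()), m_One()));
}

Note that m_Add as written matches the operands in the given order; the commutative m_c_Add variant would also accept the swapped form.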
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
@ System
Synchronized with respect to all concurrently executing threads.
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
NodeAddr< NodeBase * > Node
friend class Instruction
Iterator for Instructions in a BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition code.
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions starting from FirstMI to LastMI (exclusive).
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit version).
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the start of kernel execution to the load.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant bit, stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point condition code.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtual registers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT, returns its APInt value and def register.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
int popcount(T Value) noexcept
Count the number of set bits in a value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
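Several of the free functions above (isPowerOf2_32, Log2_32, countr_zero, maskTrailingOnes, alignTo, and friends) are the small bit-manipulation helpers from Support/MathExtras.h and Support/Alignment.h. A quick sketch with arbitrarily chosen values, just to show how they compose:

#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

using namespace llvm;

static void bitHelperSketch() {
  assert(isPowerOf2_32(64) && "64 is a power of two > 0");
  assert(Log2_32(64) == 6 && "floor log base 2");
  assert(countr_zero(0x50u) == 4 && "four trailing zero bits in 0b1010000");
  assert(maskTrailingOnes<uint32_t>(4) == 0xFu && "low four bits set");
  assert(alignTo(13, Align(8)) == 16 && "round 13 up to an 8-byte multiple");
}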
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type, which is chosen by the caller.
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type, which is chosen by the caller.
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
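The EVT queries above are how lowering code reasons about value types before they are pinned down to a simple MVT. A short sketch follows; the element type and count are arbitrary.

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

using namespace llvm;

// Builds a v4f32, then derives the equivalently sized integer vector type.
static void evtSketch(LLVMContext &Ctx) {
  EVT VecVT = EVT::getVectorVT(Ctx, MVT::f32, 4); // v4f32
  assert(VecVT.isVector() && VecVT.isFloatingPoint());
  assert(VecVT.getVectorNumElements() == 4);
  assert(VecVT.getSizeInBits() == 128);

  // Same shape, integer elements: v4i32.
  EVT IntVT = VecVT.changeTypeToInteger();
  assert(IntVT.getScalarType() == MVT::i32);
  assert(VecVT.bitsEq(IntVT));
}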
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing formal return value.
bool isUnknown() const
Returns true if we don't know any bits.
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
void resetAll()
Resets the known state of all bits.
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
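The KnownBits entries above are the interface a target uses when it reports known-zero/known-one information for its custom nodes. A toy sketch with arbitrarily chosen bit patterns:

#include "llvm/Support/KnownBits.h"
#include <cassert>

using namespace llvm;

static void knownBitsSketch() {
  // A 16-bit value whose top eight bits are known to be clear.
  KnownBits LHS(16);
  LHS.Zero.setHighBits(8);

  // A 16-bit value known to be exactly 1.
  KnownBits RHS = KnownBits::makeConstant(APInt(16, 1));

  // The sum is at most 256, so bits 9..15 stay known zero and at least
  // seven leading zeros survive.
  KnownBits Sum = KnownBits::add(LHS, RHS);
  assert(Sum.countMinLeadingZeros() >= 7);

  // Zero-extending to 32 bits adds sixteen more known-zero high bits.
  KnownBits Wide = Sum.zext(32);
  assert(Wide.countMinLeadingZeros() >= 23);
}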
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
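MachinePointerInfo is what ties a DAG memory node back to an IR pointer or to a frame index. The sketch below stores a value to a fixed stack slot; storeToFixedStack is an illustrative wrapper, and Chain, Val and FIPtr are assumed to come from the surrounding lowering code.

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

// Stores Val to frame index FI; FIPtr is the SDValue for that frame index
// (e.g. obtained from DAG.getFrameIndex). Illustrative only.
static SDValue storeToFixedStack(SelectionDAG &DAG, const SDLoc &DL,
                                 SDValue Chain, SDValue Val, SDValue FIPtr,
                                 int FI) {
  MachineFunction &MF = DAG.getMachineFunction();
  // Record that this store writes stack object FI at offset 0, so alias
  // analysis and the scheduler can reason about it precisely.
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  return DAG.getStore(Chain, DL, Val, FIPtr, PtrInfo);
}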
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale; if BaseGV is null, there is no BaseGV.
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const