#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"
    cl::desc("Do not align and prefetch loops"),

    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),

    cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
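// Returns the first free 32-bit SGPR as AMDGPU::SGPR0 + Reg; the loop is
// assumed to skip registers the calling-convention state has already
// allocated before returning (the guarding check is elided here).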
  if (Subtarget->has16BitInsts()) {
    if (Subtarget->useRealTrue16Insts()) {

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},

                      ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
                      ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
                      ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
                      ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
                      ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,

                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},

                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},

                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},

                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);

               {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
                MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
                MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
                MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
                MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
                MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
                MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
                MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {

                     {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
                      MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},

  if (Subtarget->hasPkMovB32()) {
                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},

                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

  if (Subtarget->hasSMemRealTime() ||
  if (Subtarget->has16BitInsts()) {
  if (Subtarget->hasMadMacF32Insts())
  if (!Subtarget->hasBFI())
  if (!Subtarget->hasBCNT(32))
  if (!Subtarget->hasBCNT(64))
  if (Subtarget->hasFFBH())
  if (Subtarget->hasFFBL())
  if (Subtarget->hasBFE())
  if (Subtarget->hasIntClamp())
  if (Subtarget->hasAddNoCarry())

                     {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
                     {MVT::f32, MVT::f64}, Custom);

                     {MVT::f32, MVT::f64}, Legal);

  if (Subtarget->haveRoundOpsF64())
  if (Subtarget->has16BitInsts()) {

                      ISD::FSIN, ISD::FROUND},

  if (Subtarget->hasBF16TransInsts())
               {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
                MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
                MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {

                     {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
                      MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
                      MVT::v32f16, MVT::v32bf16},

                     {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},

                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

               {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
                MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

  if (Subtarget->hasVOP3PInsts()) {

                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

                     {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
                      MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
                      MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})

    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})

                     {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
                     {MVT::v2f16, MVT::v4f16}, Custom);
  if (Subtarget->hasPackedFP32Ops()) {

                     {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},

  if (Subtarget->has16BitInsts()) {

                     {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                      MVT::v32f16, MVT::v32bf16},

  if (Subtarget->hasVectorMulU64())
  else if (Subtarget->hasScalarSMulU64())
  if (Subtarget->hasMad64_32())
  if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())

  if (Subtarget->hasIEEEMinimumMaximumInsts()) {
                     {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

  if (Subtarget->hasMinimum3Maximum3F32())
  if (Subtarget->hasMinimum3Maximum3PKF16()) {
    if (!Subtarget->hasMinimum3Maximum3F16())

  if (Subtarget->hasVOP3PInsts()) {
                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

  if (Subtarget->hasIntMinMax64())

                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,

                     {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
                      MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
                      MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
                      MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
                      MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
  if (Subtarget->hasBF16ConversionInsts()) {
  if (Subtarget->hasBF16PackedInsts()) {
  if (Subtarget->hasBF16TransInsts()) {
  if (Subtarget->hasCvtPkF16F32Inst()) {
                     {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
  if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())

                       ISD::ATOMIC_CMP_SWAP,
                       ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
                       ISD::ATOMIC_LOAD_ADD,
                       ISD::ATOMIC_LOAD_SUB,
                       ISD::ATOMIC_LOAD_AND,
                       ISD::ATOMIC_LOAD_OR,
                       ISD::ATOMIC_LOAD_XOR,
                       ISD::ATOMIC_LOAD_NAND,
                       ISD::ATOMIC_LOAD_MIN,
                       ISD::ATOMIC_LOAD_MAX,
                       ISD::ATOMIC_LOAD_UMIN,
                       ISD::ATOMIC_LOAD_UMAX,
                       ISD::ATOMIC_LOAD_FADD,
                       ISD::ATOMIC_LOAD_FMIN,
                       ISD::ATOMIC_LOAD_FMAX,
                       ISD::ATOMIC_LOAD_UINC_WRAP,
                       ISD::ATOMIC_LOAD_UDEC_WRAP,
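// The ISD::ATOMIC_* opcodes listed above are presumably registered as target
// DAG-combine candidates (setTargetDAGCombine), so atomic RMW and cmpxchg
// nodes can be rewritten during combining.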
static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
                                         EVT DestVT, EVT SrcVT) const {
         ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
            (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
          (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&

                                         LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
         SrcTy.getScalarSizeInBits() == 16 &&
  if (Subtarget->has16BitInsts()) {
      return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
    return VT.isInteger() ? MVT::i32 : MVT::f32;

    return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;

    if (Size == 16 && Subtarget->has16BitInsts())
      return (NumElts + 1) / 2;

    return NumElts * ((Size + 31) / 32);
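// Worked example of the two formulas above (illustrative): with 16-bit
// instructions, two 16-bit elements pack into one 32-bit register, so a
// 3-element vector needs (3 + 1) / 2 = 2 registers; otherwise each element
// takes one register per 32-bit chunk, e.g. NumElts = 3 with Size = 64 gives
// 3 * ((64 + 31) / 32) = 6 registers.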
    unsigned &NumIntermediates, MVT &RegisterVT) const {
    if (Size == 16 && Subtarget->has16BitInsts()) {
      if (ScalarVT == MVT::bf16) {
        RegisterVT = MVT::i32;
        IntermediateVT = MVT::v2bf16;
        RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
        IntermediateVT = RegisterVT;
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;

      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
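// Sketch of the breakdown above (assumed intent): 16-bit element types are
// grouped into v2i16/v2f16/v2bf16 intermediates carried in 32-bit registers,
// with NumIntermediates = (NumElts + 1) / 2, while wider element types fall
// back to one i32 register per 32-bit chunk, NumElts * ((Size + 31) / 32).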
                                          unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);
  unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

                                          unsigned MaxNumLanes) {
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

    return MVT::amdgpuBufferFatPointer;
      DL.getPointerSizeInBits(AS) == 192)
    return MVT::amdgpuBufferStridedPointer;
       DL.getPointerSizeInBits(AS) == 160) ||
       DL.getPointerSizeInBits(AS) == 192))
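// The pointer-size checks above map 160-bit buffer fat pointers to
// MVT::amdgpuBufferFatPointer and 192-bit buffer strided pointers to
// MVT::amdgpuBufferStridedPointer; the corresponding address-space
// enumerators are checked alongside DL.getPointerSizeInBits(AS).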
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
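// The cases above appear to be grouped by access width: the *_b8 variants
// touch 1 byte, *_b32 and the 32x4B cooperative atomics 4 bytes, *_b64 and
// 16x8B 8 bytes, and *_b128 and 8x16B 16 bytes.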
                                          unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

  if (RsrcIntr->IsImage) {
    Info.ptrVal = RsrcArg;

  bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;

    if (RsrcIntr->IsImage) {
      unsigned MaxNumLanes = 4;
                                 std::numeric_limits<unsigned>::max());

    if (RsrcIntr->IsImage) {

    if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
      Info.memVT = MVT::i32;
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {

  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
                                 std::numeric_limits<unsigned>::max());

  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {

  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;

  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {

  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
    Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
    Info.memVT = MVT::i64;

  case Intrinsic::amdgcn_global_atomic_csub: {

  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
        MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
                       ->getElementType(0));

  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_atomic_cond_sub_u32: {
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64: {

  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {

  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {

  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.memVT = MVT::i32;
    Info.align = Align(4);
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {

  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128: {

  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds: {

  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
    Info.memVT = MVT::i32;
    Info.align = Align(4);

  case Intrinsic::amdgcn_s_prefetch_data:
  case Intrinsic::amdgcn_flat_prefetch:
  case Intrinsic::amdgcn_global_prefetch: {

  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
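// getTgtMemIntrinsic: each case above fills the IntrinsicInfo record that
// lets SelectionDAG attach a MachineMemOperand to the intrinsic.  A typical
// entry (illustrative shape, mirroring the fields visible above) is:
//   Info.opc    = ISD::INTRINSIC_W_CHAIN;
//   Info.memVT  = MVT::i32;
//   Info.ptrVal = CI.getArgOperand(0);
//   Info.align  = Align(4);
//   Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
//   return true;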
                                            Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_global_atomic_csub:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
    Ptr = II->getArgOperand(0);
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds:
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
    Ptr = II->getArgOperand(1);
  AccessTy = II->getType();
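// getAddrModeArguments: the first group of intrinsics above exposes its
// pointer as argument operand 0, the second group as argument operand 1;
// AccessTy is taken from the intrinsic call's result type.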
                                             unsigned AddrSpace) const {
  if (!Subtarget->hasFlatInstOffsets()) {

  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
                                  AM.BaseOffs, AddrSpace, FlatVariant));

  if (Subtarget->hasFlatGlobalInsts())

  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {

  return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
  if (AM.HasBaseReg) {

    return isLegalMUBUFAddressingMode(AM);

  if (!Subtarget->hasScalarSubwordLoads()) {
    if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)

    return Subtarget->enableFlatScratch()
               : isLegalMUBUFAddressingMode(AM);
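// Addressing-mode legality (assumed reading of the fragments above): with
// flat-instruction offsets the mode is legal when there is no scale and the
// base offset is either zero or accepted by isLegalFLATOffset(); global and
// scratch accesses otherwise fall back to the MUBUF rules in
// isLegalMUBUFAddressingMode(), which require a MUBUF-legal immediate offset.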
    unsigned Size, unsigned AddrSpace, Align Alignment,

    if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))

    Align RequiredAlignment(
    if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
        Alignment < RequiredAlignment)

      if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))

      RequiredAlignment = Align(4);
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 64
                    : (Alignment < Align(4))         ? 32

      if (!Subtarget->hasDS96AndDS128())
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 96
                    : (Alignment < Align(4))         ? 32

      if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())

      RequiredAlignment = Align(8);
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 128
                    : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||
           Subtarget->hasUnalignedDSAccessEnabled();

    bool AlignedBy4 = Alignment >= Align(4);
    if (Subtarget->hasUnalignedScratchAccessEnabled()) {
        *IsFast = AlignedBy4 ? Size : 1;

      *IsFast = AlignedBy4;

    return Alignment >= Align(4) ||
           Subtarget->hasUnalignedBufferAccessEnabled();

  if (!Subtarget->hasRelaxedBufferOOBMode() &&

  return Size >= 32 && Alignment >= Align(4);

    unsigned *IsFast) const {
                                            Alignment, Flags, IsFast);
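// DS alignment policy visible above: 64-bit accesses want Align(8) (relaxed
// to Align(4) in some cases), 96/128-bit accesses require hasDS96AndDS128(),
// and when unaligned DS access is enabled *IsFast encodes the access size
// (64/96/128) for sufficiently aligned accesses and 32 for accesses below
// Align(4).  Scratch and buffer accesses are gated on Align(4) unless the
// corresponding unaligned-access feature is enabled.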
    const AttributeList &FuncAttributes) const {
  if (Op.size() >= 16 &&

  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

                                           unsigned DestAS) const {
      Subtarget->hasGloballyAddressableScratch()) {

                                               unsigned Index) const {

  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
  auto [InputPtrReg, RC, ArgTy] =
      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                        const SDLoc &SL) const {

                                      const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(
    int64_t OffsetDiff = Offset - AlignDownOffset;
    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);

                                          const SDLoc &SL) const {
    return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);

      ExtType, SL, VA.getLocVT(), Chain, FIN,

  SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
  if (ConvertedVal == ArgValue)
    return ConvertedVal;
SDValue SITargetLowering::lowerWorkGroupId(
  if (!Subtarget->hasClusters())
    return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);

  SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
  SDLoc SL(ClusterIdXYZ);
  SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
  SDValue ClusterWorkGroupIdXYZ =
      getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
    return ClusterIdXYZ;

  using namespace AMDGPU::Hwreg;
      DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
SDValue SITargetLowering::getPreloadedValue(
  const ArgDescriptor *Reg = nullptr;
  const TargetRegisterClass *RC;

  const ArgDescriptor WorkGroupIDX =
  const ArgDescriptor WorkGroupIDZ =
  const ArgDescriptor ClusterWorkGroupIDX =
  const ArgDescriptor ClusterWorkGroupIDY =
  const ArgDescriptor ClusterWorkGroupIDZ =
  const ArgDescriptor ClusterWorkGroupMaxIDX =
  const ArgDescriptor ClusterWorkGroupMaxIDY =
  const ArgDescriptor ClusterWorkGroupMaxIDZ =
  const ArgDescriptor ClusterWorkGroupMaxFlatID =

  auto LoadConstant = [&](unsigned N) {

  if (Subtarget->hasArchitectedSGPRs() &&
      Reg = &WorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &WorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &WorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;

      if (HasFixedDims && ClusterDims.getDims()[0] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;
      if (HasFixedDims && ClusterDims.getDims()[1] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;
      if (HasFixedDims && ClusterDims.getDims()[2] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;

        return LoadConstant(ClusterDims.getDims()[0] - 1);
      Reg = &ClusterWorkGroupMaxIDX;
      RC = &AMDGPU::SReg_32RegClass;
        return LoadConstant(ClusterDims.getDims()[1] - 1);
      Reg = &ClusterWorkGroupMaxIDY;
      RC = &AMDGPU::SReg_32RegClass;
        return LoadConstant(ClusterDims.getDims()[2] - 1);
      Reg = &ClusterWorkGroupMaxIDZ;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &ClusterWorkGroupMaxFlatID;
      RC = &AMDGPU::SReg_32RegClass;
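// getPreloadedValue: when the cluster dimensions are statically known
// (HasFixedDims), a dimension of size 1 folds the cluster workgroup ID to
// LoadConstant(0) and the per-dimension maximum ID to
// LoadConstant(ClusterDims.getDims()[i] - 1); otherwise the value is read
// from the preloaded SGPR described by the selected ArgDescriptor/RC pair.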
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
           "vector type argument should have been split");

    bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
             "unexpected vector split in ps argument type");

      Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);

  if (Info.hasWorkItemIDX()) {
        (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;

  if (Info.hasWorkItemIDY()) {
    assert(Info.hasWorkItemIDX());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDY(
      unsigned Reg = AMDGPU::VGPR1;

  if (Info.hasWorkItemIDZ()) {
    assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDZ(
      unsigned Reg = AMDGPU::VGPR2;
  if (RegIdx == ArgVGPRs.size()) {

  unsigned Reg = ArgVGPRs[RegIdx];

                               unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())

  unsigned Reg = ArgSGPRs[RegIdx];

  const unsigned Mask = 0x3ff;

  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);

  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);

  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;

  auto &ArgInfo = Info.getArgInfo();

  if (Info.hasImplicitArgPtr())
  if (Info.hasWorkGroupIDX())
  if (Info.hasWorkGroupIDY())
  if (Info.hasWorkGroupIDZ())
  if (Info.hasLDSKernelId())

    Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);

    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);

    Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
  unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();

  bool InPreloadSequence = true;
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    unsigned ArgIdx = Arg.getArgNo();
    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
                      Subtarget->getAlignmentForImplicitArgPtr()) -
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        ArgOffset += ImplicitArgOffset;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;

  if (Info.hasLDSKernelId()) {
    Register Reg = Info.addLDSKernelId();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
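// Kernarg preloading (as seen above): arguments marked "inreg" are assigned
// user SGPRs in order; gaps between explicit arguments become PaddingSGPRs =
// alignTo(Padding, 4) / 4, sub-dword arguments reuse the previous argument's
// SGPR, and the first argument that can no longer be preloaded ends the
// sequence (InPreloadSequence = false).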
                                          bool IsShader) const {
  bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
  if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      Register Reg = Info.addReservedUserSGPR();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      Register Reg = Info.addWorkGroupIDX();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDY()) {
      Register Reg = Info.addWorkGroupIDY();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDZ()) {
      Register Reg = Info.addWorkGroupIDZ();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {
    Register Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

  assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
         Info.getNumPreloadedSGPRs() >= 16);
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);

      if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

    if (ST.getFrameLowering()->hasFP(MF)) {
      Info.setFrameOffsetReg(AMDGPU::SGPR33);
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);

    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)
  bool IsError = false;
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));

           !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
           !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());

    if (!Subtarget->enableFlatScratch())

      !Subtarget->hasArchitectedSGPRs())
    assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
           !Info->hasWorkGroupIDZ());

  bool IsWholeWaveFunc = Info->isWholeWaveFunction();

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    if (Subtarget->isAmdPalOS()) {
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
  } else if (IsKernel) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
    Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),

    if (IsKernel && Subtarget->hasKernargPreload())
  } else if (!IsGraphics) {
    if (!Subtarget->enableFlatScratch())

    Info->setNumWaveDispatchSGPRs(
    Info->setNumWaveDispatchVGPRs(
  } else if (Info->getNumKernargPreloadedSGPRs()) {
    Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());

  if (IsWholeWaveFunc) {
                               {MVT::i1, MVT::Other}, Chain);
  for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;

    if (IsEntryFunc && VA.isMemLoc()) {

      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
          int64_t OffsetDiff = Offset - AlignDownOffset;
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
          ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;

          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
                                    TRI->getRegSizeInBits(*RC)));

            for (auto Reg : PreloadRegs) {
                                     PreloadRegs.size()),

          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

              "hidden argument in kernel signature was not preloaded",

            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);

    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

    if (AMDGPU::VGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::VGPR_32RegClass;
    else if (AMDGPU::SGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::SGPR_32RegClass;

    Val = convertABITypeToValueType(DAG, Val, VA, DL);

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain
                                          const Type *RetTy) const {
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);

  unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
  unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
    if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  Info->setIfReturnsVoid(Outs.empty());
  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    SDValue Arg = OutVals[RealRVLocIdx];
                        ReadFirstLane, Arg);

  if (!Info->isEntryFunction()) {
      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))
    auto &ArgUsageInfo =
    CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

    const auto [OutgoingArg, ArgRC, ArgTy] =
    const auto [IncomingArg, IncomingArgRC, Ty] =
    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;

      InputReg = getImplicitArgPtr(DAG, DL);
      std::optional<uint32_t> Id =
      if (Id.has_value()) {

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      unsigned SpecialArgOffset =

  auto [OutgoingArg, ArgRC, Ty] =
    std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

    if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
             NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
             NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
                     : IncomingArgY ? *IncomingArgY

  if (OutgoingArg->isRegister()) {
    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
    if (!CCVA.isRegLoc())

    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
          dbgs() << "Cannot tail call due to divergent outgoing argument in "
enum ChainCallArgIdx {

  bool UsesDynamicVGPRs = false;
  if (IsChainCallConv) {
    auto RequestedExecIt =
          return Arg.OrigArgIndex == 2;
    assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");

    size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
    CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());

           "Haven't popped all the special args");

        CLI.Args[ChainCallArgIdx::Exec];
    if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))

            ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
        ChainCallSpecialArgs.push_back(Arg.Node);

    PushNodeOrTargetConstant(RequestedExecArg);

      if (FlagsValue.isZero()) {
        if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
                             "no additional args allowed if flags == 0");
        if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
        if (!Subtarget->isWave32()) {
              CLI, InVals, "dynamic VGPR mode is only supported for wave32");
        UsesDynamicVGPRs = true;
        std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
                      CLI.Args.end(), PushNodeOrTargetConstant);

  bool IsSibCall = false;

                          "unsupported call to variadic function ");

                        "unsupported required tail call to function ");

        Outs, OutVals, Ins, DAG);

           "site marked musttail or on llvm.amdgcn.cs.chain");

  if (!TailCallOpt && IsTailCall)

  auto *TRI = Subtarget->getRegisterInfo();

  if (!IsSibCall || IsChainCallConv) {
    if (!Subtarget->enableFlatScratch()) {
      RegsToPass.emplace_back(IsChainCallConv
                                  ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                  : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  const unsigned NumSpecialInputs = RegsToPass.size();
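// For calls that need a scratch buffer the resource descriptor is passed in
// a fixed SGPR quad: SGPR48-SGPR51 for the amdgpu_cs_chain convention and
// SGPR0-SGPR3 otherwise (see the RegsToPass entry above).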
  MVT PtrVT = MVT::i32;

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {

      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));

      int32_t Offset = LocMemOffset;

      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
                             ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
            Outs[i].Flags.getNonZeroByValAlign(),
            nullptr, std::nullopt, DstInfo,

            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

    TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops({Chain});

    Ops.push_back(Callee);

    Ops.push_back(Callee);

  if (IsChainCallConv)

  for (auto &[Reg, Val] : RegsToPass)

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

                                    MVT::Glue, GlueOps),

    Ops.push_back(InGlue);

  if (Info->isWholeWaveFunction())

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
  EVT VT = Op.getValueType();

         "Stack grows upwards for AMDGPU");

  Chain = BaseAddr.getValue(1);
  if (Alignment > StackAlign) {
                                << Subtarget->getWavefrontSizeLog2();
    uint64_t StackAlignMask = ScaledAlignment - 1;

  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");

      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));

      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));

  if (Op.getValueType() != MVT::i32)

  assert(Op.getValueType() == MVT::i32);

                  Op.getOperand(0), IntrinID, GetRoundBothImm);

  SDValue RoundModeTimesNumBits =
                                    TableEntry, EnumOffset);

        static_cast<uint32_t>(ConstMode->getZExtValue()),

  if (UseReducedTable) {
    SDValue RoundModeTimesNumBits =

    SDValue RoundModeTimesNumBits =

      NewMode = TruncTable;

                            ReadFirstLaneID, NewMode);

                               IntrinID, RoundBothImm, NewMode);
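// GET_ROUNDING / SET_ROUNDING (assumed reading of the fragments above): the
// current mode is read via the hwreg intrinsic, then used as a shift amount
// (RoundModeTimesNumBits) into a constant lookup table that translates
// between the hardware MODE register encoding and the FLT_ROUNDS enum, with
// a reduced table used for the common cases.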
  if (Op->isDivergent() &&
      (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))

  if (Subtarget->hasSafeSmemPrefetch())

  if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();

  EVT DstVT = Op.getValueType();

    return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);

  if (Op.getValueType() != MVT::i64)

                  Op.getOperand(0), IntrinID, ModeHwRegImm);
                  Op.getOperand(0), IntrinID, TrapHwRegImm);

  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);

  if (Op.getOperand(1).getValueType() != MVT::i64)

                            ReadFirstLaneID, NewModeReg);
                            ReadFirstLaneID, NewTrapReg);

  unsigned ModeHwReg =
  unsigned TrapHwReg =

                              IntrinID, ModeHwRegImm, NewModeReg);
                              IntrinID, TrapHwRegImm, NewTrapReg);
          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (!Subtarget->hasFlatScrRegister() &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
                             "\" for subtarget."));

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
  case AMDGPU::FLAT_SCR:

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
static std::pair<MachineBasicBlock *, MachineBasicBlock *>

  auto Next = std::next(I);

  MBB.addSuccessor(LoopBB);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

  Src->setIsKill(false);

  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
    unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
    unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
      MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;
      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)

    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

    unsigned InitResultReg, unsigned PhiReg, int Offset,
    bool UseGPRIdxMode, Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);

                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);

  LoopBB->removeSuccessor(RemainderBB);
  LoopBB->addSuccessor(LandingPad);
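// Waterfall loop for a divergent index (structure visible above): each
// iteration uses V_READFIRSTLANE_B32 to pick one lane's index,
// V_CMP_EQ_U32_e64 builds the mask of lanes sharing that index, exec is
// narrowed to that mask while the indexed access executes, and handled lanes
// are removed until the saved exec mask is exhausted.  The index is either
// written to M0 (S_ADD_I32 into M0) or kept in an SGPR when the GPR-index
// mode is in use.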
static std::pair<unsigned, int>
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

    return std::pair(AMDGPU::sub0, Offset);

  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {

    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(VecRC);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)

  MI.eraseFromParent();
  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
  if (ST.hasScalarAddSub64()) {
    unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  MI.eraseFromParent();
  case AMDGPU::S_MIN_U32:
    return std::numeric_limits<uint32_t>::max();
  case AMDGPU::S_MIN_I32:
    return std::numeric_limits<int32_t>::max();
  case AMDGPU::S_MAX_U32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_MAX_I32:
    return std::numeric_limits<int32_t>::min();
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_AND_B32:
    return std::numeric_limits<uint32_t>::max();
      "Unexpected opcode in getIdentityValueFor32BitWaveReduction");

  case AMDGPU::V_CMP_LT_U64_e64:
    return std::numeric_limits<uint64_t>::max();
  case AMDGPU::V_CMP_LT_I64_e64:
    return std::numeric_limits<int64_t>::max();
  case AMDGPU::V_CMP_GT_U64_e64:
    return std::numeric_limits<uint64_t>::min();
  case AMDGPU::V_CMP_GT_I64_e64:
    return std::numeric_limits<int64_t>::min();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
    return std::numeric_limits<uint64_t>::min();
  case AMDGPU::S_AND_B64:
    return std::numeric_limits<uint64_t>::max();
      "Unexpected opcode in getIdentityValueFor64BitWaveReduction");

  return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
         Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
         Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
         Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
         Opc == AMDGPU::S_XOR_B32;
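// Identity values returned above, i.e. the neutral element for each wave
// reduction: umin -> UINT_MAX, smin -> INT_MAX, umax -> 0, smax -> INT_MIN,
// add/sub/or/xor -> 0, and -> all-ones (and likewise for the 64-bit forms).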
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));

  case AMDGPU::S_MIN_U32:
  case AMDGPU::S_MIN_I32:
  case AMDGPU::S_MAX_U32:
  case AMDGPU::S_MAX_I32:
  case AMDGPU::S_AND_B32:
  case AMDGPU::S_OR_B32: {

  case AMDGPU::V_CMP_LT_U64_e64:
  case AMDGPU::V_CMP_LT_I64_e64:
  case AMDGPU::V_CMP_GT_U64_e64:
  case AMDGPU::V_CMP_GT_I64_e64:
  case AMDGPU::S_AND_B64:
  case AMDGPU::S_OR_B64: {

  case AMDGPU::S_XOR_B32:
  case AMDGPU::S_XOR_B64:
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_SUB_U64_PSEUDO: {

    Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
        MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    bool IsWave32 = ST.isWave32();
    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    unsigned BitCountOpc =
        IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;

    auto NewAccumulator =

    case AMDGPU::S_XOR_B32:
    case AMDGPU::S_XOR_B64: {
          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
          .addReg(NewAccumulator->getOperand(0).getReg())

      if (Opc == AMDGPU::S_XOR_B32) {
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
            MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
            MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);

        BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)

    case AMDGPU::S_SUB_I32: {
      Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
          .addReg(NewAccumulator->getOperand(0).getReg());

    case AMDGPU::S_ADD_I32: {
          .addReg(NewAccumulator->getOperand(0).getReg());
    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::S_SUB_U64_PSEUDO: {
      Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

          TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
          MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
          MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);

      if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
            .addReg(NewAccumulator->getOperand(0).getReg())

      Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
                               : NewAccumulator->getOperand(0).getReg();

      Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;

      if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {

      BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
  Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
  Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
  Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
  Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
  Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
  Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

  bool IsWave32 = ST.isWave32();
  unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)

  I = ComputeLoop->begin();
      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
  I = ComputeLoop->end();

  unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;

    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),

        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
        TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
        MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
        MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);

    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),

    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                             TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
    case AMDGPU::S_OR_B64:
    case AMDGPU::S_AND_B64:
    case AMDGPU::S_XOR_B64: {
          .addReg(LaneValue->getOperand(0).getReg())

    case AMDGPU::V_CMP_GT_I64_e64:
    case AMDGPU::V_CMP_GT_U64_e64:
    case AMDGPU::V_CMP_LT_I64_e64:
    case AMDGPU::V_CMP_LT_U64_e64: {
      Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
          MRI.createVirtualRegister(WaveMaskRegClass);
          TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
      Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
          VregClass, AMDGPU::sub0, VSubRegClass);
          VregClass, AMDGPU::sub1, VSubRegClass);
      BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),

          .addReg(LaneValue->getOperand(0).getReg())
          .addReg(AccumulatorVReg);

      unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
      BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)

      NewAccumulator = BuildMI(*ComputeLoop, I, DL,
                               TII->get(AMDGPU::S_CSELECT_B64), DstReg)
                           .addReg(LaneValue->getOperand(0).getReg())

    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::S_SUB_U64_PSEUDO: {
          .addReg(LaneValue->getOperand(0).getReg());

  unsigned BITSETOpc =
      IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
  BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)

  ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);

  unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
      .addReg(NewActiveBitsReg)
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))

  MI.eraseFromParent();
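// Scalar wave-reduction loop (as built above): ComputeLoop PHIs carry the
// accumulator and the remaining-lanes mask; each iteration finds the next
// active lane with S_FF1_I32_B32/B64, reads its value with V_READLANE_B32,
// folds it into the accumulator, clears the lane with S_BITSET0, and loops
// via S_CMP_LG + S_CBRANCH_SCC1 until the mask is empty.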
  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:

  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       : AMDGPU::S_SUB_U32;
    MI.eraseFromParent();
5972 case AMDGPU::S_ADD_U64_PSEUDO:
5973 case AMDGPU::S_SUB_U64_PSEUDO: {
5976 case AMDGPU::V_ADD_U64_PSEUDO:
5977 case AMDGPU::V_SUB_U64_PSEUDO: {
5981     bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5987     if (ST.hasAddSubU64Insts()) {
5989               TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
5990                              : AMDGPU::V_SUB_U64_e64),
5995       TII->legalizeOperands(*I);
5996       MI.eraseFromParent();
6000     if (IsAdd && ST.hasLshlAddU64Inst()) {
6006       TII->legalizeOperands(*Add);
6007       MI.eraseFromParent();
6011     const auto *CarryRC = TRI->getWaveMaskRegClass();
6013     Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6014     Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6016     Register CarryReg = MRI.createVirtualRegister(CarryRC);
6017     Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6021                              : &AMDGPU::VReg_64RegClass;
6024                              : &AMDGPU::VReg_64RegClass;
6027         TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6029         TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6032         MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6034         MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6037         MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6039         MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6042         IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6049     unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6063     TII->legalizeOperands(*LoHalf);
6064     TII->legalizeOperands(*HiHalf);
6065     MI.eraseFromParent();
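// Editorial note: when the subtarget has no 64-bit add/sub instruction, the
// pseudo is split into a low 32-bit add/sub that produces a carry
// (V_ADD_CO_U32 / V_SUB_CO_U32) and a high half that consumes it
// (V_ADDC_U32 / V_SUBB_U32), then the halves are rejoined with REG_SEQUENCE.
// Hedged scalar illustration of the same split (add64ViaHalves is a
// hypothetical helper written for this note):
#include <cstdint>

uint64_t add64ViaHalves(uint64_t a, uint64_t b) {
  uint32_t lo = (uint32_t)a + (uint32_t)b;
  uint32_t carry = lo < (uint32_t)a;                           // CarryReg
  uint32_t hi = (uint32_t)(a >> 32) + (uint32_t)(b >> 32) + carry;
  return ((uint64_t)hi << 32) | lo;                            // sub1:sub0
}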
6068 case AMDGPU::S_ADD_CO_PSEUDO:
6069 case AMDGPU::S_SUB_CO_PSEUDO: {
6081     unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
6082                        ? AMDGPU::S_ADDC_U32
6083                        : AMDGPU::S_SUBB_U32;
6085     Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6086     BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6091     Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6092     BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6096     Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6098     BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6103     if (ST.isWave64()) {
6104       if (ST.hasScalarCompareEq64()) {
6111             TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6113             MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6115             MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6116         Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6118         BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6139         (ST.isWave64()) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6145     MI.eraseFromParent();
6148   case AMDGPU::SI_INIT_M0: {
6151             TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6154     MI.eraseFromParent();
6157   case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6160             TII->get(AMDGPU::S_CMP_EQ_U32))
6165   case AMDGPU::GET_GROUPSTATICSIZE: {
6170         .add(MI.getOperand(0))
6172     MI.eraseFromParent();
6175   case AMDGPU::GET_SHADERCYCLESHILO: {
6190     Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6192         .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6193     Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6195         .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6196     Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6198         .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6202     Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6207         .add(MI.getOperand(0))
6212     MI.eraseFromParent();
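// Editorial note: GET_SHADERCYCLESHILO reads SHADER_CYCLES_HI, then
// SHADER_CYCLES, then SHADER_CYCLES_HI again; when the two high reads differ,
// the counter wrapped between the reads and the low half is discarded before
// the halves are combined. A hedged sketch of that torn-read guard (readCycles64
// and readHwreg are hypothetical; the exact low-half substitution the backend
// selects may differ in detail):
#include <cstdint>

uint64_t readCycles64(uint32_t (*readHwreg)(int id), int LO, int HI) {
  uint32_t hi1 = readHwreg(HI);
  uint32_t lo  = readHwreg(LO);
  uint32_t hi2 = readHwreg(HI);
  if (hi1 != hi2)
    lo = 0; // low half is stale relative to the second high read
  return ((uint64_t)hi2 << 32) | lo;
}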
6215 case AMDGPU::SI_INDIRECT_SRC_V1:
6216 case AMDGPU::SI_INDIRECT_SRC_V2:
6217 case AMDGPU::SI_INDIRECT_SRC_V4:
6218 case AMDGPU::SI_INDIRECT_SRC_V8:
6219 case AMDGPU::SI_INDIRECT_SRC_V9:
6220 case AMDGPU::SI_INDIRECT_SRC_V10:
6221 case AMDGPU::SI_INDIRECT_SRC_V11:
6222 case AMDGPU::SI_INDIRECT_SRC_V12:
6223 case AMDGPU::SI_INDIRECT_SRC_V16:
6224 case AMDGPU::SI_INDIRECT_SRC_V32:
6226 case AMDGPU::SI_INDIRECT_DST_V1:
6227 case AMDGPU::SI_INDIRECT_DST_V2:
6228 case AMDGPU::SI_INDIRECT_DST_V4:
6229 case AMDGPU::SI_INDIRECT_DST_V8:
6230 case AMDGPU::SI_INDIRECT_DST_V9:
6231 case AMDGPU::SI_INDIRECT_DST_V10:
6232 case AMDGPU::SI_INDIRECT_DST_V11:
6233 case AMDGPU::SI_INDIRECT_DST_V12:
6234 case AMDGPU::SI_INDIRECT_DST_V16:
6235 case AMDGPU::SI_INDIRECT_DST_V32:
6237 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6238 case AMDGPU::SI_KILL_I1_PSEUDO:
6240 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6247     Register SrcCond = MI.getOperand(3).getReg();
6249     Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6250     Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6251     const auto *CondRC = TRI->getWaveMaskRegClass();
6252     Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6256                              : &AMDGPU::VReg_64RegClass;
6259                              : &AMDGPU::VReg_64RegClass;
6262         TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6264         TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6267         MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6269         MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6272         MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6274         MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6295     MI.eraseFromParent();
6298   case AMDGPU::SI_BR_UNDEF: {
6301         .add(MI.getOperand(0));
6303     MI.eraseFromParent();
6306   case AMDGPU::ADJCALLSTACKUP:
6307   case AMDGPU::ADJCALLSTACKDOWN: {
6314   case AMDGPU::SI_CALL_ISEL: {
6317     unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6320     MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6326     MI.eraseFromParent();
6329   case AMDGPU::V_ADD_CO_U32_e32:
6330   case AMDGPU::V_SUB_CO_U32_e32:
6331   case AMDGPU::V_SUBREV_CO_U32_e32: {
6334     unsigned Opc = MI.getOpcode();
6336     bool NeedClampOperand = false;
6337     if (TII->pseudoToMCOpcode(Opc) == -1) {
6339       NeedClampOperand = true;
6343     if (TII->isVOP3(*I)) {
6346     I.add(MI.getOperand(1)).add(MI.getOperand(2));
6347     if (NeedClampOperand)
6350     TII->legalizeOperands(*I);
6352     MI.eraseFromParent();
6355   case AMDGPU::V_ADDC_U32_e32:
6356   case AMDGPU::V_SUBB_U32_e32:
6357   case AMDGPU::V_SUBBREV_U32_e32:
6360     TII->legalizeOperands(MI);
6362   case AMDGPU::DS_GWS_INIT:
6363   case AMDGPU::DS_GWS_SEMA_BR:
6364   case AMDGPU::DS_GWS_BARRIER:
6365     TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
6367 case AMDGPU::DS_GWS_SEMA_V:
6368 case AMDGPU::DS_GWS_SEMA_P:
6369 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6377 case AMDGPU::S_SETREG_B32: {
6393     const unsigned SetMask = WidthMask << Offset;
6396     unsigned SetDenormOp = 0;
6397     unsigned SetRoundOp = 0;
6405       SetRoundOp = AMDGPU::S_ROUND_MODE;
6406       SetDenormOp = AMDGPU::S_DENORM_MODE;
6408       SetRoundOp = AMDGPU::S_ROUND_MODE;
6410       SetDenormOp = AMDGPU::S_DENORM_MODE;
6413     if (SetRoundOp || SetDenormOp) {
6416       if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6417         unsigned ImmVal = Def->getOperand(1).getImm();
6431           MI.eraseFromParent();
6440     MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
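// Editorial note: S_SETREG_B32 targets a (register id, offset, width) field;
// the lowering derives the mask of bits it writes as WidthMask << Offset and,
// when that mask only covers the FP round/denorm bits of MODE, rewrites the
// operation to S_ROUND_MODE / S_DENORM_MODE with an immediate. A small sketch
// of the field-mask math (hwregSetMask is a hypothetical helper):
#include <cstdint>

uint32_t hwregSetMask(unsigned Offset, unsigned Width) {
  uint32_t WidthMask = (Width == 32) ? ~0u : ((1u << Width) - 1);
  return WidthMask << Offset; // SetMask: which MODE bits the setreg touches
}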
6444   case AMDGPU::S_INVERSE_BALLOT_U32:
6445   case AMDGPU::S_INVERSE_BALLOT_U64:
6448     MI.setDesc(TII->get(AMDGPU::COPY));
6450   case AMDGPU::ENDPGM_TRAP: {
6453     MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6473     MI.eraseFromParent();
6476   case AMDGPU::SIMULATED_TRAP: {
6477     assert(Subtarget->hasPrivEnabledTrap2NopBug());
6480     TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6481     MI.eraseFromParent();
6484   case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6485   case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6491     assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6492     Register OriginalExec = Setup->getOperand(0).getReg();
6494     MI.getOperand(0).setReg(OriginalExec);
6531 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6535 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6562 if (!Subtarget->hasMadMacF32Insts())
6563 return Subtarget->hasFastFMAF32();
6569 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6572 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6588 switch (Ty.getScalarSizeInBits()) {
6606 if (Ty.getScalarSizeInBits() == 16)
6608 if (Ty.getScalarSizeInBits() == 32)
6609 return Subtarget->hasMadMacF32Insts() &&
6619   EVT VT = N->getValueType(0);
6621   return Subtarget->hasMadMacF32Insts() &&
6623   if (VT == MVT::f16) {
6624     return Subtarget->hasMadF16() &&
6639   unsigned Opc = Op.getOpcode();
6640   EVT VT = Op.getValueType();
6641   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
6642          VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
6643          VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6644          VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
6660   [[maybe_unused]] EVT VT = Op.getValueType();
6662   assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6663           VT == MVT::v16i32) &&
6664          "Unexpected ValueType.");
6673   unsigned Opc = Op.getOpcode();
6674   EVT VT = Op.getValueType();
6675   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6676          VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6677          VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6678          VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6679          VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6680          VT == MVT::v32bf16);
6688       DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6690       DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6697   unsigned Opc = Op.getOpcode();
6698   EVT VT = Op.getValueType();
6699   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6700          VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6701          VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6702          VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6703          VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6704          VT == MVT::v32bf16);
6709                                       : std::pair(Op0, Op0);
6718       DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6720       DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
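// Editorial note: the splitBinaryVectorOp / splitTernaryVectorOp helpers above
// cut each operand into a low and a high half, apply the same opcode to each
// half, and reassemble the result with a concat. The same shape in plain C++
// (hedged illustration; splitBinary and VecOp are hypothetical, not DAG code):
#include <array>
#include <cstddef>

template <typename T, std::size_t N, typename VecOp>
std::array<T, N> splitBinary(const std::array<T, N> &a,
                             const std::array<T, N> &b, VecOp op) {
  static_assert(N % 2 == 0, "expects an even element count");
  std::array<T, N / 2> alo{}, ahi{}, blo{}, bhi{};
  for (std::size_t i = 0; i < N / 2; ++i) {
    alo[i] = a[i]; ahi[i] = a[i + N / 2];
    blo[i] = b[i]; bhi[i] = b[i + N / 2];
  }
  std::array<T, N / 2> lo = op(alo, blo); // Lo = getNode(Opc, ..., Lo0, Lo1)
  std::array<T, N / 2> hi = op(ahi, bhi); // Hi = getNode(Opc, ..., Hi0, Hi1)
  std::array<T, N> out{};                 // CONCAT_VECTORS(Lo, Hi)
  for (std::size_t i = 0; i < N / 2; ++i) {
    out[i] = lo[i];
    out[i + N / 2] = hi[i];
  }
  return out;
}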
6726   switch (Op.getOpcode()) {
6730     return LowerBRCOND(Op, DAG);
6732     return LowerRETURNADDR(Op, DAG);
6735     assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6736            "Load should return a value and a chain");
6740     EVT VT = Op.getValueType();
6742       return lowerFSQRTF32(Op, DAG);
6744       return lowerFSQRTF64(Op, DAG);
6749     return LowerTrig(Op, DAG);
6751     return LowerSELECT(Op, DAG);
6753     return LowerFDIV(Op, DAG);
6755     return LowerFFREXP(Op, DAG);
6756   case ISD::ATOMIC_CMP_SWAP:
6757     return LowerATOMIC_CMP_SWAP(Op, DAG);
6759     return LowerSTORE(Op, DAG);
6763     return LowerGlobalAddress(MFI, Op, DAG);
6766     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6768     return LowerINTRINSIC_W_CHAIN(Op, DAG);
6770     return LowerINTRINSIC_VOID(Op, DAG);
6771   case ISD::ADDRSPACECAST:
6772     return lowerADDRSPACECAST(Op, DAG);
6774     return lowerINSERT_SUBVECTOR(Op, DAG);
6776     return lowerINSERT_VECTOR_ELT(Op, DAG);
6778     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6780     return lowerVECTOR_SHUFFLE(Op, DAG);
6782     return lowerSCALAR_TO_VECTOR(Op, DAG);
6784     return lowerBUILD_VECTOR(Op, DAG);
6787     return lowerFP_ROUND(Op, DAG);
6789     return lowerTRAP(Op, DAG);
6790   case ISD::DEBUGTRAP:
6791     return lowerDEBUGTRAP(Op, DAG);
6800     return lowerFMINNUM_FMAXNUM(Op, DAG);
6801   case ISD::FMINIMUMNUM:
6802   case ISD::FMAXIMUMNUM:
6803     return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6806     return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6809     return lowerFLDEXP(Op, DAG);
6826   case ISD::FMINNUM_IEEE:
6827   case ISD::FMAXNUM_IEEE:
6834     return lowerFCOPYSIGN(Op, DAG);
6836     return lowerMUL(Op, DAG);
6839     return lowerXMULO(Op, DAG);
6842     return lowerXMUL_LOHI(Op, DAG);
6843   case ISD::DYNAMIC_STACKALLOC:
6845   case ISD::STACKSAVE:
6849   case ISD::SET_ROUNDING:
6853   case ISD::FP_EXTEND:
6856   case ISD::GET_FPENV:
6858   case ISD::SET_FPENV:
6877   EVT FittingLoadVT = LoadVT;
6902   return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6906   return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6909 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6912                                               bool IsIntrinsic) const {
6915   bool Unpacked = Subtarget->hasUnpackedD16VMem();
6916   EVT LoadVT = M->getValueType(0);
6918   EVT EquivLoadVT = LoadVT;
6932   SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6936       M->getMemoryVT(), M->getMemOperand());
6947   EVT LoadVT = M->getValueType(0);
6953   assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6954   bool IsTFE = M->getNumValues() == 3;
6967     return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6971   return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6972                              M->getMemOperand(), DAG);
6976   SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6978                              M->getMemOperand(), DAG);
6986   EVT VT = N->getValueType(0);
6987   unsigned CondCode = N->getConstantOperandVal(3);
6998   EVT CmpVT = LHS.getValueType();
6999   if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
7000     unsigned PromoteOp =
7020   EVT VT = N->getValueType(0);
7022   unsigned CondCode = N->getConstantOperandVal(3);
7031   if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7032     Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7033     Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7049   EVT VT = N->getValueType(0);
7056                    Src.getOperand(1), Src.getOperand(2));
7067     Exec = AMDGPU::EXEC_LO;
7069     Exec = AMDGPU::EXEC;
7086   EVT VT = N->getValueType(0);
7088   unsigned IID = N->getConstantOperandVal(0);
7089   bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7090                       IID == Intrinsic::amdgcn_permlanex16;
7091   bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7092                        IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7096   unsigned SplitSize = 32;
7097   if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7098       ST->hasDPALU_DPP() &&
7106   case Intrinsic::amdgcn_permlane16:
7107   case Intrinsic::amdgcn_permlanex16:
7108   case Intrinsic::amdgcn_update_dpp:
7113   case Intrinsic::amdgcn_writelane:
7116   case Intrinsic::amdgcn_readlane:
7117   case Intrinsic::amdgcn_set_inactive:
7118   case Intrinsic::amdgcn_set_inactive_chain_arg:
7119   case Intrinsic::amdgcn_mov_dpp8:
7122   case Intrinsic::amdgcn_readfirstlane:
7123   case Intrinsic::amdgcn_permlane64:
7133   if (SDNode *GL = N->getGluedNode()) {
7134     assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7135     GL = GL->getOperand(0).getNode();
7136     Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7145   if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7146       IID == Intrinsic::amdgcn_mov_dpp8 ||
7147       IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7148     Src1 = N->getOperand(2);
7149     if (IID == Intrinsic::amdgcn_writelane ||
7150         IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7151       Src2 = N->getOperand(3);
7154   if (ValSize == SplitSize) {
7164     if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7169     if (IID == Intrinsic::amdgcn_writelane) {
7174     SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7176     return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7179   if (ValSize % SplitSize != 0)
7183     EVT VT = N->getValueType(0);
7187     unsigned NumOperands = N->getNumOperands();
7189     SDNode *GL = N->getGluedNode();
7194     for (unsigned i = 0; i != NE; ++i) {
7195       for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7197         SDValue Operand = N->getOperand(j);
7212           DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7227   if (SplitSize == 32) {
7229     return unrollLaneOp(LaneOp.getNode());
7235   unsigned SubVecNumElt =
7239   SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7240   for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7244     if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7249     if (IID == Intrinsic::amdgcn_writelane)
7254         IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7255             ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7256             : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7257     EltIdx += SubVecNumElt;
7271   if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7274   if (IID == Intrinsic::amdgcn_writelane)
7277   SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
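// Editorial note: for value types wider than the 32-bit lane granule, the lane
// intrinsics above are lowered piecewise: the payload is viewed as 32-bit (or
// packed 16-bit) chunks, createLaneOp is applied per chunk, and the chunks are
// concatenated back. Hedged scalar illustration of splitting a 64-bit payload
// (laneOp64 and laneOp32 are hypothetical helpers for this note):
#include <cstdint>

uint64_t laneOp64(uint64_t v, uint32_t (*laneOp32)(uint32_t)) {
  uint32_t lo = laneOp32((uint32_t)v);          // chunk 0
  uint32_t hi = laneOp32((uint32_t)(v >> 32));  // chunk 1
  return ((uint64_t)hi << 32) | lo;             // concat back to 64 bits
}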
7285   switch (N->getOpcode()) {
7297     unsigned IID = N->getConstantOperandVal(0);
7299     case Intrinsic::amdgcn_make_buffer_rsrc:
7300       Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7302     case Intrinsic::amdgcn_cvt_pkrtz: {
7308       Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7311     case Intrinsic::amdgcn_cvt_pknorm_i16:
7312     case Intrinsic::amdgcn_cvt_pknorm_u16:
7313     case Intrinsic::amdgcn_cvt_pk_i16:
7314     case Intrinsic::amdgcn_cvt_pk_u16: {
7320       if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7322       else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7324       else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7329       EVT VT = N->getValueType(0);
7334       Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7338     case Intrinsic::amdgcn_s_buffer_load: {
7344       if (!Subtarget->hasScalarSubwordLoads())
7350       EVT VT = Op.getValueType();
7351       assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7363       if (!Offset->isDivergent()) {
7382         LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7387     case Intrinsic::amdgcn_dead: {
7388       for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7399     for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7400       Results.push_back(Res.getOperand(I));
7404     Results.push_back(Res.getValue(1));
7413     EVT VT = N->getValueType(0);
7418     EVT SelectVT = NewVT;
7419     if (NewVT.bitsLT(MVT::i32)) {
7422       SelectVT = MVT::i32;
7428     if (NewVT != SelectVT)
7434     if (N->getValueType(0) != MVT::v2f16)
7438     SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7446     if (N->getValueType(0) != MVT::v2f16)
7450     SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7458     if (N->getValueType(0) != MVT::f16)
7473     if (U.get() != Value)
7476     if (U.getUser()->getOpcode() == Opcode)
7482 unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7485   case Intrinsic::amdgcn_if:
7487   case Intrinsic::amdgcn_else:
7489   case Intrinsic::amdgcn_loop:
7491   case Intrinsic::amdgcn_end_cf:
7538 SDNode *Intr = BRCOND.getOperand(1).getNode();
7551   assert(BR && "brcond missing unconditional branch user");
7555   unsigned CFNode = isCFIntrinsic(Intr);
7575   Ops.push_back(Target);
7598   for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7617   MVT VT = Op.getSimpleValueType();
7620   if (Op.getConstantOperandVal(0) != 0)
7624   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7626   if (Info->isEntryFunction())
7643   return Op.getValueType().bitsLE(VT)
7651   EVT DstVT = Op.getValueType();
7658   unsigned Opc = Op.getOpcode();
7670   EVT SrcVT = Src.getValueType();
7671   EVT DstVT = Op.getValueType();
7674   assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7677   return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7684   if (DstVT == MVT::f16) {
7689     if (!Subtarget->has16BitInsts()) {
7692       return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7694     if (Op->getFlags().hasApproximateFuncs()) {
7701       return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7705          "custom lower FP_ROUND for f16 or bf16");
7706   assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7719   EVT VT = Op.getValueType();
7721   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7722   bool IsIEEEMode = Info->getMode().IEEE;
7731   if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7738 SDValue SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7740   EVT VT = Op.getValueType();
7742   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7743   bool IsIEEEMode = Info->getMode().IEEE;
7748   if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7756   EVT VT = Op.getValueType();
7760   assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7761          !Subtarget->hasMinimum3Maximum3F16() &&
7762          Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7763          "should not need to widen f16 minimum/maximum to v2f16");
7777       DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7785   EVT VT = Op.getValueType();
7789   EVT ExpVT = Exp.getValueType();
7790   if (ExpVT == MVT::i16)
7811                      {Op.getOperand(0), Op.getOperand(1), TruncExp});
7814   return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7818   switch (Op->getOpcode()) {
7848                                               DAGCombinerInfo &DCI) const {
7849   const unsigned Opc = Op.getOpcode();
7857                       : Op->getOperand(0).getValueType();
7860   if (DCI.isBeforeLegalizeOps() ||
7864   auto &DAG = DCI.DAG;
7870     LHS = Op->getOperand(1);
7871     RHS = Op->getOperand(2);
7873     LHS = Op->getOperand(0);
7874     RHS = Op->getOperand(1);
7913   if (MagVT == SignVT)
7920   SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7923   SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7930   EVT VT = Op.getValueType();
7936   assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
7963   if (Op->isDivergent())
7976   if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7978         DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7981   if (Op0SignBits >= 33 && Op1SignBits >= 33)
7983         DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
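// Editorial note: the s_mul_u64 lowering above only keeps a full 64-bit
// multiply when it has to. With at least 32 known leading zeros on both
// operands it emits S_MUL_U64_U32_PSEUDO, and with at least 33 sign bits on
// both it emits S_MUL_I64_I32_PSEUDO, i.e. a 32 x 32 -> 64 multiply. The same
// value-level check in plain C++ (mul64Narrowed is a hypothetical helper):
#include <cstdint>

uint64_t mul64Narrowed(uint64_t a, uint64_t b) {
  if ((a >> 32) == 0 && (b >> 32) == 0)            // >= 32 leading zeros each
    return (uint64_t)(uint32_t)a * (uint32_t)b;    // unsigned 32x32 -> 64
  int64_t sa = (int64_t)a, sb = (int64_t)b;
  if (sa == (int32_t)sa && sb == (int32_t)sb)      // >= 33 sign bits each
    return (uint64_t)((int64_t)(int32_t)sa * (int64_t)(int32_t)sb);
  return a * b;                                    // full 64-bit multiply
}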
7989   EVT VT = Op.getValueType();
7996     const APInt &C = RHSC->getAPIntValue();
7998     if (C.isPowerOf2()) {
8000       bool UseArithShift = isSigned && !C.isMinSignedValue();
8027   if (Op->isDivergent()) {
8031   if (Subtarget->hasSMulHi()) {
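// Editorial note: lowerXMULO turns a multiply-with-overflow by a power of two
// into a shift; overflow is detected by shifting the product back (arithmetic
// shift for the signed case unless C is the minimum signed value, logical
// shift for the unsigned case) and comparing with the original value. Hedged
// scalar version for the unsigned case (umulo_pow2 is a hypothetical helper):
#include <cstdint>

bool umulo_pow2(uint64_t v, unsigned log2c, uint64_t &product) {
  product = v << log2c;               // the multiply becomes a shift
  return (product >> log2c) != v;     // overflow iff nonzero bits shifted out
}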
8042   if (!Subtarget->isTrapHandlerEnabled() ||
8044     return lowerTrapEndpgm(Op, DAG);
8046   return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8047                                             : lowerTrapHsaQueuePtr(Op, DAG);
8057 SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8059                                                      ImplicitParameter Param) const {
8079       loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
8082   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8085   if (UserSGPR == AMDGPU::NoRegister) {
8111   if (Subtarget->hasPrivEnabledTrap2NopBug())
8124   if (!Subtarget->isTrapHandlerEnabled() ||
8128                                      "debugtrap handler not supported",
8139 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
8141   if (Subtarget->hasApertureRegs()) {
8143                                     ? AMDGPU::SRC_SHARED_BASE
8144                                     : AMDGPU::SRC_PRIVATE_BASE;
8145     assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8146             !Subtarget->hasGloballyAddressableScratch()) &&
8147            "Cannot use src_private_base with globally addressable scratch!");
8168     return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
8172   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8174   if (UserSGPR == AMDGPU::NoRegister) {
8208     return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8219   const AMDGPUTargetMachine &TM =
8222   unsigned DestAS, SrcAS;
8224   bool IsNonNull = false;
8226     SrcAS = ASC->getSrcAddressSpace();
8227     Src = ASC->getOperand(0);
8228     DestAS = ASC->getDestAddressSpace();
8231            Op.getConstantOperandVal(0) ==
8232                Intrinsic::amdgcn_addrspacecast_nonnull);
8233     Src = Op->getOperand(1);
8234     SrcAS = Op->getConstantOperandVal(2);
8235     DestAS = Op->getConstantOperandVal(3);
8248         Subtarget->hasGloballyAddressableScratch()) {
8253               AMDGPU::S_MOV_B32, SL, MVT::i32,
8254               DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8262     unsigned NullVal = TM.getNullPointerValue(DestAS);
8277         Subtarget->hasGloballyAddressableScratch()) {
8286       if (Subtarget->isWave64())
8292           57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8295       CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8300               AMDGPU::S_MOV_B64, SL, MVT::i64,
8301               DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8303       CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8305       SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8307       CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8313     unsigned NullVal = TM.getNullPointerValue(SrcAS);
8325       Op.getValueType() == MVT::i64) {
8326     const SIMachineFunctionInfo *Info =
8330     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8334       Src.getValueType() == MVT::i64)
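// Editorial note: casting a 32-bit LDS/private pointer to a 64-bit flat
// pointer in the lowering above builds the result from the 32-bit offset in
// the low dword and the segment aperture in the high dword, with a
// compare/select mapping the segment's null value onto the flat null value
// (skipped for the *_nonnull intrinsic). Hedged illustration with the aperture
// and null values passed in explicitly (castToFlat is a hypothetical helper):
#include <cstdint>

uint64_t castToFlat(uint32_t segPtr, uint32_t apertureHi, uint32_t segNull,
                    uint64_t flatNull) {
  if (segPtr == segNull)                         // IsNonNull elides this check
    return flatNull;
  return ((uint64_t)apertureHi << 32) | segPtr;  // {ptr, aperture} as i64
}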
8354   EVT InsVT = Ins.getValueType();
8362   assert(InsNumElts % 2 == 0 && "expect legal vector types");
8367   EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8369                                    MVT::i32, InsNumElts / 2);
8371   Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8372   Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8374   for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8376     if (InsNumElts == 2) {
8386   return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8389   for (unsigned I = 0; I != InsNumElts; ++I) {
8412   if (NumElts == 4 && EltSize == 16 && KIdx) {
8420     SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8421     SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8423     unsigned Idx = KIdx->getZExtValue();
8424     bool InsertLo = Idx < 2;
8427         DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8428         DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8430     InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8434                  : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8447   assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8475   return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
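// Editorial note: with a non-constant index, INSERT_VECTOR_ELT on a small
// vector is performed in an integer register: the index is scaled to a bit
// offset and the element is merged under a shifted mask (the bitfield-insert
// value the code above bitcasts back to the vector type). Scalar sketch for
// 16-bit elements inside a 64-bit vector (insertElt16 is a hypothetical
// helper):
#include <cstdint>

uint64_t insertElt16(uint64_t vec, uint16_t elt, unsigned idx) {
  unsigned shift = idx * 16;                        // scaled dynamic index
  uint64_t mask = 0xFFFFull << shift;
  return (vec & ~mask) | ((uint64_t)elt << shift);  // bitfield insert
}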
8482   EVT ResultVT = Op.getValueType();
8495   if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8498   if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8502     if (VecSize == 128) {
8510     } else if (VecSize == 256) {
8513       for (unsigned P = 0; P < 4; ++P) {
8519                                 Parts[0], Parts[1]));
8521                                 Parts[2], Parts[3]));
8527       for (unsigned P = 0; P < 8; ++P) {
8534                             Parts[0], Parts[1], Parts[2], Parts[3]));
8537                             Parts[4], Parts[5], Parts[6], Parts[7]));
8557     Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8572   if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8574     return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8582   return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8587   return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8588          !(Mask[Elt + 1] & 1);
8594   EVT ResultVT = Op.getValueType();
8597   const int NewSrcNumElts = 2;
8599   int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8615   const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8637     if (ShouldUseConsecutiveExtract &&
8640       int VecIdx = Idx < SrcNumElts ? 0 : 1;
8641       int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8653       if (Idx0 >= SrcNumElts) {
8658       if (Idx1 >= SrcNumElts) {
8663       int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8664       int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8672       int NewMaskIdx0 = Idx0 - AlignedIdx0;
8673       int NewMaskIdx1 = Idx1 - AlignedIdx1;
8678       if (SubVec0 != SubVec1) {
8679         NewMaskIdx1 += NewSrcNumElts;
8686                                        {NewMaskIdx0, NewMaskIdx1});
8691       int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8692       int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8693       int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8694       int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8713   EVT ResultVT = Op.getValueType();
8729   EVT VT = Op.getValueType();
8731   if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8732     assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8741     return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8750     return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8757     return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8766   for (unsigned P = 0; P < NumParts; ++P) {
8768             PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8774   return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
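// Editorial note: without VOP3P, a two-element 16-bit BUILD_VECTOR is
// assembled in a 32-bit register: the low element is extended, the high
// element is shifted left by 16, and the OR of the two is bitcast back to the
// vector type (the ExtLo/ShlHi/Or values above). Scalar sketch (packV2x16 is
// a hypothetical helper):
#include <cstdint>

uint32_t packV2x16(uint16_t lo, uint16_t hi) {
  return (uint32_t)lo | ((uint32_t)hi << 16);
}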
8787 if (!Subtarget->isAmdHsaOS())
8847   EVT PtrVT = Op.getValueType();
8849   const GlobalValue *GV = GSD->getGlobal();
8863     assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8881   if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8882     if (Subtarget->has64BitLiterals()) {
8913   MachinePointerInfo PtrInfo =
8941   SDValue Param = lowerKernargMemParameter(
8952       "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
8960       "intrinsic not supported on subtarget", DL.getDebugLoc()));
8968   unsigned NumElts = Elts.size();
8970   if (NumElts <= 12) {
8979   for (unsigned i = 0; i < Elts.size(); ++i) {
8985   for (unsigned i = Elts.size(); i < NumElts; ++i)
8995   EVT SrcVT = Src.getValueType();
9016                           bool Unpacked, bool IsD16, int DMaskPop,
9017                           int NumVDataDwords, bool IsAtomicPacked16Bit,
9021   EVT ReqRetVT = ResultTypes[0];
9023   int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9024                           ? (ReqRetNumElts + 1) / 2
9027   int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9038   if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9049   if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9051                           NumDataDwords - MaskPopDwords);
9056   EVT LegalReqRetVT = ReqRetVT;
9058     if (!Data.getValueType().isInteger())
9060           Data.getValueType().changeTypeToInteger(), Data);
9081   if (Result->getNumValues() == 1)
9088                          SDValue *LWE, bool &IsTexFail) {
9108                                       unsigned DimIdx, unsigned EndIdx,
9109                                       unsigned NumGradients) {
9111   for (unsigned I = DimIdx; I < EndIdx; I++) {
9119     if (((I + 1) >= EndIdx) ||
9120         ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9121                                          I == DimIdx + NumGradients - 1))) {
9140   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9154   int NumVDataDwords = 0;
9155   bool AdjustRetType = false;
9156   bool IsAtomicPacked16Bit = false;
9159   const unsigned ArgOffset = WithChain ? 2 : 1;
9162   unsigned DMaskLanes = 0;
9164   if (BaseOpcode->Atomic) {
9165     VData = Op.getOperand(2);
9167     IsAtomicPacked16Bit =
9168         (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9169          Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
9172     if (BaseOpcode->AtomicX2) {
9179       ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9180       DMask = Is64Bit ? 0xf : 0x3;
9181       NumVDataDwords = Is64Bit ? 4 : 2;
9183       DMask = Is64Bit ? 0x3 : 0x1;
9184       NumVDataDwords = Is64Bit ? 2 : 1;
9187     DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
9190     if (BaseOpcode->Store) {
9191       VData = Op.getOperand(2);
9195       if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9199         VData = handleD16VData(VData, DAG, true);
9202       NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9203     } else if (!BaseOpcode->NoReturn) {
9208       if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9216           (!LoadVT.isVector() && DMaskLanes > 1))
9222       if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9223           !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9224         NumVDataDwords = (DMaskLanes + 1) / 2;
9226         NumVDataDwords = DMaskLanes;
9228       AdjustRetType = true;
9232   unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9239   MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9240   IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9242   VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9244   MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9245   IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9249     if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9255                       {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9259              "Bias needs to be converted to 16 bit in A16 mode");
9264   if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9268         dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9269                   "require 16 bit args for both gradients and addresses");
9274     if (!ST->hasA16()) {
9275       LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9276                            "support 16 bit addresses\n");
9286   if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9288     const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9290     IntrOpcode = G16MappingInfo->G16;
9313   for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9331   const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9332   const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9333   const bool UseNSA = ST->hasNSAEncoding() &&
9334                       VAddrs.size() >= ST->getNSAThreshold(MF) &&
9335                       (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9336   const bool UsePartialNSA =
9337       UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9340   if (UsePartialNSA) {
9342                     ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9343   } else if (!UseNSA) {
9350   if (!BaseOpcode->Sampler) {
9353     uint64_t UnormConst =
9354         Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9356     Unorm = UnormConst ? True : False;
9362   bool IsTexFail = false;
9363   if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9374     NumVDataDwords += 1;
9375     AdjustRetType = true;
9380   if (AdjustRetType) {
9383     if (DMaskLanes == 0 && !BaseOpcode->Store) {
9392                              MVT::i32, NumVDataDwords)
9395     ResultTypes[0] = NewVT;
9396     if (ResultTypes.size() == 3) {
9400       ResultTypes.erase(&ResultTypes[1]);
9405   if (BaseOpcode->Atomic)
9412   if (BaseOpcode->Store || BaseOpcode->Atomic)
9413     Ops.push_back(VData);
9414   if (UsePartialNSA) {
9416     Ops.push_back(VAddr);
9420     Ops.push_back(VAddr);
9423   if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9425   Ops.push_back(Rsrc);
9426   if (BaseOpcode->Sampler) {
9430     Ops.push_back(Samp);
9435   if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9436     Ops.push_back(Unorm);
9438   Ops.push_back(IsA16 &&
9439                 ST->hasFeature(AMDGPU::FeatureR128A16)
9443   Ops.push_back(IsA16 ? True : False);
9445   if (!Subtarget->hasGFX90AInsts())
9450         "TFE is not supported on this GPU", DL.getDebugLoc()));
9453   if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9456     Ops.push_back(DimInfo->DA ? True : False);
9457   if (BaseOpcode->HasD16)
9458     Ops.push_back(IsD16 ? True : False);
9460     Ops.push_back(Op.getOperand(0));
9462   int NumVAddrDwords =
9468                                    NumVDataDwords, NumVAddrDwords);
9469   } else if (IsGFX11Plus) {
9471                                    UseNSA ? AMDGPU::MIMGEncGfx11NSA
9472                                           : AMDGPU::MIMGEncGfx11Default,
9473                                    NumVDataDwords, NumVAddrDwords);
9474   } else if (IsGFX10Plus) {
9476                                    UseNSA ? AMDGPU::MIMGEncGfx10NSA
9477                                           : AMDGPU::MIMGEncGfx10Default,
9478                                    NumVDataDwords, NumVAddrDwords);
9480     if (Subtarget->hasGFX90AInsts()) {
9482                                      NumVDataDwords, NumVAddrDwords);
9486             "requested image instruction is not supported on this GPU",
9491   for (EVT VT : OrigResultTypes) {
9492     if (VT == MVT::Other)
9493       RetValues[Idx++] = Op.getOperand(0);
9504                                    NumVDataDwords, NumVAddrDwords);
9507                                    NumVDataDwords, NumVAddrDwords);
9514   MachineMemOperand *MemRef = MemOp->getMemOperand();
9518   if (BaseOpcode->AtomicX2) {
9523   if (BaseOpcode->NoReturn)
9526                      Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9527                      NumVDataDwords, IsAtomicPacked16Bit, DL);
9540 MachinePointerInfo(),
9545 if (!
Offset->isDivergent()) {
9552 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9561 !Subtarget->hasScalarDwordx3Loads()) {
9588 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9590 return handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
9594 unsigned NumLoads = 1;
9600 if (NumElts == 8 || NumElts == 16) {
9601 NumLoads = NumElts / 4;
9605 SDVTList VTList = DAG.
getVTList({LoadVT, MVT::Other});
9610 NumLoads > 1 ?
Align(16 * NumLoads) :
Align(4));
9612 uint64_t InstOffset =
Ops[5]->getAsZExtVal();
9613 for (
unsigned i = 0; i < NumLoads; ++i) {
9619 if (NumElts == 8 || NumElts == 16)
9627 if (!Subtarget->hasArchitectedSGPRs())
9639 unsigned Width)
const {
9641 using namespace AMDGPU::Hwreg;
9643 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9682   auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9684   EVT VT = Op.getValueType();
9686   unsigned IntrinsicID = Op.getConstantOperandVal(0);
9690   switch (IntrinsicID) {
9691   case Intrinsic::amdgcn_implicit_buffer_ptr: {
9694     return getPreloadedValue(DAG, *MFI, VT,
9697   case Intrinsic::amdgcn_dispatch_ptr:
9698   case Intrinsic::amdgcn_queue_ptr: {
9699     if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9701           MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9706     auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9709     return getPreloadedValue(DAG, *MFI, VT, RegID);
9711   case Intrinsic::amdgcn_implicitarg_ptr: {
9713       return getImplicitArgPtr(DAG, DL);
9714     return getPreloadedValue(DAG, *MFI, VT,
9717   case Intrinsic::amdgcn_kernarg_segment_ptr: {
9723     return getPreloadedValue(DAG, *MFI, VT,
9726   case Intrinsic::amdgcn_dispatch_id: {
9729   case Intrinsic::amdgcn_rcp:
9731   case Intrinsic::amdgcn_rsq:
9733   case Intrinsic::amdgcn_rsq_legacy:
9737   case Intrinsic::amdgcn_rcp_legacy:
9741   case Intrinsic::amdgcn_rsq_clamp: {
9752     return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9755   case Intrinsic::r600_read_ngroups_x:
9756     if (Subtarget->isAmdHsaOS())
9759     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9762   case Intrinsic::r600_read_ngroups_y:
9763     if (Subtarget->isAmdHsaOS())
9766     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9769   case Intrinsic::r600_read_ngroups_z:
9770     if (Subtarget->isAmdHsaOS())
9773     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9776   case Intrinsic::r600_read_local_size_x:
9777     if (Subtarget->isAmdHsaOS())
9780     return lowerImplicitZextParam(DAG, Op, MVT::i16,
9782   case Intrinsic::r600_read_local_size_y:
9783     if (Subtarget->isAmdHsaOS())
9786     return lowerImplicitZextParam(DAG, Op, MVT::i16,
9788   case Intrinsic::r600_read_local_size_z:
9789     if (Subtarget->isAmdHsaOS())
9792     return lowerImplicitZextParam(DAG, Op, MVT::i16,
9794 case Intrinsic::amdgcn_workgroup_id_x:
9795 return lowerWorkGroupId(DAG, *MFI, VT,
9799 case Intrinsic::amdgcn_workgroup_id_y:
9800 return lowerWorkGroupId(DAG, *MFI, VT,
9804 case Intrinsic::amdgcn_workgroup_id_z:
9805 return lowerWorkGroupId(DAG, *MFI, VT,
9809 case Intrinsic::amdgcn_cluster_id_x:
9810 return Subtarget->hasClusters()
9811 ? getPreloadedValue(DAG, *MFI, VT,
9813 : DAG.getPOISON(VT);
9814 case Intrinsic::amdgcn_cluster_id_y:
9815 return Subtarget->hasClusters()
9816 ? getPreloadedValue(DAG, *MFI, VT,
9819 case Intrinsic::amdgcn_cluster_id_z:
9820 return Subtarget->hasClusters()
9821 ? getPreloadedValue(DAG, *MFI, VT,
9824 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9825 return Subtarget->hasClusters()
9826 ? getPreloadedValue(
9830 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9831 return Subtarget->hasClusters()
9832 ? getPreloadedValue(
9836 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9837 return Subtarget->hasClusters()
9838 ? getPreloadedValue(
9842 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9843 return Subtarget->hasClusters()
9846 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9847 return Subtarget->hasClusters()
9848 ? getPreloadedValue(
9852 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9853 return Subtarget->hasClusters()
9854 ? getPreloadedValue(
9858 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9859 return Subtarget->hasClusters()
9860 ? getPreloadedValue(
9864 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
9865 return Subtarget->hasClusters()
9866 ? getPreloadedValue(
9870   case Intrinsic::amdgcn_wave_id:
9871     return lowerWaveID(DAG, Op);
9872   case Intrinsic::amdgcn_lds_kernel_id: {
9874       return getLDSKernelId(DAG, DL);
9875     return getPreloadedValue(DAG, *MFI, VT,
9878   case Intrinsic::amdgcn_workitem_id_x:
9879     return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
9880   case Intrinsic::amdgcn_workitem_id_y:
9881     return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
9882   case Intrinsic::amdgcn_workitem_id_z:
9883     return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
9884   case Intrinsic::amdgcn_wavefrontsize:
9886                            SDLoc(Op), MVT::i32);
9887   case Intrinsic::amdgcn_s_buffer_load: {
9888     unsigned CPol = Op.getConstantOperandVal(3);
9895     return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
9896                         Op.getOperand(3), DAG);
9898   case Intrinsic::amdgcn_fdiv_fast:
9899     return lowerFDIV_FAST(Op, DAG);
9900 case Intrinsic::amdgcn_sin:
9903 case Intrinsic::amdgcn_cos:
9906 case Intrinsic::amdgcn_mul_u24:
9909 case Intrinsic::amdgcn_mul_i24:
9913 case Intrinsic::amdgcn_log_clamp: {
9919 case Intrinsic::amdgcn_fract:
9922 case Intrinsic::amdgcn_class:
9925   case Intrinsic::amdgcn_div_fmas:
9927                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9929   case Intrinsic::amdgcn_div_fixup:
9931                        Op.getOperand(2), Op.getOperand(3));
9933   case Intrinsic::amdgcn_div_scale: {
9946     SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
9949                        Denominator, Numerator);
9951   case Intrinsic::amdgcn_icmp: {
9953     if (Op.getOperand(1).getValueType() == MVT::i1 &&
9954         Op.getConstantOperandVal(2) == 0 &&
9959   case Intrinsic::amdgcn_fcmp: {
9962   case Intrinsic::amdgcn_ballot:
9964   case Intrinsic::amdgcn_fmed3:
9966                        Op.getOperand(2), Op.getOperand(3));
9967   case Intrinsic::amdgcn_fdot2:
9969                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9970   case Intrinsic::amdgcn_fmul_legacy:
9973   case Intrinsic::amdgcn_sffbh:
9975   case Intrinsic::amdgcn_sbfe:
9977                        Op.getOperand(2), Op.getOperand(3));
9978   case Intrinsic::amdgcn_ubfe:
9980                        Op.getOperand(2), Op.getOperand(3));
9981   case Intrinsic::amdgcn_cvt_pkrtz:
9982   case Intrinsic::amdgcn_cvt_pknorm_i16:
9983   case Intrinsic::amdgcn_cvt_pknorm_u16:
9984   case Intrinsic::amdgcn_cvt_pk_i16:
9985   case Intrinsic::amdgcn_cvt_pk_u16: {
9987     EVT VT = Op.getValueType();
9990     if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
9992     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
9994     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
9996     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10002       return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
10005         DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
10006     return DAG.getNode(ISD::BITCAST, DL, VT, Node);
10008   case Intrinsic::amdgcn_fmad_ftz:
10010                        Op.getOperand(2), Op.getOperand(3));
10012   case Intrinsic::amdgcn_if_break:
10014                                       Op->getOperand(1), Op->getOperand(2)),
10017   case Intrinsic::amdgcn_groupstaticsize: {
10023     const GlobalValue *GV =
10029   case Intrinsic::amdgcn_is_shared:
10030   case Intrinsic::amdgcn_is_private: {
10033         DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10037     unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10041         Subtarget->hasGloballyAddressableScratch()) {
10044               AMDGPU::S_MOV_B32, DL, MVT::i32,
10045               DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10054     SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10057   case Intrinsic::amdgcn_perm:
10059                        Op.getOperand(2), Op.getOperand(3));
10060 case Intrinsic::amdgcn_reloc_constant: {
10070 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10071 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10072 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10073 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10074 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10075 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10076 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10077 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10078 if (
Op.getOperand(4).getValueType() == MVT::i32)
10084 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10085 Op.getOperand(3), IndexKeyi32);
10087 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10088 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10089 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10090 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10091 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10092 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10093 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10094 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10095 if (
Op.getOperand(4).getValueType() == MVT::i64)
10101 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10102 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10103 Op.getOperand(6)});
10105 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10106 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10107 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10108 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10109 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10110 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10111 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10114 if (
Op.getOperand(6).getValueType() == IndexKeyTy)
10120 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10121 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10122 IndexKey, Op.getOperand(7),
10123 Op.getOperand(8)});
10125 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10126 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10127 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10128 if (
Op.getOperand(6).getValueType() == MVT::i32)
10134 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10135 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10136 IndexKeyi32, Op.getOperand(7)});
10138 case Intrinsic::amdgcn_addrspacecast_nonnull:
10139 return lowerADDRSPACECAST(
Op, DAG);
10140 case Intrinsic::amdgcn_readlane:
10141 case Intrinsic::amdgcn_readfirstlane:
10142 case Intrinsic::amdgcn_writelane:
10143 case Intrinsic::amdgcn_permlane16:
10144 case Intrinsic::amdgcn_permlanex16:
10145 case Intrinsic::amdgcn_permlane64:
10146 case Intrinsic::amdgcn_set_inactive:
10147 case Intrinsic::amdgcn_set_inactive_chain_arg:
10148 case Intrinsic::amdgcn_mov_dpp8:
10149 case Intrinsic::amdgcn_update_dpp:
10151   case Intrinsic::amdgcn_dead: {
10153     for (const EVT ValTy : Op.getNode()->values())
10158   if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10160     return lowerImage(Op, ImageDimIntr, DAG, false);
10171   return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10177                                                   unsigned NewOpcode) const {
10181   SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10182   auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10200                                  M->getMemOperand());
10205                                                      unsigned NewOpcode) const {
10209   SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10210   auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10228                                  M->getMemOperand());
10233   unsigned IntrID = Op.getConstantOperandVal(1);
10237   case Intrinsic::amdgcn_ds_ordered_add:
10238   case Intrinsic::amdgcn_ds_ordered_swap: {
10243     unsigned IndexOperand = M->getConstantOperandVal(7);
10244     unsigned WaveRelease = M->getConstantOperandVal(8);
10245     unsigned WaveDone = M->getConstantOperandVal(9);
10247     unsigned OrderedCountIndex = IndexOperand & 0x3f;
10248     IndexOperand &= ~0x3f;
10249     unsigned CountDw = 0;
10252       CountDw = (IndexOperand >> 24) & 0xf;
10253       IndexOperand &= ~(0xf << 24);
10255       if (CountDw < 1 || CountDw > 4) {
10258             Fn, "ds_ordered_count: dword count must be between 1 and 4",
10259             DL.getDebugLoc()));
10264     if (IndexOperand) {
10267           Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10270     if (WaveDone && !WaveRelease) {
10274           Fn, "ds_ordered_count: wave_done requires wave_release",
10275           DL.getDebugLoc()));
10278     unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10279     unsigned ShaderType =
10281     unsigned Offset0 = OrderedCountIndex << 2;
10282     unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10285       Offset1 |= (CountDw - 1) << 6;
10288       Offset1 |= ShaderType << 2;
10290     unsigned Offset = Offset0 | (Offset1 << 8);
10297                                    M->getVTList(), Ops, M->getMemoryVT(),
10298                                    M->getMemOperand());
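// Editorial note: the ds_ordered_add/swap offset operand packs several fields,
// combined as Offset0 | (Offset1 << 8): the ordered count index in the low
// byte, and the wave_release/wave_done bits, shader type, instruction kind and
// dword count in the second byte. A sketch of the same packing using the
// shifts visible above; dsOrderedOffset is a hypothetical helper, and the two
// bool flags stand in for subtarget checks that gate the count and shader-type
// fields in the real lowering:
#include <cstdint>

uint32_t dsOrderedOffset(unsigned idx, bool waveRelease, bool waveDone,
                         unsigned instruction, unsigned countDw,
                         unsigned shaderType, bool includeCount,
                         bool includeShaderType) {
  unsigned offset0 = idx << 2;
  unsigned offset1 = (waveRelease ? 1u : 0u) | ((waveDone ? 1u : 0u) << 1) |
                     (instruction << 4);
  if (includeCount)            // assumption: guarded by a subtarget check
    offset1 |= (countDw - 1) << 6;
  if (includeShaderType)       // assumption: guarded by a subtarget check
    offset1 |= shaderType << 2;
  return offset0 | (offset1 << 8);
}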
10300 case Intrinsic::amdgcn_raw_buffer_load:
10301 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10302 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10303 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10304 case Intrinsic::amdgcn_raw_buffer_load_format:
10305 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10306 const bool IsFormat =
10307 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10308 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10310 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10311 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10325 return lowerIntrinsicLoad(M, IsFormat, DAG,
Ops);
10327 case Intrinsic::amdgcn_struct_buffer_load:
10328 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10329 case Intrinsic::amdgcn_struct_buffer_load_format:
10330 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10331 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10332 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10333 const bool IsFormat =
10334 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10335 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10337 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10338 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10353 case Intrinsic::amdgcn_raw_tbuffer_load:
10354 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10356 EVT LoadVT =
Op.getValueType();
10357 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10358 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10377 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10380 case Intrinsic::amdgcn_struct_tbuffer_load:
10381 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10383 EVT LoadVT =
Op.getValueType();
10384 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10385 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10404 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10407 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10408 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10410 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10411 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10412 return lowerStructBufferAtomicIntrin(
Op, DAG,
10414 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10415 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10417 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10418 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10419 return lowerStructBufferAtomicIntrin(
Op, DAG,
10421 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10422 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10424 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10425 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10426 return lowerStructBufferAtomicIntrin(
Op, DAG,
10428 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10429 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10431 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10432 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10434 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10435 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10437 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10438 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10440 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10441 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10443 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10444 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10446 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10447 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10449 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10450 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10452 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10453 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10455 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10456 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10458 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10459 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10461 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10462 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10464 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10465 return lowerRawBufferAtomicIntrin(
Op, DAG,
10467 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10468 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10469 return lowerStructBufferAtomicIntrin(
Op, DAG,
10471 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10472 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10474 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10475 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10477 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10478 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10479 return lowerStructBufferAtomicIntrin(
Op, DAG,
10481 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10482 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10483 return lowerStructBufferAtomicIntrin(
Op, DAG,
10485 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10486 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10487 return lowerStructBufferAtomicIntrin(
Op, DAG,
10489 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10490 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10491 return lowerStructBufferAtomicIntrin(
Op, DAG,
10493 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10494 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10496 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10497 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10499 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10500 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10502 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10503 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10505 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10506 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10508 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10509 return lowerStructBufferAtomicIntrin(
Op, DAG,
10512 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10513 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10514 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
10515 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10529 EVT VT =
Op.getValueType();
10533 Op->getVTList(),
Ops, VT,
10534 M->getMemOperand());
10536 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10537 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10538 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
10539 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(6), DAG);
10553 EVT VT =
Op.getValueType();
10557 Op->getVTList(),
Ops, VT,
10558 M->getMemOperand());
10560 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10561 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10563 SDValue NodePtr =
M->getOperand(2);
10564 SDValue RayExtent =
M->getOperand(3);
10565 SDValue InstanceMask =
M->getOperand(4);
10566 SDValue RayOrigin =
M->getOperand(5);
10567 SDValue RayDir =
M->getOperand(6);
10569 SDValue TDescr =
M->getOperand(8);
10574 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10579 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10580 const unsigned NumVDataDwords = 10;
10581 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10583 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10584 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10585 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10589 Ops.push_back(NodePtr);
10592 {DAG.getBitcast(MVT::i32, RayExtent),
10593 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10594 Ops.push_back(RayOrigin);
10595 Ops.push_back(RayDir);
10596 Ops.push_back(Offsets);
10597 Ops.push_back(TDescr);
10598 Ops.push_back(
M->getChain());
10601 MachineMemOperand *MemRef =
M->getMemOperand();
10605 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10607 SDValue NodePtr =
M->getOperand(2);
10608 SDValue RayExtent =
M->getOperand(3);
10609 SDValue RayOrigin =
M->getOperand(4);
10610 SDValue RayDir =
M->getOperand(5);
10611 SDValue RayInvDir =
M->getOperand(6);
10612 SDValue TDescr =
M->getOperand(7);
10619 if (!Subtarget->hasGFX10_AEncoding()) {
10629 const unsigned NumVDataDwords = 4;
10630 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10631 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10632 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10635 const unsigned BaseOpcodes[2][2] = {
10636 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10637 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10638 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10642 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10643 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10644 : AMDGPU::MIMGEncGfx10NSA,
10645 NumVDataDwords, NumVAddrDwords);
10649 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10650 : AMDGPU::MIMGEncGfx10Default,
10651 NumVDataDwords, NumVAddrDwords);
10657   auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10660     if (Lanes[0].getValueSizeInBits() == 32) {
10661       for (unsigned I = 0; I < 3; ++I)
10668       Ops.push_back(Lanes[2]);
10680   if (UseNSA && IsGFX11Plus) {
10681     Ops.push_back(NodePtr);
10683     Ops.push_back(RayOrigin);
10688     for (unsigned I = 0; I < 3; ++I) {
10691                                  {DirLanes[I], InvDirLanes[I]})));
10695     Ops.push_back(RayDir);
10696     Ops.push_back(RayInvDir);
10703     Ops.push_back(NodePtr);
10706     packLanes(RayOrigin, true);
10707     packLanes(RayDir, true);
10708     packLanes(RayInvDir, false);
10713   if (NumVAddrDwords > 12) {
10715     Ops.append(16 - Ops.size(), Undef);
10721 Ops.push_back(MergedOps);
10724 Ops.push_back(TDescr);
10726 Ops.push_back(
M->getChain());
10729 MachineMemOperand *MemRef =
M->getMemOperand();
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
    unsigned Opcode = 0;
    case Intrinsic::amdgcn_global_atomic_fmin_num:
    case Intrinsic::amdgcn_flat_atomic_fmin_num: {
      Opcode = ISD::ATOMIC_LOAD_FMIN;
    case Intrinsic::amdgcn_global_atomic_fmax_num:
    case Intrinsic::amdgcn_flat_atomic_fmax_num: {
      Opcode = ISD::ATOMIC_LOAD_FMAX;
    return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
                         Ops, M->getMemOperand());
  case Intrinsic::amdgcn_s_get_barrier_state:
  case Intrinsic::amdgcn_s_get_named_barrier_state: {
    if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
      BarID = (BarID >> 4) & 0x3F;
    Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
    Ops.push_back(Chain);
    Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
    if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
    Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
    EVT VT = Op->getValueType(0);
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
      return lowerImage(Op, ImageDimIntr, DAG, true);
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
  EVT VT = VTList.VTs[0];
  bool IsTFE = VTList.NumVTs == 3;
  unsigned NumOpDWords = NumValueDWords + 1;
  SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
  MachineMemOperand *OpDWordsMMO =
  SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
                                   OpDWordsVT, OpDWordsMMO, DAG);
  NumValueDWords == 1
  if (!Subtarget->hasDwordx3LoadStores() &&
      (VT == MVT::v3i32 || VT == MVT::v3f32)) {
    SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
                                        WidenedMemVT, WidenedMMO);
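// Standalone sketch of the widening decision above (names are illustrative,
// not the in-tree API): a 3-dword result is loaded as 4 dwords when the
// subtarget has no dwordx3 memory operations, and the caller extracts the low
// three lanes back out of the widened value afterwards.
inline unsigned chooseLoadDwords(unsigned NumValueDwords,
                                 bool HasDwordx3LoadStores) {
  if (!HasDwordx3LoadStores && NumValueDwords == 3)
    return 4; // widen v3i32/v3f32 -> v4i32/v4f32, drop the extra lane later
  return NumValueDwords;
}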
                                          bool ImageStore) const {
  if (Subtarget->hasUnpackedD16VMem()) {
  if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
    for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
    if ((NumElements % 2) == 1) {
      unsigned I = Elts.size() / 2;
    if (NumElements == 3) {
    return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
  unsigned IntrinsicID = Op.getConstantOperandVal(1);
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp_compr: {
    if (!Subtarget->hasCompressedExport()) {
          "intrinsic not supported on subtarget", DL.getDebugLoc()));
        DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
        DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
    unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
                                   M->getMemoryVT(), M->getMemOperand());
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
                                   M->getMemoryVT(), M->getMemOperand());
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
      return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
                                   M->getMemoryVT(), M->getMemOperand());
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
      VData = handleD16VData(VData, DAG);
    auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
    EVT VDataType = VData.getValueType().getScalarType();
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
                                   M->getMemoryVT(), M->getMemOperand());
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    if (!Subtarget->hasVMemToLDSLoad())
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
    unsigned OpOffset = HasVIndex ? 1 : 0;
    SDValue VOffset = Op.getOperand(5 + OpOffset);
    unsigned Size = Op->getConstantOperandVal(4);
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
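// Illustrative restructuring of the opcode ladder above (not the in-tree
// code): the addressing form is indexed by (HasVIndex, HasVOffset) and the
// width by the byte size of the transfer. The returned strings stand in for
// the AMDGPU::BUFFER_LOAD_*_LDS_* opcode enumerators.
#include <array>
#include <string>

inline std::string selectBufferLoadLdsOpcode(unsigned Size, bool HasVIndex,
                                             bool HasVOffset) {
  static const std::array<std::string, 5> Widths = {"UBYTE", "USHORT", "DWORD",
                                                    "DWORDX3", "DWORDX4"};
  unsigned WidthIdx = Size == 1 ? 0 : Size == 2 ? 1 : Size == 4 ? 2
                      : Size == 12 ? 3 : 4;
  const char *Mode = HasVIndex ? (HasVOffset ? "BOTHEN" : "IDXEN")
                               : (HasVOffset ? "OFFEN" : "OFFSET");
  return "BUFFER_LOAD_" + Widths[WidthIdx] + "_LDS_" + Mode;
}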
    if (HasVIndex && HasVOffset)
    else if (HasVIndex)
      Ops.push_back(Op.getOperand(5));
    else if (HasVOffset)
      Ops.push_back(VOffset);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    Ops.push_back(Rsrc);
    Ops.push_back(Op.getOperand(6 + OpOffset));
    Ops.push_back(Op.getOperand(7 + OpOffset));
    unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
    MachineMemOperand *LoadMMO = M->getMemOperand();
    MachinePointerInfo StorePtrI = LoadPtrI;
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds: {
    if (!Subtarget->hasVMemToLDSLoad())
    unsigned Size = Op->getConstantOperandVal(4);
      Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
      Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
    if (LHS->isDivergent())
        RHS.getOperand(0).getValueType() == MVT::i32) {
      VOffset = RHS.getOperand(0);
    Ops.push_back(Addr);
    Ops.push_back(VOffset);
    Ops.push_back(Op.getOperand(5));
    Ops.push_back(Op.getOperand(6));
    MachineMemOperand *LoadMMO = M->getMemOperand();
    LoadPtrI.Offset = Op->getConstantOperandVal(5);
    MachinePointerInfo StorePtrI = LoadPtrI;
  case Intrinsic::amdgcn_end_cf:
                            Op->getOperand(2), Chain),
  case Intrinsic::amdgcn_s_barrier_init:
  case Intrinsic::amdgcn_s_barrier_signal_var: {
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
                       ? AMDGPU::S_BARRIER_INIT_M0
                       : AMDGPU::S_BARRIER_SIGNAL_M0;
    constexpr unsigned ShAmt = 16;
    Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
  case Intrinsic::amdgcn_s_barrier_join: {
      Opc = AMDGPU::S_BARRIER_JOIN_IMM;
      unsigned BarID = (BarVal >> 4) & 0x3F;
      Ops.push_back(Chain);
      Opc = AMDGPU::S_BARRIER_JOIN_M0;
      Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
  case Intrinsic::amdgcn_s_prefetch_data: {
      return Op.getOperand(0);
  case Intrinsic::amdgcn_s_buffer_prefetch_data: {
        Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
                                   Op->getVTList(), Ops, M->getMemoryVT(),
                                   M->getMemOperand());
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
      return lowerImage(Op, ImageDimIntr, DAG, true);
std::pair<SDValue, SDValue>
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
  auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
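// Standalone sketch of the immediate-offset split performed above, assuming
// MaxImm is a contiguous low-bit mask (e.g. 4095): keep as much of the
// combined constant as the MUBUF immediate field accepts and push the rest
// into the register offset.
#include <cstdint>

struct SplitImm {
  uint32_t ImmOffset; // fits in the instruction's immediate field
  uint32_t Overflow;  // must be materialized and added to the VOffset
};

inline SplitImm splitImmOffset(uint32_t Combined, uint32_t MaxImm) {
  uint32_t Imm = Combined & MaxImm;
  uint32_t Overflow = Combined & ~MaxImm;
  if ((int32_t)Overflow < 0) { // keep the register part non-negative
    Overflow += Imm;
    Imm = 0;
  }
  return {Imm, Overflow};
}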
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        Align Alignment) const {
  SDLoc DL(CombinedOffset);
    uint32_t Imm = C->getZExtValue();
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
    uint32_t SOffset, ImmOffset;
        TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
  SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()

SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
    return MaybePointer;

  SDValue NumRecords = Op->getOperand(3);
  auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
  std::optional<uint32_t> ConstStride = std::nullopt;
    ConstStride = ConstNode->getZExtValue();
  if (!ConstStride || *ConstStride != 0) {
      ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
                          NewHighHalf, NumRecords, Flags);
  SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
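// Simplified, standalone model of the descriptor packing done above: split
// the 64-bit base pointer, fold the 16-bit stride into the high word, then
// append num_records and the flags word. Only the fields visible in the code
// above are modelled; it assumes the base address leaves the stride bits of
// the second dword free (as a 48-bit address does).
#include <array>
#include <cstdint>

inline std::array<uint32_t, 4> buildBufferRsrc(uint64_t BasePointer,
                                               uint16_t Stride,
                                               uint32_t NumRecords,
                                               uint32_t Flags) {
  uint32_t LowHalf = static_cast<uint32_t>(BasePointer);
  uint32_t HighHalf = static_cast<uint32_t>(BasePointer >> 32);
  uint32_t NewHighHalf = HighHalf | (static_cast<uint32_t>(Stride) << 16);
  return {LowHalf, NewHighHalf, NumRecords, Flags};
}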
                                              bool IsTFE) const {
    SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
  LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
  if (VDataType == MVT::f16 || VDataType == MVT::bf16)
  Ops[1] = BufferStoreExt;
                                 M->getMemOperand());
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
         "unexpected vector extload");
         "unexpected fp extload");
    DCI.AddToWorklist(Cvt.getNode());
    DCI.AddToWorklist(Cvt.getNode());
    Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);

  if (Info.isEntryFunction())
    return Info.getUserSGPRInfo().hasFlatScratchInit();

  EVT MemVT = Load->getMemoryVT();
  MachineMemOperand *MMO = Load->getMemOperand();
    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");
  unsigned AS = Load->getAddressSpace();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
      !Subtarget->hasMultiDwordFlatScratchAddressing())
      Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
      Alignment >= Align(4) && NumElements < 32) {
        (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
    if (NumElements > 4)
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
    switch (Subtarget->getMaxPrivateElementSize()) {
      if (NumElements > 2)
      if (NumElements > 4)
      if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
    auto Flags = Load->getMemOperand()->getFlags();
            Load->getAlign(), Flags, &Fast) &&
                                        MemVT, *Load->getMemOperand())) {
  EVT VT = Op.getValueType();
    return DAG.getNode(ISD::BITCAST, DL, VT, Res);
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();
  bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
  if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
    if (CLHS->isExactlyValue(1.0)) {
    if (CLHS->isExactlyValue(-1.0)) {
  if (!AllowInaccurateRcp &&
      ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))

  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();
  bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
  if (!AllowInaccurateDiv)
  return DAG.getNode(Opcode, SL, VT, A, B, Flags);
  return DAG.getNode(Opcode, SL, VTList,
  return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
  return DAG.getNode(Opcode, SL, VTList,

  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;
  EVT VT = Op.getValueType();
  if (VT == MVT::bf16) {
  unsigned FMADOpCode =
  SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
  SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
  Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
  Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
  Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
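// Scalar model of the refinement shape used above for small-type division
// (illustrative only, not bit-accurate): extend to f32, start from an
// approximate reciprocal, then apply fused error corrections before rounding
// back down. The final quantization step the real lowering performs on Tmp is
// omitted here.
#include <cmath>

inline float fdiv16Model(float LHSExt, float RHSExt) {
  float Rcp = 1.0f / RHSExt;                   // stands in for V_RCP_F32
  float Quot = LHSExt * Rcp;                   // first guess
  float Err = std::fma(-RHSExt, Quot, LHSExt); // error of the first guess
  Quot = std::fma(Err, Rcp, Quot);             // first correction
  Err = std::fma(-RHSExt, Quot, LHSExt);       // residual error
  Quot = std::fma(Err, Rcp, Quot);             // second correction
  return Quot;                                 // caller truncates to f16/bf16
}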
  SDNodeFlags Flags = Op->getFlags();
  const APFloat K0Val(0x1p+96f);
  const APFloat K1Val(0x1p-32f);

  assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
  uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
  uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);

  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;
  SDNodeFlags Flags = Op->getFlags();
  Flags.setNoFPExcept(true);
  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
      DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
  using namespace AMDGPU::Hwreg;
  const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const DenormalMode DenormMode = Info->getMode().FP32Denormals;
  const bool HasDynamicDenormals =
  if (!PreservesDenormals) {
    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    if (HasDynamicDenormals) {
      SavedDenormMode = SDValue(GetReg, 0);
    SDNode *EnableDenorm;
    if (Subtarget->hasDenormModeInst()) {
      const SDValue EnableDenormValue =
      const SDValue EnableDenormValue =
      EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
                                        {EnableDenormValue, BitField, Glue});
                             ApproxRcp, One, NegDivScale0, Flags);
                             ApproxRcp, Fma0, Flags);
                             NumeratorScaled, Mul, Flags);
                             NumeratorScaled, Fma3, Flags);
  if (!PreservesDenormals) {
    SDNode *DisableDenorm;
    if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
      SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      assert(HasDynamicDenormals == (bool)SavedDenormMode);
      const SDValue DisableDenormValue =
          HasDynamicDenormals
          AMDGPU::S_SETREG_B32, SL, MVT::Other,
                             {Fma4, Fma1, Fma3, Scale}, Flags);

  if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
    return FastLowered;
  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
  if (!Subtarget->hasUsableDivScaleConditionOutput()) {
    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);

  EVT VT = Op.getValueType();
  if (VT == MVT::f32)
    return LowerFDIV32(Op, DAG);
  if (VT == MVT::f64)
    return LowerFDIV64(Op, DAG);
  if (VT == MVT::f16 || VT == MVT::bf16)
    return LowerFDIV16(Op, DAG);

  EVT ResultExpVT = Op->getValueType(1);
  EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
  if (Subtarget->hasFractBug()) {
  EVT VT = Store->getMemoryVT();
  if (VT == MVT::i1) {
                            Store->getBasePtr(), MVT::i1,
                            Store->getMemOperand());
         Store->getValue().getValueType().getScalarType() == MVT::i32);
  unsigned AS = Store->getAddressSpace();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    if (NumElements > 4)
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
                                        VT, *Store->getMemOperand()))
    switch (Subtarget->getMaxPrivateElementSize()) {
      if (NumElements > 2)
      if (NumElements > 4 ||
          (NumElements == 3 && !Subtarget->enableFlatScratch()))
    auto Flags = Store->getMemOperand()->getFlags();
  assert(!Subtarget->has16BitInsts());
  SDNodeFlags Flags = Op->getFlags();
      DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);

  SDNodeFlags Flags = Op->getFlags();
  MVT VT = Op.getValueType().getSimpleVT();
  SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
      DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
  SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);

  SDNodeFlags Flags = Op->getFlags();
  SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);

  EVT VT = Op.getValueType();
  if (Subtarget->hasTrigReducedRange()) {
  switch (Op.getOpcode()) {

  EVT VT = Op.getValueType();
                                 Op->getVTList(), Ops, VT,
SITargetLowering::performUCharToFloatCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
  SelectionDAG &DAG = DCI.DAG;
  EVT SrcVT = Src.getValueType();
  if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
    DCI.AddToWorklist(Cvt.getNode());
  if (ScalarVT != MVT::f32) {

                                                DAGCombinerInfo &DCI) const {
  if (SignOp.getOpcode() == ISD::FP_EXTEND ||
  SelectionDAG &DAG = DCI.DAG;
  for (unsigned I = 0; I != NumElts; ++I) {
  if (NewElts.size() == 1)
  for (unsigned I = 0; I != NumElts; ++I) {
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  AM.BaseOffs = Offset.getSExtValue();
  EVT VT = N->getValueType(0);
  Flags.setNoUnsignedWrap(
      N->getFlags().hasNoUnsignedWrap() &&
  switch (N->getOpcode()) {

                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
                                        N->getMemoryVT(), DCI);
  NewOps[PtrIdx] = NewPtr;

  return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
         (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
SDValue SITargetLowering::splitBinaryBitConstantOp(
  uint32_t ValLo = Lo_32(Val);
  uint32_t ValHi = Hi_32(Val);
  if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&

  if (V.getValueType() != MVT::i1)
  switch (V.getOpcode()) {
    return V.getResNo() == 1;
    unsigned IntrinsicID = V.getConstantOperandVal(0);
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_is_shared:
    case Intrinsic::amdgcn_is_private:

  if (!(C & 0x000000ff))
    ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00))
    ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000))
    ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000))
    ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask;
  if ((NonZeroByteMask & C) != NonZeroByteMask)

  assert(V.getValueSizeInBits() == 32);
  if (V.getNumOperands() != 2)
  switch (V.getOpcode()) {
    return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    return (0x03020100 & ~ConstMask) | ConstMask;
    return uint32_t((0x030201000c0c0c0cull << C) >> 32);
    return uint32_t(0x0c0c0c0c03020100ull >> C);
                                           DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (VT == MVT::i64 && CRHS) {
            splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
  if (CRHS && VT == MVT::i32) {
      unsigned Shift = CShift->getZExtValue();
      unsigned Offset = NB + Shift;
      if ((Offset & (Bits - 1)) == 0) {
      Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
    if (X != LHS.getOperand(1))
    const ConstantFPSDNode *C1 =
        (RHS.getOperand(0) == LHS.getOperand(0) &&
         LHS.getOperand(0) == LHS.getOperand(1))) {
      unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
                                          : Mask->getZExtValue() & OrdMask;
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    if (LHSMask != ~0u && RHSMask != ~0u) {
      if (LHSMask > RHSMask) {
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        uint32_t Mask = LHSMask & RHSMask;
        for (unsigned I = 0; I < 32; I += 8) {
          uint32_t ByteSel = 0xff << I;
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;
        uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
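// Standalone emulation of what a V_PERM_B32 selector computes, to make the
// mask arithmetic above concrete. Selector byte values 0-3 pick bytes of
// Src1, 4-7 pick bytes of Src0, and 0x0c produces a zero byte (the "unused
// lane" sentinel the combine relies on). Other selector values (sign
// replication, 0xff constant, ...) are deliberately ignored in this sketch.
#include <cstdint>

inline uint32_t permB32Model(uint32_t Sel, uint32_t Src0, uint32_t Src1) {
  uint64_t Combined = (uint64_t(Src0) << 32) | Src1; // bytes 0-3 = Src1, 4-7 = Src0
  uint32_t Result = 0;
  for (int I = 0; I < 4; ++I) {
    uint32_t SelByte = (Sel >> (8 * I)) & 0xff;
    uint32_t ResByte = 0;
    if (SelByte <= 7)
      ResByte = uint32_t((Combined >> (8 * SelByte)) & 0xff);
    // SelByte == 0x0c (and the ignored encodings) -> zero byte here.
    Result |= ResByte << (8 * I);
  }
  return Result;
}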
static const std::optional<ByteProvider<SDValue>>
                                 unsigned Depth = 0) {
    return std::nullopt;
  if (Op.getValueSizeInBits() < 8)
    return std::nullopt;
  if (Op.getValueType().isVector())
  switch (Op->getOpcode()) {
      NarrowVT = VTSign->getVT();
      return std::nullopt;
    if (SrcIndex >= NarrowByteWidth)
      return std::nullopt;
      return std::nullopt;
    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8 != 0)
      return std::nullopt;
    SrcIndex += BitShift / 8;
static const std::optional<ByteProvider<SDValue>>
                                      unsigned StartingIndex = 0) {
    return std::nullopt;
  unsigned BitWidth = Op.getScalarValueSizeInBits();
    return std::nullopt;
    return std::nullopt;
  bool IsVec = Op.getValueType().isVector();
  switch (Op.getOpcode()) {
      return std::nullopt;
      return std::nullopt;
      return std::nullopt;
    if (!LHS->isConstantZero() && !RHS->isConstantZero())
      return std::nullopt;
    if (!LHS || LHS->isConstantZero())
    if (!RHS || RHS->isConstantZero())
    return std::nullopt;
      return std::nullopt;
      return std::nullopt;
    uint32_t BitMask = BitMaskOp->getZExtValue();
    uint32_t IndexMask = 0xFF << (Index * 8);
    if ((IndexMask & BitMask) != IndexMask) {
      if (IndexMask & BitMask)
        return std::nullopt;
      return std::nullopt;
    if (!ShiftOp || Op.getValueType().isVector())
      return std::nullopt;
    uint64_t BitsProvided = Op.getValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;
    uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
      return std::nullopt;
    uint64_t ConcatSizeInBytes = BitsProvided / 4;
    uint64_t ByteShift = BitShift / 8;
    uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
    uint64_t BytesProvided = BitsProvided / 8;
    SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
    NewIndex %= BytesProvided;
      return std::nullopt;
      return std::nullopt;
    uint64_t BitShift = ShiftOp->getZExtValue();
      return std::nullopt;
    auto BitsProvided = Op.getScalarValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;
    uint64_t BytesProvided = BitsProvided / 8;
    uint64_t ByteShift = BitShift / 8;
    return BytesProvided - ByteShift > Index
      return std::nullopt;
      return std::nullopt;
    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8 != 0)
      return std::nullopt;
    uint64_t ByteShift = BitShift / 8;
    return Index < ByteShift
                                 Depth + 1, StartingIndex);
      return std::nullopt;
      NarrowBitWidth = VTSign->getVT().getSizeInBits();
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;
    if (Index >= NarrowByteWidth)
                 ? std::optional<ByteProvider<SDValue>>(
      return std::nullopt;
    if (NarrowByteWidth >= Index) {
    return std::nullopt;
      return std::nullopt;
    unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;
    if (Index >= NarrowByteWidth) {
                 ? std::optional<ByteProvider<SDValue>>(
    if (NarrowByteWidth > Index) {
    return std::nullopt;
    return std::nullopt;
                                 Depth + 1, StartingIndex);
      return std::nullopt;
    auto VecIdx = IdxOp->getZExtValue();
    auto ScalarSize = Op.getScalarValueSizeInBits();
    if (ScalarSize < 32)
      Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
                            StartingIndex, Index);
      return std::nullopt;
      return std::nullopt;
        (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
    if (IdxMask > 0x07 && IdxMask != 0x0c)
      return std::nullopt;
    auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
    auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
    return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
    return std::nullopt;
13694 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
13701 auto MemVT = L->getMemoryVT();
13704 return L->getMemoryVT().getSizeInBits() == 16;
13714 int Low8 = Mask & 0xff;
13715 int Hi8 = (Mask & 0xff00) >> 8;
13717 assert(Low8 < 8 && Hi8 < 8);
13719 bool IsConsecutive = (Hi8 - Low8 == 1);
13724 bool Is16Aligned = !(Low8 % 2);
13726 return IsConsecutive && Is16Aligned;
13734 int Low16 = PermMask & 0xffff;
13735 int Hi16 = (PermMask & 0xffff0000) >> 16;
13745 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13747 if (!OtherOpIs16Bit)
13755 unsigned DWordOffset) {
13760 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13765 if (Src.getValueType().isVector()) {
13766 auto ScalarTySize = Src.getScalarValueSizeInBits();
13767 auto ScalarTy = Src.getValueType().getScalarType();
13768 if (ScalarTySize == 32) {
13772 if (ScalarTySize > 32) {
13775 DAG.
getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13776 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13783 assert(ScalarTySize < 32);
13784 auto NumElements =
TypeSize / ScalarTySize;
13785 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13786 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13787 auto NumElementsIn32 = 32 / ScalarTySize;
13788 auto NumAvailElements = DWordOffset < Trunc32Elements
13790 : NumElements - NormalizedTrunc;
13803 auto ShiftVal = 32 * DWordOffset;
13811 [[maybe_unused]]
EVT VT =
N->getValueType(0);
13816 for (
int i = 0; i < 4; i++) {
13818 std::optional<ByteProvider<SDValue>>
P =
13821 if (!
P ||
P->isConstantZero())
13826 if (PermNodes.
size() != 4)
13829 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13830 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13832 for (
size_t i = 0; i < PermNodes.
size(); i++) {
13833 auto PermOp = PermNodes[i];
13836 int SrcByteAdjust = 4;
13840 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13841 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13843 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13844 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13848 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13849 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13852 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13854 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13857 SDValue Op = *PermNodes[FirstSrc.first].Src;
13859 assert(
Op.getValueSizeInBits() == 32);
13863 int Low16 = PermMask & 0xffff;
13864 int Hi16 = (PermMask & 0xffff0000) >> 16;
13866 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13867 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13870 if (WellFormedLow && WellFormedHi)
13874 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src :
Op;
13883 assert(
Op.getValueType().isByteSized() &&
13901 DAGCombinerInfo &DCI)
const {
13902 SelectionDAG &DAG = DCI.DAG;
13906 EVT VT =
N->getValueType(0);
13907 if (VT == MVT::i1) {
13912 if (Src !=
RHS.getOperand(0))
13917 if (!CLHS || !CRHS)
13921 static const uint32_t MaxMask = 0x3ff;
13941 Sel |=
LHS.getConstantOperandVal(2);
13950 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13954 auto usesCombinedOperand = [](SDNode *OrUse) {
13956 if (OrUse->getOpcode() != ISD::BITCAST ||
13957 !OrUse->getValueType(0).isVector())
13961 for (
auto *VUser : OrUse->users()) {
13962 if (!VUser->getValueType(0).isVector())
13969 if (VUser->getOpcode() == VectorwiseOp)
13975 if (!
any_of(
N->users(), usesCombinedOperand))
13981 if (LHSMask != ~0u && RHSMask != ~0u) {
13984 if (LHSMask > RHSMask) {
13991 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13992 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13995 if (!(LHSUsedLanes & RHSUsedLanes) &&
13998 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14000 LHSMask &= ~RHSUsedLanes;
14001 RHSMask &= ~LHSUsedLanes;
14003 LHSMask |= LHSUsedLanes & 0x04040404;
14005 uint32_t Sel = LHSMask | RHSMask;
14013 if (LHSMask == ~0u || RHSMask == ~0u) {
14054 return IdentitySrc;
14060 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14075 if (SrcVT == MVT::i32) {
14080 DCI.AddToWorklist(LowOr.
getNode());
14081 DCI.AddToWorklist(HiBits.getNode());
14085 return DAG.
getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14092 N->getOperand(0), CRHS))
14100 DAGCombinerInfo &DCI)
const {
14101 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
14108 SelectionDAG &DAG = DCI.DAG;
14110 EVT VT =
N->getValueType(0);
14111 if (CRHS && VT == MVT::i64) {
14113 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::XOR,
LHS, CRHS))
14120 unsigned Opc =
LHS.getOpcode();
14144 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32,
LHS->getOperand(1));
14146 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32,
LHS->getOperand(2));
14150 LHS->getOperand(0), FNegLHS, FNegRHS);
14151 return DAG.
getNode(ISD::BITCAST,
DL, VT, NewSelect);
14159 DAGCombinerInfo &DCI)
const {
14160 if (!Subtarget->has16BitInsts() ||
14164 EVT VT =
N->getValueType(0);
14165 if (VT != MVT::i32)
14169 if (Src.getValueType() != MVT::i16)
14176SITargetLowering::performSignExtendInRegCombine(
SDNode *
N,
14177 DAGCombinerInfo &DCI)
const {
14184 VTSign->getVT() == MVT::i8) ||
14186 VTSign->getVT() == MVT::i16))) {
14187 assert(Subtarget->hasScalarSubwordLoads() &&
14188 "s_buffer_load_{u8, i8} are supported "
14189 "in GFX12 (or newer) architectures.");
14190 EVT VT = Src.getValueType();
14195 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14202 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14203 Opc,
DL, ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
14208 VTSign->getVT() == MVT::i8) ||
14210 VTSign->getVT() == MVT::i16)) &&
14219 Src.getOperand(6), Src.getOperand(7)};
14222 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14226 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14227 Opc, SDLoc(
N), ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
14228 return DCI.DAG.getMergeValues(
14229 {BufferLoadSignExt, BufferLoadSignExt.
getValue(1)}, SDLoc(
N));
14235 DAGCombinerInfo &DCI)
const {
14236 SelectionDAG &DAG = DCI.DAG;
14243 if (
N->getOperand(0).isUndef())
14250 DAGCombinerInfo &DCI)
const {
14251 EVT VT =
N->getValueType(0);
14266 if ((VT == MVT::f16 && N0.
getOpcode() == ISD::FSQRT) &&
14276 unsigned MaxDepth)
const {
14277 unsigned Opcode =
Op.getOpcode();
14282 const auto &
F = CFP->getValueAPF();
14283 if (
F.isNaN() &&
F.isSignaling())
14285 if (!
F.isDenormal())
14311 case ISD::FP_EXTEND:
14312 case ISD::FP16_TO_FP:
14313 case ISD::FP_TO_FP16:
14314 case ISD::BF16_TO_FP:
14315 case ISD::FP_TO_BF16:
14348 if (
Op.getValueType() == MVT::i32) {
14354 if (RHS->getZExtValue() == 0xffff0000) {
14364 return Op.getValueType().getScalarType() != MVT::f16;
14368 case ISD::FMINNUM_IEEE:
14369 case ISD::FMAXNUM_IEEE:
14370 case ISD::FMINIMUM:
14371 case ISD::FMAXIMUM:
14372 case ISD::FMINIMUMNUM:
14373 case ISD::FMAXIMUMNUM:
14385 if (Subtarget->supportsMinMaxDenormModes() ||
14395 for (
unsigned I = 0, E =
Op.getNumOperands();
I != E; ++
I) {
14407 for (
unsigned i = 0, e =
Op.getNumOperands(); i != e; ++i) {
14434 if (
Op.getValueType() == MVT::i16) {
14437 TruncSrc.
getOpcode() == ISD::BITCAST &&
14445 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
14447 switch (IntrinsicID) {
14448 case Intrinsic::amdgcn_cvt_pkrtz:
14449 case Intrinsic::amdgcn_cubeid:
14450 case Intrinsic::amdgcn_frexp_mant:
14451 case Intrinsic::amdgcn_fdot2:
14452 case Intrinsic::amdgcn_rcp:
14453 case Intrinsic::amdgcn_rsq:
14454 case Intrinsic::amdgcn_rsq_clamp:
14455 case Intrinsic::amdgcn_rcp_legacy:
14456 case Intrinsic::amdgcn_rsq_legacy:
14457 case Intrinsic::amdgcn_trig_preop:
14458 case Intrinsic::amdgcn_tanh:
14459 case Intrinsic::amdgcn_log:
14460 case Intrinsic::amdgcn_exp2:
14461 case Intrinsic::amdgcn_sqrt:
14479 unsigned MaxDepth)
const {
14482 unsigned Opcode =
MI->getOpcode();
14484 if (Opcode == AMDGPU::G_FCANONICALIZE)
14487 std::optional<FPValueAndVReg> FCR;
14490 if (FCR->Value.isSignaling())
14492 if (!FCR->Value.isDenormal())
14503 case AMDGPU::G_FADD:
14504 case AMDGPU::G_FSUB:
14505 case AMDGPU::G_FMUL:
14506 case AMDGPU::G_FCEIL:
14507 case AMDGPU::G_FFLOOR:
14508 case AMDGPU::G_FRINT:
14509 case AMDGPU::G_FNEARBYINT:
14510 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14511 case AMDGPU::G_INTRINSIC_TRUNC:
14512 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14513 case AMDGPU::G_FMA:
14514 case AMDGPU::G_FMAD:
14515 case AMDGPU::G_FSQRT:
14516 case AMDGPU::G_FDIV:
14517 case AMDGPU::G_FREM:
14518 case AMDGPU::G_FPOW:
14519 case AMDGPU::G_FPEXT:
14520 case AMDGPU::G_FLOG:
14521 case AMDGPU::G_FLOG2:
14522 case AMDGPU::G_FLOG10:
14523 case AMDGPU::G_FPTRUNC:
14524 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14525 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14526 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14527 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14528 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14530 case AMDGPU::G_FNEG:
14531 case AMDGPU::G_FABS:
14532 case AMDGPU::G_FCOPYSIGN:
14534 case AMDGPU::G_FMINNUM:
14535 case AMDGPU::G_FMAXNUM:
14536 case AMDGPU::G_FMINNUM_IEEE:
14537 case AMDGPU::G_FMAXNUM_IEEE:
14538 case AMDGPU::G_FMINIMUM:
14539 case AMDGPU::G_FMAXIMUM:
14540 case AMDGPU::G_FMINIMUMNUM:
14541 case AMDGPU::G_FMAXIMUMNUM: {
14542 if (Subtarget->supportsMinMaxDenormModes() ||
14549 case AMDGPU::G_BUILD_VECTOR:
14554 case AMDGPU::G_INTRINSIC:
14555 case AMDGPU::G_INTRINSIC_CONVERGENT:
14557 case Intrinsic::amdgcn_fmul_legacy:
14558 case Intrinsic::amdgcn_fmad_ftz:
14559 case Intrinsic::amdgcn_sqrt:
14560 case Intrinsic::amdgcn_fmed3:
14561 case Intrinsic::amdgcn_sin:
14562 case Intrinsic::amdgcn_cos:
14563 case Intrinsic::amdgcn_log:
14564 case Intrinsic::amdgcn_exp2:
14565 case Intrinsic::amdgcn_log_clamp:
14566 case Intrinsic::amdgcn_rcp:
14567 case Intrinsic::amdgcn_rcp_legacy:
14568 case Intrinsic::amdgcn_rsq:
14569 case Intrinsic::amdgcn_rsq_clamp:
14570 case Intrinsic::amdgcn_rsq_legacy:
14571 case Intrinsic::amdgcn_div_scale:
14572 case Intrinsic::amdgcn_div_fmas:
14573 case Intrinsic::amdgcn_div_fixup:
14574 case Intrinsic::amdgcn_fract:
14575 case Intrinsic::amdgcn_cvt_pkrtz:
14576 case Intrinsic::amdgcn_cubeid:
14577 case Intrinsic::amdgcn_cubema:
14578 case Intrinsic::amdgcn_cubesc:
14579 case Intrinsic::amdgcn_cubetc:
14580 case Intrinsic::amdgcn_frexp_mant:
14581 case Intrinsic::amdgcn_fdot2:
14582 case Intrinsic::amdgcn_trig_preop:
14583 case Intrinsic::amdgcn_tanh:
14602 if (
C.isDenormal()) {
14616 if (
C.isSignaling()) {
14639SITargetLowering::performFCanonicalizeCombine(
SDNode *
N,
14640 DAGCombinerInfo &DCI)
const {
14641 SelectionDAG &DAG = DCI.DAG;
14643 EVT VT =
N->getValueType(0);
14652 EVT VT =
N->getValueType(0);
14653 return getCanonicalConstantFP(DAG, SDLoc(
N), VT, CFP->getValueAPF());
14669 EVT EltVT =
Lo.getValueType();
14672 for (
unsigned I = 0;
I != 2; ++
I) {
14676 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14677 }
else if (
Op.isUndef()) {
14711 case ISD::FMAXNUM_IEEE:
14712 case ISD::FMAXIMUMNUM:
14714 case ISD::FMAXIMUM:
14721 case ISD::FMINNUM_IEEE:
14722 case ISD::FMINIMUMNUM:
14724 case ISD::FMINIMUM:
  if (!MinK || !MaxK)
  if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
    return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
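// Scalar model of the med3 operation targeted above (illustrative): the
// median of three values, which lets clamp(x, lo, hi) be expressed as a
// single med3(x, lo, hi) instruction when lo <= hi.
#include <algorithm>
#include <cstdint>

inline int32_t med3I32(int32_t A, int32_t B, int32_t C) {
  return std::max(std::min(A, B), std::min(std::max(A, B), C));
}

inline int32_t clampViaMed3(int32_t X, int32_t Lo, int32_t Hi) {
  return med3I32(X, Lo, Hi); // same as std::clamp(X, Lo, Hi) for Lo <= Hi
}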
14823 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
14829 if (
Info->getMode().DX10Clamp) {
14838 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14866 case ISD::FMINNUM_IEEE:
14867 case ISD::FMAXNUM_IEEE:
14868 case ISD::FMINIMUMNUM:
14869 case ISD::FMAXIMUMNUM:
14872 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.
hasMin3Max3_16()) ||
14874 case ISD::FMINIMUM:
14875 case ISD::FMAXIMUM:
14883 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.
hasMin3Max3_16());
14892 DAGCombinerInfo &DCI)
const {
14893 SelectionDAG &DAG = DCI.DAG;
14925 if (
SDValue Med3 = performIntMed3ImmCombine(
14930 if (
SDValue Med3 = performIntMed3ImmCombine(
14936 if (
SDValue Med3 = performIntMed3ImmCombine(
14941 if (
SDValue Med3 = performIntMed3ImmCombine(
14951 if (((
Opc == ISD::FMINNUM && Op0.
getOpcode() == ISD::FMAXNUM) ||
14952 (
Opc == ISD::FMINNUM_IEEE && Op0.
getOpcode() == ISD::FMAXNUM_IEEE) ||
14953 (
Opc == ISD::FMINIMUMNUM && Op0.
getOpcode() == ISD::FMAXIMUMNUM) ||
14956 (VT == MVT::f32 || VT == MVT::f64 ||
14957 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14958 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14959 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14960 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14962 if (
SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(
N), Op0, Op1))
14969 const SDNodeFlags
Flags =
N->getFlags();
14970 if ((
Opc == ISD::FMINIMUM ||
Opc == ISD::FMAXIMUM) &&
14971 !Subtarget->hasIEEEMinimumMaximumInsts() &&
Flags.hasNoNaNs()) {
14973 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
14974 return DAG.
getNode(NewOpc, SDLoc(
N), VT, Op0, Op1, Flags);
14984 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
14985 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
14994 DAGCombinerInfo &DCI)
const {
14995 EVT VT =
N->getValueType(0);
14999 SelectionDAG &DAG = DCI.DAG;
15014 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
15018 if (
Info->getMode().DX10Clamp) {
15038 DAGCombinerInfo &DCI)
const {
15042 return DCI.DAG.getUNDEF(
N->getValueType(0));
                                              bool IsDivergentIdx,
  unsigned VecSize = EltSize * NumElem;
  if (VecSize <= 64 && EltSize < 32)
  if (IsDivergentIdx)
  unsigned NumInsts = NumElem + ((EltSize + 31) / 32) * NumElem;
  if (Subtarget->useVGPRIndexMode())
    return NumInsts <= 16;
  if (Subtarget->hasMovrel())
    return NumInsts <= 15;
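// Standalone restatement of the cost heuristic above (thresholds copied from
// the code, everything else illustrative): expanding a dynamic extract costs
// roughly one compare per element plus one select per 32-bit slice of each
// element. The expansion is accepted when the estimate stays at or below 16
// (VGPR index mode) or 15 (movrel-only subtargets).
inline unsigned dynExtractExpansionCost(unsigned EltSizeBits,
                                        unsigned NumElem) {
  return NumElem + ((EltSizeBits + 31) / 32) * NumElem;
}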
15088 SDValue Idx =
N->getOperand(
N->getNumOperands() - 1);
15103SITargetLowering::performExtractVectorEltCombine(
SDNode *
N,
15104 DAGCombinerInfo &DCI)
const {
15110 EVT ResVT =
N->getValueType(0);
15134 if (!
C ||
C->getZExtValue() != 0x1f)
15150 if (Vec.
hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15169 case ISD::FMAXNUM_IEEE:
15170 case ISD::FMINNUM_IEEE:
15171 case ISD::FMAXIMUM:
15172 case ISD::FMINIMUM: {
15178 DCI.AddToWorklist(Elt0.
getNode());
15179 DCI.AddToWorklist(Elt1.
getNode());
15206 if (Vec.
getOpcode() == ISD::BITCAST && VecVT == MVT::v2i32 && Idx) {
15210 if (KImm && KImm->getValueType(0).getSizeInBits() == 64) {
15211 uint64_t KImmValue = KImm->getZExtValue();
15213 (KImmValue >> (32 * Idx->getZExtValue())) & 0xffffffff, SL, MVT::i32);
15216 if (KFPImm && KFPImm->getValueType(0).getSizeInBits() == 64) {
15217 uint64_t KFPImmValue =
15218 KFPImm->getValueAPF().bitcastToAPInt().getZExtValue();
15219 return DAG.
getConstant((KFPImmValue >> (32 * Idx->getZExtValue())) &
15225 if (!DCI.isBeforeLegalize())
15232 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15235 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15236 unsigned EltIdx = BitIndex / 32;
15237 unsigned LeftoverBitIdx = BitIndex % 32;
15241 DCI.AddToWorklist(Cast.
getNode());
15245 DCI.AddToWorklist(Elt.
getNode());
15248 DCI.AddToWorklist(Srl.
getNode());
15252 DCI.AddToWorklist(Trunc.
getNode());
15254 if (VecEltVT == ResVT) {
15255 return DAG.
getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15266SITargetLowering::performInsertVectorEltCombine(
SDNode *
N,
15267 DAGCombinerInfo &DCI)
const {
15278 SelectionDAG &DAG = DCI.DAG;
15297 if (Src.getOpcode() == ISD::FP_EXTEND &&
15298 Src.getOperand(0).getValueType() == MVT::f16) {
15299 return Src.getOperand(0);
15303 APFloat Val = CFP->getValueAPF();
15304 bool LosesInfo =
true;
15314 DAGCombinerInfo &DCI)
const {
15315 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15316 "combine only useful on gfx8");
15318 SDValue TruncSrc =
N->getOperand(0);
15319 EVT VT =
N->getValueType(0);
15320 if (VT != MVT::f16)
15327 SelectionDAG &DAG = DCI.DAG;
15355 return DAG.
getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15358unsigned SITargetLowering::getFusedOpcode(
const SelectionDAG &DAG,
15360 const SDNode *N1)
const {
15365 if (((VT == MVT::f32 &&
15367 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15387 EVT VT =
N->getValueType(0);
15388 if (VT != MVT::i32 && VT != MVT::i64)
15394 unsigned Opc =
N->getOpcode();
15449 if (!Const ||
Hi_32(Const->getZExtValue()) !=
uint32_t(-1))
15468 DAGCombinerInfo &DCI)
const {
15471 SelectionDAG &DAG = DCI.DAG;
15472 EVT VT =
N->getValueType(0);
15482 if (!
N->isDivergent() && Subtarget->hasSMulHi())
15486 if (NumBits <= 32 || NumBits > 64)
15497 if (!Subtarget->hasFullRate64Ops()) {
15498 unsigned NumUsers = 0;
15499 for (SDNode *User :
LHS->
users()) {
15502 if (!
User->isAnyAdd())
15526 bool MulSignedLo =
false;
15527 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15536 if (VT != MVT::i64) {
15559 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15561 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15562 auto [AccumLo, AccumHi] = DAG.
SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15564 if (!MulLHSUnsigned32) {
15571 if (!MulRHSUnsigned32) {
15582 if (VT != MVT::i64)
15588SITargetLowering::foldAddSub64WithZeroLowBitsTo32(
SDNode *
N,
15589 DAGCombinerInfo &DCI)
const {
15599 SelectionDAG &DAG = DCI.DAG;
15614 unsigned Opcode =
N->getOpcode();
15615 if (Opcode == ISD::PTRADD)
15618 DAG.
getNode(Opcode, SL, MVT::i32,
Hi, ConstHi32,
N->getFlags());
static std::optional<ByteProvider<SDValue>>
  if (!Byte0 || Byte0->isConstantZero()) {
    return std::nullopt;
  if (Byte1 && !Byte1->isConstantZero()) {
    return std::nullopt;

  unsigned FirstCs = First & 0x0c0c0c0c;
  unsigned SecondCs = Second & 0x0c0c0c0c;
  unsigned FirstNoCs = First & ~0x0c0c0c0c;
  unsigned SecondNoCs = Second & ~0x0c0c0c0c;
  assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
  assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
  assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
  assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
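// Worked example for the mask merge above (values illustrative): each byte of
// a perm mask is either a real selector or the 0x0c "zero this lane"
// sentinel; the merge keeps the real selectors from both masks and keeps 0x0c
// only where both masks agree the lane is unused.
#include <cstdint>

inline uint32_t addPermMasksModel(uint32_t First, uint32_t Second) {
  uint32_t FirstCs = First & 0x0c0c0c0c;
  uint32_t SecondCs = Second & 0x0c0c0c0c;
  uint32_t FirstNoCs = First & ~0x0c0c0c0c;
  uint32_t SecondNoCs = Second & ~0x0c0c0c0c;
  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
}

// e.g. a mask supplying the two high bytes merged with one supplying the two
// low bytes: addPermMasksModel(0x07060c0c, 0x0c0c0100) == 0x07060100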
15677 for (
int BPI = 0; BPI < 2; BPI++) {
15680 BPP = {Src1, Src0};
15682 unsigned ZeroMask = 0x0c0c0c0c;
15683 unsigned FMask = 0xFF << (8 * (3 - Step));
15685 unsigned FirstMask =
15686 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15687 unsigned SecondMask =
15688 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15692 int FirstGroup = -1;
15693 for (
int I = 0;
I < 2;
I++) {
15695 auto MatchesFirst = [&BPP](
DotSrc &IterElt) {
15696 return IterElt.SrcOp == *BPP.first.Src &&
15697 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15701 if (Match != Srcs.
end()) {
15702 Match->PermMask =
addPermMasks(FirstMask, Match->PermMask);
15707 if (FirstGroup != -1) {
15709 auto MatchesSecond = [&BPP](
DotSrc &IterElt) {
15710 return IterElt.SrcOp == *BPP.second.Src &&
15711 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15714 if (Match != Srcs.
end()) {
15715 Match->PermMask =
addPermMasks(SecondMask, Match->PermMask);
15717 Srcs.
push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15725 unsigned ZeroMask = 0x0c0c0c0c;
15726 unsigned FMask = 0xFF << (8 * (3 - Step));
15730 ((Src0.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15734 ((Src1.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15743 if (Srcs.
size() == 1) {
15744 auto *Elt = Srcs.
begin();
15748 if (Elt->PermMask == 0x3020100)
15755 auto *FirstElt = Srcs.
begin();
15756 auto *SecondElt = std::next(FirstElt);
15763 auto FirstMask = FirstElt->PermMask;
15764 auto SecondMask = SecondElt->PermMask;
15766 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15767 unsigned FirstPlusFour = FirstMask | 0x04040404;
15770 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15782 FirstElt = std::next(SecondElt);
15783 if (FirstElt == Srcs.
end())
15786 SecondElt = std::next(FirstElt);
15789 if (SecondElt == Srcs.
end()) {
15795 DAG.
getConstant(FirstElt->PermMask, SL, MVT::i32)));
15801 return Perms.
size() == 2
15807 for (
auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15808 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15809 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15810 EntryMask += ZeroMask;
15815 auto Opcode =
Op.getOpcode();
15821static std::optional<bool>
15832 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15835 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15837 assert(!(S0IsUnsigned && S0IsSigned));
15838 assert(!(S1IsUnsigned && S1IsSigned));
15846 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15852 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15853 return std::nullopt;
15865 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15866 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15871 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15877 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15878 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15879 return std::nullopt;
15885 DAGCombinerInfo &DCI)
const {
15886 SelectionDAG &DAG = DCI.DAG;
15887 EVT VT =
N->getValueType(0);
15893 if (Subtarget->hasMad64_32()) {
15894 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
15899 if (
SDValue V = reassociateScalarOps(
N, DAG)) {
15903 if (VT == MVT::i64) {
15904 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
15909 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
15911 std::optional<bool> IsSigned;
15917 int ChainLength = 0;
15918 for (
int I = 0;
I < 4;
I++) {
15922 auto Src0 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15925 auto Src1 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15930 TempNode->getOperand(MulIdx), *Src0, *Src1,
15931 TempNode->getOperand(MulIdx)->getOperand(0),
15932 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15936 IsSigned = *IterIsSigned;
15937 if (*IterIsSigned != *IsSigned)
15940 auto AddIdx = 1 - MulIdx;
15943 if (
I == 2 &&
isMul(TempNode->getOperand(AddIdx))) {
15944 Src2s.
push_back(TempNode->getOperand(AddIdx));
15954 TempNode->getOperand(AddIdx), *Src0, *Src1,
15955 TempNode->getOperand(AddIdx)->getOperand(0),
15956 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15960 if (*IterIsSigned != *IsSigned)
15964 ChainLength =
I + 2;
15968 TempNode = TempNode->getOperand(AddIdx);
15970 ChainLength =
I + 1;
15971 if (TempNode->getNumOperands() < 2)
15973 LHS = TempNode->getOperand(0);
15974 RHS = TempNode->getOperand(1);
15977 if (ChainLength < 2)
15983 if (ChainLength < 4) {
15993 bool UseOriginalSrc =
false;
15994 if (ChainLength == 4 && Src0s.
size() == 1 && Src1s.
size() == 1 &&
15995 Src0s.
begin()->PermMask == Src1s.
begin()->PermMask &&
15996 Src0s.
begin()->SrcOp.getValueSizeInBits() >= 32 &&
15997 Src1s.
begin()->SrcOp.getValueSizeInBits() >= 32) {
15998 SmallVector<unsigned, 4> SrcBytes;
15999 auto Src0Mask = Src0s.
begin()->PermMask;
16000 SrcBytes.
push_back(Src0Mask & 0xFF000000);
16001 bool UniqueEntries =
true;
16002 for (
auto I = 1;
I < 4;
I++) {
16003 auto NextByte = Src0Mask & (0xFF << ((3 -
I) * 8));
16006 UniqueEntries =
false;
    if (UniqueEntries) {
      UseOriginalSrc = true;
      auto *FirstElt = Src0s.begin();
      auto *SecondElt = Src1s.begin();
                                      SecondElt->DWordOffset);
    if (!UseOriginalSrc) {
        DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
                       : Intrinsic::amdgcn_udot4,
  if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
  unsigned Opc = LHS.getOpcode();
    auto Cond = RHS.getOperand(0);
    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
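// Scalar model of the dot4 operation that the add-combine above forms
// (illustrative): four byte-wise products accumulated into a 32-bit value,
// signed or unsigned depending on how the byte providers were classified.
#include <cstdint>

inline int32_t sdot4Model(uint32_t A, uint32_t B, int32_t Acc) {
  for (int I = 0; I < 4; ++I) {
    int8_t AByte = int8_t((A >> (8 * I)) & 0xff);
    int8_t BByte = int8_t((B >> (8 * I)) & 0xff);
    Acc += int32_t(AByte) * int32_t(BByte);
  }
  return Acc;
}

inline uint32_t udot4Model(uint32_t A, uint32_t B, uint32_t Acc) {
  for (int I = 0; I < 4; ++I)
    Acc += ((A >> (8 * I)) & 0xff) * ((B >> (8 * I)) & 0xff);
  return Acc;
}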
16089 DAGCombinerInfo &DCI)
const {
16090 SelectionDAG &DAG = DCI.DAG;
16092 EVT VT =
N->getValueType(0);
16105 SDNodeFlags ShlFlags = N1->
getFlags();
16109 SDNodeFlags NewShlFlags =
16114 DCI.AddToWorklist(Inner.
getNode());
16121 if (Subtarget->hasMad64_32()) {
16122 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
16131 if (VT == MVT::i64) {
16132 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16145 if (!YIsConstant && !ZIsConstant && !
X->isDivergent() &&
16146 Y->isDivergent() !=
Z->isDivergent()) {
16155 if (
Y->isDivergent())
16158 SDNodeFlags ReassocFlags =
16161 DCI.AddToWorklist(UniformInner.
getNode());
16169 DAGCombinerInfo &DCI)
const {
16170 SelectionDAG &DAG = DCI.DAG;
16171 EVT VT =
N->getValueType(0);
16173 if (VT == MVT::i64) {
16174 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16178 if (VT != MVT::i32)
16187 unsigned Opc =
RHS.getOpcode();
16194 auto Cond =
RHS.getOperand(0);
16199 SDVTList VTList = DAG.
getVTList(MVT::i32, MVT::i1);
16217SITargetLowering::performAddCarrySubCarryCombine(
SDNode *
N,
16218 DAGCombinerInfo &DCI)
const {
16220 if (
N->getValueType(0) != MVT::i32)
16226 SelectionDAG &DAG = DCI.DAG;
16231 unsigned LHSOpc =
LHS.getOpcode();
16232 unsigned Opc =
N->getOpcode();
16236 return DAG.
getNode(
Opc, SDLoc(
N),
N->getVTList(), Args);
16242 DAGCombinerInfo &DCI)
const {
16246 SelectionDAG &DAG = DCI.DAG;
16247 EVT VT =
N->getValueType(0);
16259 if (
A ==
LHS.getOperand(1)) {
16260 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
16261 if (FusedOp != 0) {
16263 return DAG.
getNode(FusedOp, SL, VT,
A, Two,
RHS);
16271 if (
A ==
RHS.getOperand(1)) {
16272 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
16273 if (FusedOp != 0) {
16275 return DAG.
getNode(FusedOp, SL, VT,
A, Two,
LHS);
16284 DAGCombinerInfo &DCI)
const {
16288 SelectionDAG &DAG = DCI.DAG;
16290 EVT VT =
N->getValueType(0);
16303 if (
A ==
LHS.getOperand(1)) {
16304 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
16305 if (FusedOp != 0) {
16309 return DAG.
getNode(FusedOp, SL, VT,
A, Two, NegRHS);
16318 if (
A ==
RHS.getOperand(1)) {
16319 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
16320 if (FusedOp != 0) {
16322 return DAG.
getNode(FusedOp, SL, VT,
A, NegTwo,
LHS);
16331 DAGCombinerInfo &DCI)
const {
16332 SelectionDAG &DAG = DCI.DAG;
16334 EVT VT =
N->getValueType(0);
16335 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
16341 SDNodeFlags
Flags =
N->getFlags();
16342 SDNodeFlags RHSFlags =
RHS->getFlags();
16348 bool IsNegative =
false;
16349 if (CLHS->isExactlyValue(1.0) ||
16350 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16353 if (
RHS.getOpcode() == ISD::FSQRT) {
16357 return IsNegative ? DAG.
getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16366 DAGCombinerInfo &DCI)
const {
16367 SelectionDAG &DAG = DCI.DAG;
16368 EVT VT =
N->getValueType(0);
16372 if (!
N->isDivergent() &&
getSubtarget()->hasSALUFloatInsts() &&
16373 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16388 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16393 const ConstantFPSDNode *FalseNode =
16403 if (ScalarVT == MVT::f32 &&
16409 if (TrueNodeExpVal == INT_MIN)
16412 if (FalseNodeExpVal == INT_MIN)
16425 return DAG.
getNode(ISD::FLDEXP, SL, VT,
LHS, SelectNode,
N->getFlags());
16432 DAGCombinerInfo &DCI)
const {
16433 SelectionDAG &DAG = DCI.DAG;
16434 EVT VT =
N->getValueType(0);
16437 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16455 (
N->getFlags().hasAllowContract() &&
16456 FMA->getFlags().hasAllowContract())) {
16471 if (FMAOp1.
getOpcode() != ISD::FP_EXTEND ||
16490 if (Vec1 == Vec2 || Vec3 == Vec4)
16496 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16505 DAGCombinerInfo &DCI)
const {
16506 SelectionDAG &DAG = DCI.DAG;
16511 EVT VT =
LHS.getValueType();
16540 return LHS.getOperand(0);
16548 LHS.getConstantOperandVal(1) !=
LHS.getConstantOperandVal(2) &&
16555 const APInt &CT =
LHS.getConstantOperandAPInt(1);
16556 const APInt &CF =
LHS.getConstantOperandAPInt(2);
16564 return LHS.getOperand(0);
16568 if (VT != MVT::f32 && VT != MVT::f64 &&
16569 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16577 LHS.getOpcode() == ISD::FABS) {
16584 const unsigned IsInfMask =
16586 const unsigned IsFiniteMask =
16600SITargetLowering::performCvtF32UByteNCombine(
SDNode *
N,
16601 DAGCombinerInfo &DCI)
const {
16602 SelectionDAG &DAG = DCI.DAG;
16623 unsigned ShiftOffset = 8 *
Offset;
16625 ShiftOffset -=
C->getZExtValue();
16627 ShiftOffset +=
C->getZExtValue();
16629 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16631 MVT::f32, Shifted);
16642 DCI.AddToWorklist(
N);
16649 return DAG.
getNode(
N->getOpcode(), SL, MVT::f32, DemandedSrc);
16655 DAGCombinerInfo &DCI)
const {
16660 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16664 (
F.isNaN() && MF.
getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16665 return DCI.DAG.getConstantFP(Zero, SDLoc(
N),
N->getValueType(0));
16668 APFloat One(
F.getSemantics(),
"1.0");
16670 return DCI.DAG.getConstantFP(One, SDLoc(
N),
N->getValueType(0));
16676 DAGCombinerInfo &DCI)
const {
16697 bool isFloatingPoint =
LHS.getValueType().isFloatingPoint();
16698 bool isInteger =
LHS.getValueType().isInteger();
16701 if (!isFloatingPoint && !isInteger)
16706 if (!isEquality && !isNonEquality)
16723 if (isFloatingPoint) {
16725 if (!Val.
isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16736 if (!(isEquality && TrueVal == ConstVal) &&
16737 !(isNonEquality && FalseVal == ConstVal))
16744 SelectLHS, SelectRHS);
16749 switch (N->getOpcode()) {
16765 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16775 switch (N->getOpcode()) {
16777 return performAddCombine(N, DCI);
16779 return performPtrAddCombine(N, DCI);
16781 return performSubCombine(N, DCI);
16784 return performAddCarrySubCarryCombine(N, DCI);
16786 return performFAddCombine(N, DCI);
16788 return performFSubCombine(N, DCI);
16790 return performFDivCombine(N, DCI);
16792 return performFMulCombine(N, DCI);
16794 return performSetCCCombine(N, DCI);
16796 if (auto Res = performSelectCombine(N, DCI))
16801 case ISD::FMAXNUM_IEEE:
16802 case ISD::FMINNUM_IEEE:
16803 case ISD::FMAXIMUM:
16804 case ISD::FMINIMUM:
16805 case ISD::FMAXIMUMNUM:
16806 case ISD::FMINIMUMNUM:
16813 return performMinMaxCombine(N, DCI);
16815 return performFMACombine(N, DCI);
16817 return performAndCombine(N, DCI);
16819 return performOrCombine(N, DCI);
16822 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
16823 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16829 return performXorCombine(N, DCI);
16831 return performZeroExtendCombine(N, DCI);
16833 return performSignExtendInRegCombine(N, DCI);
16835 return performClassCombine(N, DCI);
16837 return performFCanonicalizeCombine(N, DCI);
16839 return performRcpCombine(N, DCI);
16854 return performUCharToFloatCombine(N, DCI);
16856 return performFCopySignCombine(N, DCI);
16861 return performCvtF32UByteNCombine(N, DCI);
16863 return performFMed3Combine(N, DCI);
16865 return performCvtPkRTZCombine(N, DCI);
16867 return performClampCombine(N, DCI);
16870 EVT VT = N->getValueType(0);
16873 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16876 EVT EltVT = Src.getValueType();
16877 if (EltVT != MVT::i16)
16878 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
16881 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
16887 return performExtractVectorEltCombine(N, DCI);
16889 return performInsertVectorEltCombine(N, DCI);
16891 return performFPRoundCombine(N, DCI);
16900 return performMemSDNodeCombine(MemNode, DCI);
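For orientation, the fragments above come from an opcode-dispatching combine hook. A minimal sketch of that shape follows (illustrative only; the class name is a placeholder and the real function handles far more opcodes, as the listing shows):

SDValue ExampleTargetLowering::PerformDAGCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  // Dispatch on the node's opcode to a per-opcode combine helper.
  switch (N->getOpcode()) {
  case ISD::ADD:
    return performAddCombine(N, DCI);
  case ISD::SETCC:
    return performSetCCCombine(N, DCI);
  default:
    break;
  }
  // Returning an empty SDValue means "no combine applied here".
  return SDValue();
}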
16931 unsigned Opcode = Node->getMachineOpcode();
16934 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16935 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
16938 SDNode *Users[5] = {nullptr};
16940 unsigned DmaskIdx =
16941 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16942 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
16943 unsigned NewDmask = 0;
16944 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16945 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16946 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
16947 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
16948 unsigned TFCLane = 0;
16949 bool HasChain = Node->getNumValues() > 1;
16951 if (OldDmask == 0) {
16959 TFCLane = OldBitsSet;
16963 for (SDUse &Use : Node->uses()) {
16966 if (Use.getResNo() != 0)
16969 SDNode *User = Use.getUser();
16972 if (!User->isMachineOpcode() ||
16973 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
16985 if (UsesTFC && Lane == TFCLane) {
16990 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
16992 Dmask &= ~(1 << Comp);
17000 NewDmask |= 1 << Comp;
17005 bool NoChannels = !NewDmask;
17012 if (OldBitsSet == 1)
17018 if (NewDmask == OldDmask)
17027 unsigned NewChannels = BitsSet + UsesTFC;
17031 assert(NewOpcode != -1 &&
17032 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17033 "failed to find equivalent MIMG op");
17041 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17043 MVT ResultVT = NewChannels == 1
17046 : NewChannels == 5 ? 8
17048 SDVTList NewVTList =
17051 MachineSDNode *NewNode =
17060 if (NewChannels == 1) {
17070 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17075 if (i || !NoChannels)
17080 if (NewUser != User) {
17090 Idx = AMDGPU::sub1;
17093 Idx = AMDGPU::sub2;
17096 Idx = AMDGPU::sub3;
17099 Idx = AMDGPU::sub4;
17110 Op = Op.getOperand(0);
17131 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17135 Node->getOperand(0), SL, VReg, SrcVal,
17141 return ToResultReg.getNode();
17146 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17148 Ops.push_back(Node->getOperand(i));
17154 Node->getOperand(i).getValueType(),
17155 Node->getOperand(i)),
17167 unsigned Opcode = Node->getMachineOpcode();
17169 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17170 !TII->isGather4(Opcode) &&
17172 return adjustWritemask(Node, DAG);
17175 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17181 case AMDGPU::V_DIV_SCALE_F32_e64:
17182 case AMDGPU::V_DIV_SCALE_F64_e64: {
17192 (Src0 == Src1 || Src0 == Src2))
17248 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17249 unsigned InitIdx = 0;
17251 if (TII->isImage(MI)) {
17259 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17260 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17261 unsigned D16Val = D16 ? D16->getImm() : 0;
17263 if (!TFEVal && !LWEVal)
17274 assert(MO_Dmask && "Expected dmask operand in instruction");
17276 unsigned dmask = MO_Dmask->getImm();
17281 bool Packed = !Subtarget->hasUnpackedD16VMem();
17283 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17289 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17290 if (DstSize < InitIdx)
17293 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17301 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17302 unsigned NewDst = 0;
17307 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17308 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17311 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17312 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17332 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17345 if (TII->isVOP3(MI.getOpcode())) {
17347 TII->legalizeOperandsVOP3(MRI, MI);
17352 if (!MI.getDesc().operands().empty()) {
17353 unsigned Opc = MI.getOpcode();
17354 bool HasAGPRs = Info->mayNeedAGPRs();
17356 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
17358 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
17359 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
17362 if ((I == Src2Idx) && (HasAGPRs))
17365 if (!Op.isReg() || !Op.getReg().isVirtual())
17367 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
17368 if (!TRI->hasAGPRs(RC))
17370 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
17371 if (!Src || !Src->isCopy() ||
17372 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
17374 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
17378 MRI.setRegClass(Op.getReg(), NewRC);
17381 if (TII->isMAI(MI)) {
17386 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17387 AMDGPU::OpName::scale_src0);
17388 if (Src0Idx != -1) {
17389 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17390 AMDGPU::OpName::scale_src1);
17391 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17392 TII->usesConstantBus(MRI, MI, Src1Idx))
17393 TII->legalizeOpWithMove(MI, Src1Idx);
17401 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
17402 if (Src2->isReg() && Src2->getReg().isVirtual()) {
17403 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
17404 if (TRI->isVectorSuperClass(RC)) {
17405 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
17406 MRI.setRegClass(Src2->getReg(), NewRC);
17407 if (Src2->isTied())
17408 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
17417 if (TII->isImage(MI))
17418 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17492std::pair<unsigned, const TargetRegisterClass *>
17499 if (Constraint.size() == 1) {
17503 if (VT == MVT::Other)
17506 switch (Constraint[0]) {
17513 RC = &AMDGPU::SReg_32RegClass;
17516 RC = &AMDGPU::SGPR_64RegClass;
17521 return std::pair(0U, nullptr);
17528 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17529 : &AMDGPU::VGPR_32_Lo256RegClass;
17532 RC = Subtarget->has1024AddressableVGPRs()
17533 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17536 return std::pair(0U, nullptr);
17541 if (!Subtarget->hasMAIInsts())
17545 RC = &AMDGPU::AGPR_32RegClass;
17550 return std::pair(0U, nullptr);
17555 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17559 RC = &AMDGPU::AV_32RegClass;
17562 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17564 return std::pair(0U, nullptr);
17573 return std::pair(0U, RC);
17576 if (Kind != '\0') {
17578 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17579 } else if (Kind == 's') {
17580 RC = &AMDGPU::SGPR_32RegClass;
17581 } else if (Kind == 'a') {
17582 RC = &AMDGPU::AGPR_32RegClass;
17588 return std::pair(0U, nullptr);
17594 return std::pair(0U, nullptr);
17598 RC = TRI->getVGPRClassForBitWidth(Width);
17600 RC = TRI->getSGPRClassForBitWidth(Width);
17602 RC = TRI->getAGPRClassForBitWidth(Width);
17604 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17609 return std::pair(0U, nullptr);
17611 return std::pair(Reg, RC);
17617 return std::pair(0U, nullptr);
17618 if (Idx < RC->getNumRegs())
17620 return std::pair(0U, nullptr);
17626 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
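As a hedged usage note (not part of the listing): the single-letter constraints resolved above ('s', 'v', 'a') are what a user writes in extended inline assembly for this target. The mnemonic below is illustrative only; the point is the constraint letters, which request an SGPR, VGPR, or AGPR respectively.

int sketch(int x) {
  int y;
  // "v" asks the compiler to place both operands in VGPRs;
  // "s" would request SGPRs instead. Instruction text is illustrative.
  __asm__ volatile("v_add_u32 %0, %1, %1" : "=v"(y) : "v"(x));
  return y;
}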
17632 if (Constraint.size() == 1) {
17633 switch (Constraint[0]) {
17643 } else if (Constraint == "DA" || Constraint == "DB") {
17651 if (Constraint.size() == 1) {
17652 switch (Constraint[0]) {
17660 } else if (Constraint.size() == 2) {
17661 if (Constraint == "VA")
17679 std::vector<SDValue> &Ops,
17694 unsigned Size = Op.getScalarValueSizeInBits();
17698 if (Size == 16 && !Subtarget->has16BitInsts())
17702 Val = C->getSExtValue();
17706 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17710 if (Size != 16 || Op.getNumOperands() != 2)
17712 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17715 Val = C->getSExtValue();
17719 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17729 if (Constraint.size() == 1) {
17730 switch (Constraint[0]) {
17745 } else if (Constraint.size() == 2) {
17746 if (Constraint == "DA") {
17747 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17748 int64_t LoBits = static_cast<int32_t>(Val);
17752 if (Constraint == "DB") {
17760 unsigned MaxSize) const {
17761 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17762 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17764 MVT VT = Op.getSimpleValueType();
17789 switch (UnalignedClassID) {
17790 case AMDGPU::VReg_64RegClassID:
17791 return AMDGPU::VReg_64_Align2RegClassID;
17792 case AMDGPU::VReg_96RegClassID:
17793 return AMDGPU::VReg_96_Align2RegClassID;
17794 case AMDGPU::VReg_128RegClassID:
17795 return AMDGPU::VReg_128_Align2RegClassID;
17796 case AMDGPU::VReg_160RegClassID:
17797 return AMDGPU::VReg_160_Align2RegClassID;
17798 case AMDGPU::VReg_192RegClassID:
17799 return AMDGPU::VReg_192_Align2RegClassID;
17800 case AMDGPU::VReg_224RegClassID:
17801 return AMDGPU::VReg_224_Align2RegClassID;
17802 case AMDGPU::VReg_256RegClassID:
17803 return AMDGPU::VReg_256_Align2RegClassID;
17804 case AMDGPU::VReg_288RegClassID:
17805 return AMDGPU::VReg_288_Align2RegClassID;
17806 case AMDGPU::VReg_320RegClassID:
17807 return AMDGPU::VReg_320_Align2RegClassID;
17808 case AMDGPU::VReg_352RegClassID:
17809 return AMDGPU::VReg_352_Align2RegClassID;
17810 case AMDGPU::VReg_384RegClassID:
17811 return AMDGPU::VReg_384_Align2RegClassID;
17812 case AMDGPU::VReg_512RegClassID:
17813 return AMDGPU::VReg_512_Align2RegClassID;
17814 case AMDGPU::VReg_1024RegClassID:
17815 return AMDGPU::VReg_1024_Align2RegClassID;
17816 case AMDGPU::AReg_64RegClassID:
17817 return AMDGPU::AReg_64_Align2RegClassID;
17818 case AMDGPU::AReg_96RegClassID:
17819 return AMDGPU::AReg_96_Align2RegClassID;
17820 case AMDGPU::AReg_128RegClassID:
17821 return AMDGPU::AReg_128_Align2RegClassID;
17822 case AMDGPU::AReg_160RegClassID:
17823 return AMDGPU::AReg_160_Align2RegClassID;
17824 case AMDGPU::AReg_192RegClassID:
17825 return AMDGPU::AReg_192_Align2RegClassID;
17826 case AMDGPU::AReg_256RegClassID:
17827 return AMDGPU::AReg_256_Align2RegClassID;
17828 case AMDGPU::AReg_512RegClassID:
17829 return AMDGPU::AReg_512_Align2RegClassID;
17830 case AMDGPU::AReg_1024RegClassID:
17831 return AMDGPU::AReg_1024_Align2RegClassID;
17847 if (Info->isEntryFunction()) {
17854 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17856 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17857 : TRI->getAlignedHighSGPRForRC(MF, 2,
17858 &AMDGPU::SGPR_64RegClass);
17859 Info->setSGPRForEXECCopy(SReg);
17861 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17862 Info->getStackPtrOffsetReg()));
17863 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17864 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17868 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17869 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17871 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17872 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17874 Info->limitOccupancy(MF);
17876 if (ST.isWave32() && !MF.empty()) {
17877 for (auto &MBB : MF) {
17878 for (auto &MI : MBB) {
17879 TII->fixImplicitOperands(MI);
17889 if (ST.needsAlignedVGPRs()) {
17890 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17896 if (NewClassID != -1)
17897 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17906 const APInt &DemandedElts,
17908 unsigned Depth) const {
17910 unsigned Opc = Op.getOpcode();
17913 unsigned IID = Op.getConstantOperandVal(0);
17915 case Intrinsic::amdgcn_mbcnt_lo:
17916 case Intrinsic::amdgcn_mbcnt_hi: {
17922 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17932 Op, Known, DemandedElts, DAG, Depth);
17948 unsigned MaxValue =
17955 unsigned BFEWidth, bool SExt, unsigned Depth) {
17959 unsigned Src1Cst = 0;
17960 if (Src1.isImm()) {
17961 Src1Cst = Src1.getImm();
17962 } else if (Src1.isReg()) {
17966 Src1Cst = Cst->Value.getZExtValue();
17977 if (Width >= BFEWidth)
17986 Known = Known.sext(BFEWidth);
17988 Known = Known.zext(BFEWidth);
17994 unsigned Depth) const {
17997 switch (MI->getOpcode()) {
17998 case AMDGPU::S_BFE_I32:
18001 case AMDGPU::S_BFE_U32:
18004 case AMDGPU::S_BFE_I64:
18007 case AMDGPU::S_BFE_U64:
18010 case AMDGPU::G_INTRINSIC:
18011 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18014 case Intrinsic::amdgcn_workitem_id_x:
18017 case Intrinsic::amdgcn_workitem_id_y:
18020 case Intrinsic::amdgcn_workitem_id_z:
18023 case Intrinsic::amdgcn_mbcnt_lo:
18024 case Intrinsic::amdgcn_mbcnt_hi: {
18036 case Intrinsic::amdgcn_groupstaticsize: {
18047 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18050 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18053 case AMDGPU::G_AMDGPU_SMED3:
18054 case AMDGPU::G_AMDGPU_UMED3: {
18055 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18082 unsigned Depth) const {
18089 AttributeList Attrs =
18091 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18118 if (Header->getAlignment() != PrefAlign)
18119 return Header->getAlignment();
18121 unsigned LoopSize = 0;
18126 LoopSize += MBB->getAlignment().value() / 2;
18129 LoopSize += TII->getInstSizeInBytes(MI);
18130 if (LoopSize > 192)
18135 if (LoopSize <= 64)
18138 if (LoopSize <= 128)
18139 return CacheLineAlign;
18145 auto I = Exit->getFirstNonDebugInstr();
18146 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18147 return CacheLineAlign;
18156 if (PreTerm == Pre->begin() ||
18157 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18161 auto ExitHead = Exit->getFirstNonDebugInstr();
18162 if (ExitHead == Exit->end() ||
18163 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18168 return CacheLineAlign;
18176 N = N->getOperand(0).getNode();
18177 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18186 switch (N->getOpcode()) {
18194 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18195 return !TRI->isSGPRReg(MRI, Reg);
18201 return !TRI->isSGPRReg(MRI, Reg);
18205 unsigned AS = L->getAddressSpace();
18209 case ISD::CALLSEQ_END:
18238 return A->readMem() && A->writeMem();
18259 switch (Ty.getScalarSizeInBits()) {
18271 const APInt &DemandedElts,
18274 unsigned Depth) const {
18279 if (Info->getMode().DX10Clamp)
18291 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18311 << "Hardware instruction generated for atomic "
18313 << " operation at memory scope " << MemScope;
18318 Type *EltTy = VT->getElementType();
18319 return VT->getNumElements() == 2 &&
18339 unsigned BW = IT->getBitWidth();
18340 return BW == 32 || BW == 64;
18354 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18355 return BW == 32 || BW == 64;
18358 if (Ty->isFloatTy() || Ty->isDoubleTy())
18362 return VT->getNumElements() == 2 &&
18363 VT->getElementType()->getPrimitiveSizeInBits() == 16;
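A hedged restatement of the type checks visible in this range (the helper name is ours, not the listing's): integer atomicrmw operations are treated as legal only at 32 or 64 bits, and xchg additionally accepts float/double, 32- or 64-bit pointers, and 2 x 16-bit vectors.

// Sketch of the integer-width rule implied by the lines above.
static bool isLegalAtomicIntWidth(unsigned BitWidth) {
  return BitWidth == 32 || BitWidth == 64;
}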
18373 bool HasSystemScope) {
18380 if (HasSystemScope) {
18389 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18402 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18428 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18441 bool HasSystemScope =
18467 if (Subtarget->hasEmulatedSystemScopeAtomics())
18483 if (!HasSystemScope &&
18484 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18496 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18504 ConstVal && ConstVal->isNullValue())
18542 if (Ty->isFloatTy()) {
18547 if (Ty->isDoubleTy()) {
18568 if (Ty->isFloatTy() &&
18569 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18582 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18586 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18590 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18595 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18600 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18604 if (Ty->isFloatTy()) {
18607 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18610 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18615 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18623 if (Subtarget->hasFlatAtomicFaddF32Inst())
18632 if (Subtarget->hasLDSFPAtomicAddF32()) {
18633 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18635 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18663 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18665 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18669 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18671 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18724 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18725 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18726 : &AMDGPU::SReg_32RegClass;
18727 if (!TRI->isSGPRClass(RC) && !isDivergent)
18728 return TRI->getEquivalentSGPRClass(RC);
18729 if (TRI->isSGPRClass(RC) && isDivergent)
18730 return TRI->getEquivalentVGPRClass(RC);
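A hedged paraphrase of the rule the two branches above implement (the helper names mirror the listing; the wrapper itself is ours): divergent values must live in vector register classes, uniform values prefer scalar ones.

// Illustrative sketch only.
const TargetRegisterClass *
pickRegClassForDivergence(const SIRegisterInfo *TRI,
                          const TargetRegisterClass *RC, bool IsDivergent) {
  if (!TRI->isSGPRClass(RC) && !IsDivergent)
    return TRI->getEquivalentSGPRClass(RC); // vector class, uniform value
  if (TRI->isSGPRClass(RC) && IsDivergent)
    return TRI->getEquivalentVGPRClass(RC); // scalar class, divergent value
  return RC;                                // already consistent
}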
18742 unsigned WaveSize) {
18747 if (!IT || IT->getBitWidth() != WaveSize)
18752 if (!Visited.insert(V).second)
18754 bool Result = false;
18755 for (const auto *U : V->users()) {
18757 if (V == U->getOperand(1)) {
18762 case Intrinsic::amdgcn_if_break:
18763 case Intrinsic::amdgcn_if:
18764 case Intrinsic::amdgcn_else:
18769 if (V == U->getOperand(0)) {
18774 case Intrinsic::amdgcn_end_cf:
18775 case Intrinsic::amdgcn_loop:
18781 Result = hasCFUser(U, Visited, WaveSize);
18790 const Value *V) const {
18792 if (CI->isInlineAsm()) {
18801 for (auto &TC : TargetConstraints) {
18815 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18843 return MRI.hasOneNonDBGUse(N0);
18850 if (I.getMetadata("amdgpu.noclobber"))
18852 if (I.getMetadata("amdgpu.last.use"))
18862 if (!Def->isMachineOpcode())
18872 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18873 PhysReg = AMDGPU::SCC;
18875 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18941 Alignment = RMW->getAlign();
18954 bool FullFlatEmulation =
18956 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18957 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
18958 RMW->getType()->isDoubleTy()));
18961 bool ReturnValueIsUsed = !AI->use_empty();
18970 if (FullFlatEmulation) {
18981 std::prev(BB->end())->eraseFromParent();
18982 Builder.SetInsertPoint(BB);
18984 Value *LoadedShared = nullptr;
18985 if (FullFlatEmulation) {
18986 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
18987 {Addr}, nullptr, "is.shared");
18988 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
18989 Builder.SetInsertPoint(SharedBB);
18990 Value *CastToLocal = Builder.CreateAddrSpaceCast(
18996 LoadedShared = Clone;
18998 Builder.CreateBr(PhiBB);
18999 Builder.SetInsertPoint(CheckPrivateBB);
19002 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19003 {Addr}, nullptr, "is.private");
19004 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19006 Builder.SetInsertPoint(PrivateBB);
19008 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19011 Value *LoadedPrivate;
19013 LoadedPrivate = Builder.CreateAlignedLoad(
19014 RMW->getType(), CastToPrivate, RMW->getAlign(),
"loaded.private");
19017 LoadedPrivate, RMW->getValOperand());
19019 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19021 auto [ResultLoad, Equal] =
19027 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19030 Builder.CreateBr(PhiBB);
19032 Builder.SetInsertPoint(GlobalBB);
19036 if (FullFlatEmulation) {
19037 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19046 if (!FullFlatEmulation) {
19051 MDNode *RangeNotPrivate =
19054 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19058 Builder.CreateBr(PhiBB);
19060 Builder.SetInsertPoint(PhiBB);
19062 if (ReturnValueIsUsed) {
19065 if (FullFlatEmulation)
19072 Builder.CreateBr(ExitBB);
19076 unsigned PtrOpIdx) {
19077 Value *PtrOp = I->getOperand(PtrOpIdx);
19084 I->setOperand(PtrOpIdx, ASCast);
19096 ConstVal && ConstVal->isNullValue()) {
19126 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19134 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19149 LoadInst *LI = Builder.CreateAlignedLoad(
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
iv Induction Variable Users
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static cl::opt< bool > UseSelectionDAGPTRADD("amdgpu-use-sdag-ptradd", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " "SelectionDAG ISel"), cl::init(false))
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned AndSaveExecOpc
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
const Function * getParent() const
Return the enclosing method, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
A parsed version of the target data layout string in and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
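These builder methods are normally chained off BuildMI. A small, hypothetical sketch (TargetOpcode::COPY is used only to keep the example target-neutral):

#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

// Emit "Dst = COPY Src" immediately before I.
static void emitCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                     const DebugLoc &DL, const TargetInstrInfo &TII,
                     Register Dst, Register Src) {
  BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), Dst).addReg(Src);
}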
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
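A sketch of how the flags and accessors above combine when allocating a MachineMemOperand for a simple 32-bit dereferenceable load; the values and the LowLevelType header path are assumptions, not taken from this file:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

// Allocate an MMO describing a 4-byte-aligned, dereferenceable 32-bit load.
static MachineMemOperand *makeLoadMMO(MachineFunction &MF) {
  return MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable,
      LLT::scalar(32), Align(4));
}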
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
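These SDValue/SDNode accessors typically appear together in DAG-combine style pattern matching. An illustrative helper, not part of this file:

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Match a single-use (add x, C) and return C through Imm.
static bool matchSingleUseAddConst(SDValue V, uint64_t &Imm) {
  if (V.getOpcode() != ISD::ADD || !V.hasOneUse())
    return false;
  if (auto *C = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
    Imm = C->getZExtValue();
    return true;
  }
  return false;
}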
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the given node can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a zero-terminated array of rounding control registers that can be attached to a strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns whether Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
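The node-construction helpers above are usually composed like this inside a lowering or combine routine; a minimal, assumed sketch:

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Build (or X, (shl Y, Amt)) with a properly typed shift amount.
static SDValue buildShiftedOr(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                              SDValue X, SDValue Y, uint64_t Amt) {
  SDValue ShAmt = DAG.getShiftAmountConstant(Amt, VT, DL);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Y, ShAmt);
  return DAG.getNode(ISD::OR, DL, VT, X, Shl);
}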
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
constexpr bool empty() const
empty - Check if the string is empty.
constexpr size_t size() const
size - Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
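StringSwitch reads as a declarative string-to-value map; a small illustrative use (the feature names here are placeholders, not from this file):

#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

static unsigned parseWaveSizeFeature(StringRef S) {
  return StringSwitch<unsigned>(S)
      .Case("wavefrontsize32", 32)
      .Case("wavefrontsize64", 64)
      .Default(0);
}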
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparisons with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
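These TargetLowering expansion helpers are commonly used as fallbacks inside a target's custom load/store lowering; a hedged sketch of that shape (the wrapper is illustrative, not from this file):

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

// Fall back to the generic two-half-store expansion for an unaligned store.
static SDValue lowerUnalignedStore(const TargetLowering &TLI, StoreSDNode *ST,
                                   SelectionDAG &DAG) {
  return TLI.expandUnalignedStore(ST, DAG);
}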
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isVoidTy() const
Return true if this is 'void'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
constexpr bool isZero() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
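The enumerators above are typically tested with simple predicates when deciding how to lower a memory access. An illustrative example only; the real helpers such as isFlatGlobalAddrSpace live in the AMDGPU namespace listed further below, and this snippet assumes the address-space definitions already pulled in by this file:

// Purely illustrative predicate over the AMDGPUAS enumerators.
static bool isFlatLikeAddrSpace(unsigned AS) {
  return AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS ||
         AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS;
}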
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ TC_RETURN_GFX_WholeWave
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
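The AMDGPU::isInlinableLiteral* helpers above take raw bit patterns. A small sketch of checking whether an f32 constant can be encoded as an inline literal; it assumes the declarations from the target's Utils/AMDGPUBaseInfo.h and is purely illustrative:

#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/bit.h"
#include <cstdint>

// Returns true if F's bit pattern is an inlinable 32-bit literal.
static bool isInlineImmF32(float F, bool HasInv2Pi) {
  int32_t Bits = llvm::bit_cast<int32_t>(F);
  return llvm::AMDGPU::isInlinableLiteral32(Bits, HasInv2Pi);
}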
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ SSUBO
RESULT, BOOL = [US]SUBO(LHS, RHS) - Overflow-aware nodes for subtraction.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ SMULO
RESULT, BOOL = [US]MULO(LHS, RHS) - Overflow-aware nodes for multiplication.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
@ System
Synchronized with respect to all concurrently executing threads.
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
NodeAddr< NodeBase * > Node
friend class Instruction
Iterator for Instructions in a BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
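A minimal sketch of how these condition-code helpers compose, assuming an LLVM build; the function name condCodeExamples is illustrative only.

#include "llvm/CodeGen/Analysis.h"    // llvm::getICmpCondCode
#include "llvm/CodeGen/ISDOpcodes.h"  // ISD::getSetCCSwappedOperands, ISD::isSignedIntSetCC
#include "llvm/IR/Instructions.h"     // ICmpInst
#include <cassert>

using namespace llvm;

static void condCodeExamples() {
  // icmp slt maps to the signed ISD::SETLT predicate.
  ISD::CondCode CC = getICmpCondCode(ICmpInst::ICMP_SLT);
  assert(CC == ISD::SETLT && ISD::isSignedIntSetCC(CC));

  // (X < Y) is the same comparison as (Y > X) with the operands swapped.
  assert(ISD::getSetCCSwappedOperands(ISD::SETLT) == ISD::SETGT);
}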
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
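A minimal sketch of the signed/unsigned width checks above, useful when deciding whether a value can be encoded as a narrow immediate; the function name is illustrative.

#include "llvm/Support/MathExtras.h"
#include <cassert>

static void immediateRangeExamples() {
  using namespace llvm;
  // isInt<N>(x) is equivalent to minIntN(N) <= x && x <= maxIntN(N).
  assert(isInt<16>(-32768) && !isInt<16>(32768));
  assert(minIntN(16) == -32768 && maxIntN(16) == 32767);
  assert(isUInt<16>(65535) && !isUInt<16>(65536));
}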
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is Skew mod Align.
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant bit, stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
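A minimal sketch exercising the bit-counting and power-of-two helpers listed above; the values are chosen purely for illustration.

#include "llvm/ADT/bit.h"             // countl_zero, countr_zero, bit_width
#include "llvm/Support/MathExtras.h"  // Log2_32, isPowerOf2_32, PowerOf2Ceil, isShiftedMask_64
#include <cassert>

static void bitCountingExamples() {
  using namespace llvm;
  assert(countr_zero(0x8u) == 3);    // 0b1000 has three trailing zeros
  assert(countl_zero(0x1u) == 31);   // 32-bit value with 31 leading zeros
  assert(bit_width(16u) == 5);       // 16 needs 5 bits
  assert(Log2_32(32) == 5);
  assert(isPowerOf2_32(64) && !isPowerOf2_32(48));
  assert(PowerOf2Ceil(5) == 8);
  assert(isShiftedMask_64(0x0FF0) && !isShiftedMask_64(0x0F0F));
}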
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
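A minimal sketch of splitting a 64-bit immediate into its 32-bit halves with Hi_32/Lo_32, the usual first step when materializing a 64-bit constant as two 32-bit moves; the constant is illustrative.

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

static void splitImm64Example() {
  uint64_t Imm = 0x1234567890ABCDEFull;
  assert(llvm::Hi_32(Imm) == 0x12345678u);
  assert(llvm::Lo_32(Imm) == 0x90ABCDEFu);
}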
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
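A hedged sketch of the constant-inspection helpers above, written as a small DAG-combine-style predicate; isSplatOfOne and its operand N are illustrative, not code from this file.

#include "llvm/CodeGen/SelectionDAGNodes.h"

using namespace llvm;

static bool isSplatOfOne(SDValue N) {
  // Look through bitcasts first, then ask for a scalar or splat constant.
  SDValue Src = peekThroughBitcasts(N);
  if (ConstantSDNode *C = isConstOrConstSplat(Src, /*AllowUndefs=*/true))
    return C->isOne();
  return false;
}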
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
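A minimal sketch of the isa<> / cast<> / dyn_cast<> idiom as applied to SDNodes; the helper name is illustrative.

#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

static uint64_t getZExtConstantOrZero(SDValue V) {
  // dyn_cast returns null when V is not a ConstantSDNode; cast<> would assert.
  if (auto *C = dyn_cast<ConstantSDNode>(V))
    return C->getZExtValue();
  return 0;
}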
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
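A minimal sketch of the range-based STLExtras helpers referenced in this list (all_of, any_of, find_if, is_contained, append_range, drop_begin); the data is illustrative.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

static void rangeHelperExamples() {
  using namespace llvm;
  SmallVector<int, 8> Vals = {2, 4, 6, 7};

  assert(!all_of(Vals, [](int V) { return V % 2 == 0; }));
  assert(any_of(Vals, [](int V) { return V > 5; }));
  assert(is_contained(Vals, 4));
  assert(find_if(Vals, [](int V) { return V % 2 != 0; }) != Vals.end());

  SmallVector<int, 8> Tail;
  append_range(Tail, drop_begin(Vals));  // copies {4, 6, 7}
  assert(Tail.size() == 3);
}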
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
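A minimal sketch of the alignment arithmetic helpers listed here (alignDown, alignTo, divideCeil, commonAlignment); the numbers are illustrative.

#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

static void alignmentExamples() {
  using namespace llvm;
  assert(alignDown(13u, 4u) == 12u);    // round down to a multiple of 4
  assert(alignTo(10, Align(8)) == 16);  // round up to a multiple of Align(8)
  assert(divideCeil(10u, 3u) == 4u);    // integer ceiling division
  // Alignment guaranteed for "base aligned to 16, plus offset 8" is 8.
  assert(commonAlignment(Align(16), 8).value() == 8);
}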
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
int popcount(T Value) noexcept
Count the number of set bits in a value.
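A minimal sketch pairing maskTrailingOnes with popcount; the values are illustrative.

#include "llvm/ADT/bit.h"             // popcount
#include "llvm/Support/MathExtras.h"  // maskTrailingOnes
#include <cassert>
#include <cstdint>

static void maskExamples() {
  uint32_t Mask = llvm::maskTrailingOnes<uint32_t>(4);  // 0b1111
  assert(Mask == 0xFu);
  assert(llvm::popcount(Mask) == 4);
}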
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
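A hedged sketch, based only on the factory signatures listed above, of describing where a preloaded value lives; the header name, the helper name, and the 10-bit mask for a packed work-item ID are assumptions about the surrounding AMDGPU code, not taken from this file.

#include "AMDGPUArgumentUsageInfo.h"  // assumed backend-local header defining ArgDescriptor

using namespace llvm;

static ArgDescriptor describeWorkItemIDX(Register Reg) {
  // Assume only the low 10 bits of the packed work-item ID register hold ID X.
  return ArgDescriptor::createRegister(Reg, /*Mask=*/0x3ffu);
}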
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
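A minimal sketch of querying and rebuilding value types through the EVT interface summarized above; the chosen types are illustrative.

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

static void evtExamples(llvm::LLVMContext &Ctx) {
  using namespace llvm;
  EVT VecVT = EVT::getVectorVT(Ctx, MVT::f16, 4);  // v4f16
  assert(VecVT.isVector() && VecVT.isFloatingPoint());
  assert(VecVT.getVectorNumElements() == 4);
  assert(VecVT.getScalarSizeInBits() == 16);
  assert(VecVT.getSizeInBits() == 64);
  assert(VecVT.getStoreSize().getFixedValue() == 8);  // bytes

  // Same shape with integer elements: v4i16.
  EVT IntVT = VecVT.changeTypeToInteger();
  assert(IntVT.getVectorElementType() == MVT::i16);
}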
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
void resetAll()
Resets the known state of all bits.
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
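A minimal sketch of composing KnownBits facts with the operations listed above; the bit patterns are made up for illustration.

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>

static void knownBitsExamples() {
  using namespace llvm;
  // An 8-bit value of the form 0b0000xxxx: high nibble known zero, low nibble unknown.
  KnownBits Known(8);
  Known.Zero = APInt(8, 0xF0);
  assert(Known.countMinLeadingZeros() == 4);

  // Zero-extending to 16 bits makes the new high bits known zero as well.
  KnownBits Wide = Known.zext(16);
  assert(Wide.countMinLeadingZeros() == 12);

  // Addition of two such values still has a conservatively known zero prefix.
  KnownBits Sum = KnownBits::add(Wide, Wide);
  assert(Sum.countMinLeadingZeros() >= 11);
}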
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
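A hedged sketch of building MachinePointerInfo records for stack accesses, in the style call lowering uses for outgoing arguments; MF, FI, and the offset are assumed to come from the surrounding lowering context.

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

using namespace llvm;

static MachinePointerInfo stackPtrInfoExamples(MachineFunction &MF, int FI) {
  // Access relative to a known frame index (e.g. a byval argument slot).
  MachinePointerInfo FixedPtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  (void)FixedPtrInfo;

  // Access at a raw byte offset from the stack pointer.
  return MachinePointerInfo::getStack(MF, /*Offset=*/16);
}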
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const