41 #include "llvm/IR/IntrinsicsAMDGPU.h"
42 #include "llvm/IR/IntrinsicsR600.h"
53 #define DEBUG_TYPE "si-lower"
59 cl::desc("Do not align and prefetch loops"),
63 "amdgpu-use-divergent-register-indexing", cl::Hidden,
64 cl::desc("Use indirect register addressing for divergent indexes"),
71 cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
86 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
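// Scan the 32-bit SGPR class for the first register that the calling-convention
// state has not yet allocated (the availability check sits on an elided line) and return it.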
87 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
89 return AMDGPU::SGPR0 + Reg;
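// SITargetLowering constructor (excerpted): register-class assignments and per-type
// operation actions, keyed on subtarget features such as 16-bit instruction support.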
161 if (Subtarget->has16BitInsts()) {
162 if (Subtarget->useRealTrue16Insts()) {
204 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
205 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
206 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
207 MVT::i1, MVT::v32i32},
211 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
212 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
213 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
214 MVT::i1, MVT::v32i32},
221 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
222 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
223 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
224 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
225 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
283 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
290 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
291 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
292 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
295 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
296 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
297 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
301 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
302 MVT::v3i16, MVT::v4i16, MVT::Other},
307 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
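// For the wide vector types listed below, the (elided) loop body defaults essentially every
// operation to Expand, keeping only a small allowlist such as loads, stores, and
// build/extract/insert operations.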
323 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
324 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
325 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
326 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
327 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
328 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
329 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
330 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
362 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
376 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
390 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
404 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
418 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
433 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
434 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
437 if (Subtarget->hasPkMovB32()) {
458 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
459 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
464 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
468 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
469 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
470 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
471 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
495 if (Subtarget->hasSMemRealTime() ||
500 if (Subtarget->has16BitInsts()) {
507 if (Subtarget->hasMadMacF32Insts())
510 if (!Subtarget->hasBFI())
514 if (!Subtarget->hasBCNT(32))
517 if (!Subtarget->hasBCNT(64))
520 if (Subtarget->hasFFBH())
523 if (Subtarget->hasFFBL())
534 if (Subtarget->hasBFE())
538 if (Subtarget->hasIntClamp())
541 if (Subtarget->hasAddNoCarry())
546 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
547 {MVT::f32, MVT::f64}, Custom);
553 {MVT::f32, MVT::f64}, Legal);
555 if (Subtarget->haveRoundOpsF64())
578 if (Subtarget->has16BitInsts()) {
627 ISD::FSIN, ISD::FROUND},
631 if (Subtarget->hasBF16TransInsts())
650 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
651 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
652 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
785 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
786 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
787 MVT::v32f16, MVT::v32bf16},
791 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
797 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
801 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
805 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
806 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
814 if (Subtarget->hasVOP3PInsts()) {
825 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
828 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
829 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
830 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
833 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
841 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
847 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
848 {MVT::v2f16, MVT::v4f16}, Custom);
854 if (Subtarget->hasPackedFP32Ops()) {
858 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
865 if (Subtarget->has16BitInsts()) {
878 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
879 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
880 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
881 MVT::v32f16, MVT::v32bf16},
886 if (Subtarget->hasVectorMulU64())
888 else if (Subtarget->hasScalarSMulU64())
891 if (Subtarget->hasMad64_32())
894 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
897 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
899 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
902 if (Subtarget->hasMinimum3Maximum3F32())
905 if (Subtarget->hasMinimum3Maximum3PKF16()) {
909 if (!Subtarget->hasMinimum3Maximum3F16())
914 if (Subtarget->hasVOP3PInsts()) {
917 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
921 if (Subtarget->hasIntMinMax64())
926 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
927 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
932 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
933 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
934 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
935 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
939 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
940 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
941 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
942 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
957 if (Subtarget->hasBF16ConversionInsts()) {
962 if (Subtarget->hasBF16PackedInsts()) {
968 if (Subtarget->hasBF16TransInsts()) {
972 if (Subtarget->hasCvtPkF16F32Inst()) {
974 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
1024 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1033 ISD::ATOMIC_CMP_SWAP,
1034 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1036 ISD::ATOMIC_LOAD_ADD,
1037 ISD::ATOMIC_LOAD_SUB,
1038 ISD::ATOMIC_LOAD_AND,
1039 ISD::ATOMIC_LOAD_OR,
1040 ISD::ATOMIC_LOAD_XOR,
1041 ISD::ATOMIC_LOAD_NAND,
1042 ISD::ATOMIC_LOAD_MIN,
1043 ISD::ATOMIC_LOAD_MAX,
1044 ISD::ATOMIC_LOAD_UMIN,
1045 ISD::ATOMIC_LOAD_UMAX,
1046 ISD::ATOMIC_LOAD_FADD,
1047 ISD::ATOMIC_LOAD_FMIN,
1048 ISD::ATOMIC_LOAD_FMAX,
1049 ISD::ATOMIC_LOAD_UINC_WRAP,
1050 ISD::ATOMIC_LOAD_UDEC_WRAP,
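// The ISD::ATOMIC_* opcodes above are part of the constructor's setTargetDAGCombine list
// (the enclosing call is elided), so atomic nodes participate in target-specific DAG combines.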
1063 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1076 EVT DestVT, EVT SrcVT) const {
1078 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1079 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1081 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1088 LLT DestTy, LLT SrcTy) const {
1089 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1090 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1092 SrcTy.getScalarSizeInBits() == 16 &&
1113 if (Subtarget->has16BitInsts()) {
1116 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1118 return VT.isInteger() ? MVT::i32 : MVT::f32;
1122 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1144 if (Size == 16 && Subtarget->has16BitInsts())
1145 return (NumElts + 1) / 2;
1151 return NumElts * ((Size + 31) / 32);
1160 unsigned &NumIntermediates, MVT &RegisterVT) const {
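// getVectorTypeBreakdownForCallingConv: split illegal vector argument types into legal
// register-sized pieces; pairs of 16-bit elements are packed into single 32-bit registers
// when 16-bit instructions are available.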
1168 if (Size == 16 && Subtarget->has16BitInsts()) {
1169 if (ScalarVT == MVT::bf16) {
1170 RegisterVT = MVT::i32;
1171 IntermediateVT = MVT::v2bf16;
1173 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1174 IntermediateVT = RegisterVT;
1176 NumIntermediates = (NumElts + 1) / 2;
1177 return NumIntermediates;
1182 IntermediateVT = RegisterVT;
1183 NumIntermediates = NumElts;
1184 return NumIntermediates;
1189 RegisterVT = MVT::i16;
1190 IntermediateVT = ScalarVT;
1191 NumIntermediates = NumElts;
1192 return NumIntermediates;
1196 RegisterVT = MVT::i32;
1197 IntermediateVT = ScalarVT;
1198 NumIntermediates = NumElts;
1199 return NumIntermediates;
1203 RegisterVT = MVT::i32;
1204 IntermediateVT = RegisterVT;
1205 NumIntermediates = NumElts * ((Size + 31) / 32);
1206 return NumIntermediates;
1211 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1216 unsigned MaxNumLanes) {
1217 assert(MaxNumLanes != 0);
1221 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1232 unsigned MaxNumLanes) {
1238 assert(ST->getNumContainedTypes() == 2 &&
1239 ST->getContainedType(1)->isIntegerTy(32));
1253 return MVT::amdgpuBufferFatPointer;
1255 DL.getPointerSizeInBits(AS) == 192)
1256 return MVT::amdgpuBufferStridedPointer;
1265 DL.getPointerSizeInBits(AS) == 160) ||
1267 DL.getPointerSizeInBits(AS) == 192))
1274 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1275 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1276 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1278 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1279 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1280 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1281 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1282 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1284 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1285 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1286 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1287 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1288 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1290 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1291 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1292 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1293 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1294 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
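// getTgtMemIntrinsic (below): describe AMDGPU memory intrinsics to SelectionDAG by filling in
// IntrinsicInfo (opcode, memVT, pointer value, and load/store/volatile flags) so they are
// treated as real memory operations.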
1333 unsigned IntrID) const {
1335 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1353 if (RsrcIntr->IsImage) {
1368 Info.ptrVal = RsrcArg;
1371 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1380 if (RsrcIntr->IsImage) {
1381 unsigned MaxNumLanes = 4;
1396 std::numeric_limits<unsigned>::max());
1406 if (RsrcIntr->IsImage) {
1427 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1429 Info.memVT = MVT::i32;
1436 case Intrinsic::amdgcn_raw_buffer_load_lds:
1437 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1438 case Intrinsic::amdgcn_struct_buffer_load_lds:
1439 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1445 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1446 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1447 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1448 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1451 std::numeric_limits<unsigned>::max());
1461 case Intrinsic::amdgcn_ds_ordered_add:
1462 case Intrinsic::amdgcn_ds_ordered_swap: {
1475 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1476 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1479 Info.ptrVal = nullptr;
1484 case Intrinsic::amdgcn_ds_append:
1485 case Intrinsic::amdgcn_ds_consume: {
1498 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1499 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1500 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1505 Info.memVT = MVT::i64;
1511 case Intrinsic::amdgcn_global_atomic_csub: {
1520 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1521 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1522 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1525 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1528 ->getElementType(0));
1536 case Intrinsic::amdgcn_global_atomic_fmin_num:
1537 case Intrinsic::amdgcn_global_atomic_fmax_num:
1538 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1539 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1540 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1541 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1551 case Intrinsic::amdgcn_flat_load_monitor_b32:
1552 case Intrinsic::amdgcn_flat_load_monitor_b64:
1553 case Intrinsic::amdgcn_flat_load_monitor_b128:
1554 case Intrinsic::amdgcn_global_load_monitor_b32:
1555 case Intrinsic::amdgcn_global_load_monitor_b64:
1556 case Intrinsic::amdgcn_global_load_monitor_b128:
1557 case Intrinsic::amdgcn_cluster_load_b32:
1558 case Intrinsic::amdgcn_cluster_load_b64:
1559 case Intrinsic::amdgcn_cluster_load_b128:
1560 case Intrinsic::amdgcn_ds_load_tr6_b96:
1561 case Intrinsic::amdgcn_ds_load_tr4_b64:
1562 case Intrinsic::amdgcn_ds_load_tr8_b64:
1563 case Intrinsic::amdgcn_ds_load_tr16_b128:
1564 case Intrinsic::amdgcn_global_load_tr6_b96:
1565 case Intrinsic::amdgcn_global_load_tr4_b64:
1566 case Intrinsic::amdgcn_global_load_tr_b64:
1567 case Intrinsic::amdgcn_global_load_tr_b128:
1568 case Intrinsic::amdgcn_ds_read_tr4_b64:
1569 case Intrinsic::amdgcn_ds_read_tr6_b96:
1570 case Intrinsic::amdgcn_ds_read_tr8_b64:
1571 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1579 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1580 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1581 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1589 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1590 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1591 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1599 case Intrinsic::amdgcn_ds_gws_init:
1600 case Intrinsic::amdgcn_ds_gws_barrier:
1601 case Intrinsic::amdgcn_ds_gws_sema_v:
1602 case Intrinsic::amdgcn_ds_gws_sema_br:
1603 case Intrinsic::amdgcn_ds_gws_sema_p:
1604 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1614 Info.memVT = MVT::i32;
1616 Info.align = Align(4);
1618 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1624 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1625 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1626 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1627 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1628 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1629 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1630 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1631 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1638 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1639 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1640 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1641 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1648 case Intrinsic::amdgcn_load_to_lds:
1649 case Intrinsic::amdgcn_global_load_lds: {
1657 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1658 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1659 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1660 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1670 Info.memVT = MVT::i32;
1672 Info.align = Align(4);
1677 case Intrinsic::amdgcn_s_prefetch_data:
1678 case Intrinsic::amdgcn_flat_prefetch:
1679 case Intrinsic::amdgcn_global_prefetch: {
1694 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1697 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1698 unsigned DstAS = I.getType()->getPointerAddressSpace();
1710 Type *&AccessTy) const {
1712 switch (II->getIntrinsicID()) {
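// getAddrModeArguments: for the intrinsics below, expose the pointer operand and accessed
// type so addressing-mode analysis (e.g. LoopStrengthReduce) can fold offsets into them.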
1713 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1714 case Intrinsic::amdgcn_cluster_load_b128:
1715 case Intrinsic::amdgcn_cluster_load_b64:
1716 case Intrinsic::amdgcn_cluster_load_b32:
1717 case Intrinsic::amdgcn_ds_append:
1718 case Intrinsic::amdgcn_ds_consume:
1719 case Intrinsic::amdgcn_ds_load_tr8_b64:
1720 case Intrinsic::amdgcn_ds_load_tr16_b128:
1721 case Intrinsic::amdgcn_ds_load_tr4_b64:
1722 case Intrinsic::amdgcn_ds_load_tr6_b96:
1723 case Intrinsic::amdgcn_ds_read_tr4_b64:
1724 case Intrinsic::amdgcn_ds_read_tr6_b96:
1725 case Intrinsic::amdgcn_ds_read_tr8_b64:
1726 case Intrinsic::amdgcn_ds_read_tr16_b64:
1727 case Intrinsic::amdgcn_ds_ordered_add:
1728 case Intrinsic::amdgcn_ds_ordered_swap:
1729 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1730 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1731 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1732 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1733 case Intrinsic::amdgcn_flat_load_monitor_b128:
1734 case Intrinsic::amdgcn_flat_load_monitor_b32:
1735 case Intrinsic::amdgcn_flat_load_monitor_b64:
1736 case Intrinsic::amdgcn_global_atomic_csub:
1737 case Intrinsic::amdgcn_global_atomic_fmax_num:
1738 case Intrinsic::amdgcn_global_atomic_fmin_num:
1739 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1740 case Intrinsic::amdgcn_global_load_monitor_b128:
1741 case Intrinsic::amdgcn_global_load_monitor_b32:
1742 case Intrinsic::amdgcn_global_load_monitor_b64:
1743 case Intrinsic::amdgcn_global_load_tr_b64:
1744 case Intrinsic::amdgcn_global_load_tr_b128:
1745 case Intrinsic::amdgcn_global_load_tr4_b64:
1746 case Intrinsic::amdgcn_global_load_tr6_b96:
1747 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1748 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1749 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1750 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1751 Ptr = II->getArgOperand(0);
1753 case Intrinsic::amdgcn_load_to_lds:
1754 case Intrinsic::amdgcn_global_load_lds:
1755 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1756 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1757 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1758 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1759 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1760 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1761 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1762 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1763 Ptr = II->getArgOperand(1);
1768 AccessTy = II->getType();
1774 unsigned AddrSpace) const {
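// isLegalFlatAddressingMode: FLAT addressing allows only a base register plus an immediate
// offset, and the offset must be encodable for the given address space and flat variant.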
1775 if (!Subtarget->hasFlatInstOffsets()) {
1786 return AM.Scale == 0 &&
1787 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1788 AM.BaseOffs, AddrSpace, FlatVariant));
1792 if (Subtarget->hasFlatGlobalInsts())
1795 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1808 return isLegalMUBUFAddressingMode(AM);
1811 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1822 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1834 if (AM.HasBaseReg) {
1866 return isLegalMUBUFAddressingMode(AM);
1868 if (!Subtarget->hasScalarSubwordLoads()) {
1873 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1921 return Subtarget->enableFlatScratch()
1923 : isLegalMUBUFAddressingMode(AM);
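// allowsMisalignedMemoryAccessesImpl (below): per-address-space alignment rules; DS/LDS
// accesses have size-dependent required alignments unless the unaligned-access features
// are enabled.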
1970 unsigned Size, unsigned AddrSpace, Align Alignment,
1979 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1982 Align RequiredAlignment(
1984 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1985 Alignment < RequiredAlignment)
2000 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2006 RequiredAlignment = Align(4);
2008 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2024 *IsFast = (Alignment >= RequiredAlignment) ? 64
2025 : (Alignment < Align(4)) ? 32
2032 if (!Subtarget->hasDS96AndDS128())
2038 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2047 *IsFast = (Alignment >= RequiredAlignment) ? 96
2048 : (Alignment < Align(4)) ? 32
2055 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2061 RequiredAlignment = Align(8);
2063 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2072 *IsFast = (Alignment >= RequiredAlignment) ? 128
2073 : (Alignment < Align(4)) ? 32
2090 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2092 return Alignment >= RequiredAlignment ||
2093 Subtarget->hasUnalignedDSAccessEnabled();
2101 bool AlignedBy4 = Alignment >= Align(4);
2102 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2104 *IsFast = AlignedBy4 ? Size : 1;
2109 *IsFast = AlignedBy4;
2120 return Alignment >= Align(4) ||
2121 Subtarget->hasUnalignedBufferAccessEnabled();
2133 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2148 return Size >= 32 && Alignment >= Align(4);
2153 unsigned *IsFast) const {
2155 Alignment, Flags, IsFast);
2160 const AttributeList &FuncAttributes) const {
2166 if (Op.size() >= 16 &&
2170 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2188 unsigned DestAS) const {
2191 Subtarget->hasGloballyAddressableScratch()) {
2221 unsigned Index) const {
2237 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2264 auto [InputPtrReg, RC, ArgTy] =
2274 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2280 const SDLoc &SL) const {
2287 const SDLoc &SL) const {
2290 std::optional<uint32_t> KnownSize =
2292 if (KnownSize.has_value())
2318 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
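// lowerKernargMemParameter (below): load a kernel argument from the kernarg segment;
// sub-dword arguments are loaded as an aligned 32-bit word and the wanted bytes are
// shifted out and converted to the argument type.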
2327SDValue SITargetLowering::lowerKernargMemParameter(
2339 int64_t OffsetDiff = Offset - AlignDownOffset;
2345 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2354 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2355 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2365 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2374 const SDLoc &SL) const {
2384 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2443 ExtType, SL, VA.getLocVT(), Chain, FIN,
2446 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2447 if (ConvertedVal == ArgValue)
2448 return ConvertedVal;
2453SDValue SITargetLowering::lowerWorkGroupId(
2458 if (!Subtarget->hasClusters())
2459 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2467 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2468 SDLoc SL(ClusterIdXYZ);
2469 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2472 SDValue ClusterWorkGroupIdXYZ =
2473 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2483 return ClusterIdXYZ;
2485 using namespace AMDGPU::Hwreg;
2489 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
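// getPreloadedValue: map a preloaded-argument kind to the register (or constant) that carries
// it; cluster workgroup IDs and max IDs fold to constants when the cluster dimensions are
// statically known.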
2500SDValue SITargetLowering::getPreloadedValue(
2503 const ArgDescriptor *Reg = nullptr;
2504 const TargetRegisterClass *RC;
2508 const ArgDescriptor WorkGroupIDX =
2516 const ArgDescriptor WorkGroupIDZ =
2518 const ArgDescriptor ClusterWorkGroupIDX =
2520 const ArgDescriptor ClusterWorkGroupIDY =
2522 const ArgDescriptor ClusterWorkGroupIDZ =
2524 const ArgDescriptor ClusterWorkGroupMaxIDX =
2526 const ArgDescriptor ClusterWorkGroupMaxIDY =
2528 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2530 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2533 auto LoadConstant = [&](unsigned N) {
2537 if (Subtarget->hasArchitectedSGPRs() &&
2544 Reg = &WorkGroupIDX;
2545 RC = &AMDGPU::SReg_32RegClass;
2549 Reg = &WorkGroupIDY;
2550 RC = &AMDGPU::SReg_32RegClass;
2554 Reg = &WorkGroupIDZ;
2555 RC = &AMDGPU::SReg_32RegClass;
2559 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2560 return LoadConstant(0);
2561 Reg = &ClusterWorkGroupIDX;
2562 RC = &AMDGPU::SReg_32RegClass;
2566 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2567 return LoadConstant(0);
2568 Reg = &ClusterWorkGroupIDY;
2569 RC = &AMDGPU::SReg_32RegClass;
2573 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2574 return LoadConstant(0);
2575 Reg = &ClusterWorkGroupIDZ;
2576 RC = &AMDGPU::SReg_32RegClass;
2581 return LoadConstant(ClusterDims.getDims()[0] - 1);
2582 Reg = &ClusterWorkGroupMaxIDX;
2583 RC = &AMDGPU::SReg_32RegClass;
2588 return LoadConstant(ClusterDims.getDims()[1] - 1);
2589 Reg = &ClusterWorkGroupMaxIDY;
2590 RC = &AMDGPU::SReg_32RegClass;
2595 return LoadConstant(ClusterDims.getDims()[2] - 1);
2596 Reg = &ClusterWorkGroupMaxIDZ;
2597 RC = &AMDGPU::SReg_32RegClass;
2601 Reg = &ClusterWorkGroupMaxFlatID;
2602 RC = &AMDGPU::SReg_32RegClass;
2633 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2637 "vector type argument should have been split");
2642 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2650 "unexpected vector split in ps argument type");
2664 Info->markPSInputAllocated(PSInputNum);
2666 Info->markPSInputEnabled(PSInputNum);
2682 if (Info.hasWorkItemIDX()) {
2688 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2692 if (Info.hasWorkItemIDY()) {
2693 assert(Info.hasWorkItemIDX());
2694 if (Subtarget->hasPackedTID()) {
2695 Info.setWorkItemIDY(
2698 unsigned Reg = AMDGPU::VGPR1;
2706 if (Info.hasWorkItemIDZ()) {
2707 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2708 if (Subtarget->hasPackedTID()) {
2709 Info.setWorkItemIDZ(
2712 unsigned Reg = AMDGPU::VGPR2;
2732 if (RegIdx == ArgVGPRs.size()) {
2739 unsigned Reg = ArgVGPRs[RegIdx];
2751 unsigned NumArgRegs) {
2754 if (RegIdx == ArgSGPRs.size())
2757 unsigned Reg = ArgSGPRs[RegIdx];
2799 const unsigned Mask = 0x3ff;
2802 if (Info.hasWorkItemIDX()) {
2804 Info.setWorkItemIDX(Arg);
2807 if (Info.hasWorkItemIDY()) {
2809 Info.setWorkItemIDY(Arg);
2812 if (Info.hasWorkItemIDZ())
2824 const unsigned Mask = 0x3ff;
2833 auto &ArgInfo = Info.getArgInfo();
2845 if (Info.hasImplicitArgPtr())
2853 if (Info.hasWorkGroupIDX())
2856 if (Info.hasWorkGroupIDY())
2859 if (Info.hasWorkGroupIDZ())
2862 if (Info.hasLDSKernelId())
2873 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2874 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2880 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2881 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2886 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2887 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2893 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2899 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2908 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2913 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2914 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2919 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2920 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2935 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2937 bool InPreloadSequence = true;
2939 bool AlignedForImplictArgs = false;
2940 unsigned ImplicitArgOffset = 0;
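// Walk the kernel arguments in order, assigning user SGPRs to arguments marked inreg until
// the contiguous preload sequence is broken; hidden arguments are aligned to the
// implicit-argument base first.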
2941 for (auto &Arg : F.args()) {
2942 if (!InPreloadSequence || !Arg.hasInRegAttr())
2945 unsigned ArgIdx = Arg.getArgNo();
2948 if (InIdx < Ins.size() &&
2949 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2952 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2953 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2955 assert(ArgLocs[ArgIdx].isMemLoc());
2956 auto &ArgLoc = ArgLocs[InIdx];
2958 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2960 unsigned NumAllocSGPRs =
2961 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2964 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2965 if (!AlignedForImplictArgs) {
2967 alignTo(LastExplicitArgOffset,
2968 Subtarget->getAlignmentForImplicitArgPtr()) -
2969 LastExplicitArgOffset;
2970 AlignedForImplictArgs = true;
2972 ArgOffset += ImplicitArgOffset;
2976 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2977 assert(InIdx >= 1 && "No previous SGPR");
2978 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2979 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2983 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2984 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2987 InPreloadSequence = false;
2993 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2995 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2997 if (PreloadRegs->size() > 1)
2998 RC = &AMDGPU::SGPR_32RegClass;
2999 for (auto &Reg : *PreloadRegs) {
3005 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3014 if (Info.hasLDSKernelId()) {
3015 Register Reg = Info.addLDSKernelId();
3016 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3025 bool IsShader) const {
3026 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3027 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
3033 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3035 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3039 unsigned NumRequiredSystemSGPRs =
3040 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3041 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3042 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3043 Register Reg = Info.addReservedUserSGPR();
3044 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3049 if (!HasArchitectedSGPRs) {
3050 if (Info.hasWorkGroupIDX()) {
3051 Register Reg = Info.addWorkGroupIDX();
3052 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3056 if (Info.hasWorkGroupIDY()) {
3057 Register Reg = Info.addWorkGroupIDY();
3058 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3062 if (Info.hasWorkGroupIDZ()) {
3063 Register Reg = Info.addWorkGroupIDZ();
3064 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3069 if (Info.hasWorkGroupInfo()) {
3070 Register Reg = Info.addWorkGroupInfo();
3071 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3075 if (Info.hasPrivateSegmentWaveByteOffset()) {
3077 unsigned PrivateSegmentWaveByteOffsetReg;
3080 PrivateSegmentWaveByteOffsetReg =
3081 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3085 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3087 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3090 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3092 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3093 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3096 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
3097 Info.getNumPreloadedSGPRs() >= 16);
3112 if (HasStackObjects)
3113 Info.setHasNonSpillStackObjects(true);
3118 HasStackObjects = true;
3122 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3124 if (!ST.enableFlatScratch()) {
3125 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3132 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3134 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3144 Info.setScratchRSrcReg(ReservedBufferReg);
3163 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3164 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3171 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3172 if (!MRI.isLiveIn(Reg)) {
3173 Info.setStackPtrOffsetReg(Reg);
3178 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3185 if (ST.getFrameLowering()->hasFP(MF)) {
3186 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3202 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3211 if (AMDGPU::SReg_64RegClass.contains(*I))
3212 RC = &AMDGPU::SGPR_64RegClass;
3213 else if (AMDGPU::SReg_32RegClass.contains(*I))
3214 RC = &AMDGPU::SGPR_32RegClass;
3220 Entry->addLiveIn(*I);
3225 for (auto *Exit : Exits)
3227 TII->get(TargetOpcode::COPY), *I)
3242 bool IsError = false;
3246 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3264 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3265 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3267 if (!Subtarget->enableFlatScratch())
3272 !Subtarget->hasArchitectedSGPRs())
3273 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3274 !Info->hasWorkGroupIDZ());
3277 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3295 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3296 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3299 Info->markPSInputAllocated(0);
3300 Info->markPSInputEnabled(0);
3302 if (Subtarget->isAmdPalOS()) {
3311 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3312 if ((PsInputBits & 0x7F) == 0 ||
3313 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3316 } else if (IsKernel) {
3317 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3319 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3329 if (IsKernel && Subtarget->hasKernargPreload())
3333 } else if (!IsGraphics) {
3338 if (!Subtarget->enableFlatScratch())
3350 Info->setNumWaveDispatchSGPRs(
3352 Info->setNumWaveDispatchVGPRs(
3354 } else if (Info->getNumKernargPreloadedSGPRs()) {
3355 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3360 if (IsWholeWaveFunc) {
3362 {MVT::i1, MVT::Other}, Chain);
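// Main argument lowering loop: kernel arguments come from the kernarg segment or from
// preloaded SGPRs, while non-entry functions take arguments from registers and stack
// locations assigned by the calling convention.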
3374 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3385 if (IsEntryFunc && VA.isMemLoc()) {
3408 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3412 int64_t OffsetDiff = Offset - AlignDownOffset;
3419 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3429 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3430 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3431 Ins[i].Flags.isSExt(), &Ins[i]);
3439 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3442 if (PreloadRegs.size() == 1) {
3443 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3448 TRI->getRegSizeInBits(*RC)));
3456 for (auto Reg : PreloadRegs) {
3463 PreloadRegs.size()),
3480 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3481 Ins[i].Flags.isSExt(), &Ins[i]);
3493 "hidden argument in kernel signature was not preloaded",
3499 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3500 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3520 if (!IsEntryFunc && VA.isMemLoc()) {
3521 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3532 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3533 RC = &AMDGPU::VGPR_32RegClass;
3534 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3535 RC = &AMDGPU::SGPR_32RegClass;
3555 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3571 Info->setBytesInStackArgArea(StackArgSize);
3573 return Chains.empty() ? Chain
3582 const Type *RetTy) const {
3590 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3595 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3596 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3597 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3598 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3621 Info->setIfReturnsVoid(Outs.empty());
3622 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3641 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3642 ++I, ++RealRVLocIdx) {
3646 SDValue Arg = OutVals[RealRVLocIdx];
3669 ReadFirstLane, Arg);
3676 if (!Info->isEntryFunction()) {
3682 if (AMDGPU::SReg_64RegClass.contains(*I))
3684 else if (AMDGPU::SReg_32RegClass.contains(*I))
3785 auto &ArgUsageInfo =
3787 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3813 const auto [OutgoingArg, ArgRC, ArgTy] =
3818 const auto [IncomingArg, IncomingArgRC, Ty] =
3820 assert(IncomingArgRC == ArgRC);
3823 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3831 InputReg = getImplicitArgPtr(DAG, DL);
3833 std::optional<uint32_t> Id =
3835 if (Id.has_value()) {
3846 if (OutgoingArg->isRegister()) {
3847 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3848 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3851 unsigned SpecialArgOffset =
3862 auto [OutgoingArg, ArgRC, Ty] =
3865 std::tie(OutgoingArg, ArgRC, Ty) =
3868 std::tie(OutgoingArg, ArgRC, Ty) =
3883 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3884 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3885 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3890 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3898 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3908 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3917 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3918 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3929 : IncomingArgY ? *IncomingArgY
3936 if (OutgoingArg->isRegister()) {
3938 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
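// Tail-call eligibility checks follow: reject divergent callees, byval arguments, mismatched
// callee-saved register masks, and divergent values assigned to SGPR argument registers.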
3964 if (Callee->isDivergent())
3971 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3975 if (!CallerPreserved)
3978 bool CCMatch = CallerCC == CalleeCC;
3991 if (Arg.hasByValAttr())
4005 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4006 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4015 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4028 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4030 if (!CCVA.isRegLoc())
4035 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4037 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4061 enum ChainCallArgIdx {
4083 bool UsesDynamicVGPRs = false;
4084 if (IsChainCallConv) {
4089 auto RequestedExecIt =
4091 return Arg.OrigArgIndex == 2;
4093 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4095 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4098 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4101 "Haven't popped all the special args");
4104 CLI.Args[ChainCallArgIdx::Exec];
4105 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4113 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4115 ChainCallSpecialArgs.push_back(Arg.Node);
4118 PushNodeOrTargetConstant(RequestedExecArg);
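// Chain-call flags handling follows: flags == 0 forbids extra arguments, while the
// dynamic-VGPR form (wave32 only) forwards the NumVGPRs and fallback operands as
// additional special arguments.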
4124 if (FlagsValue.isZero()) {
4125 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4127 "no additional args allowed if flags == 0");
4129 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4133 if (!Subtarget->isWave32()) {
4135 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4138 UsesDynamicVGPRs = true;
4139 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4140 CLI.Args.end(), PushNodeOrTargetConstant);
4149 bool IsSibCall = false;
4163 "unsupported call to variadic function ");
4171 "unsupported required tail call to function ");
4176 Outs, OutVals, Ins, DAG);
4180 "site marked musttail or on llvm.amdgcn.cs.chain");
4187 if (!TailCallOpt && IsTailCall)
4227 auto *TRI = Subtarget->getRegisterInfo();
4234 if (!IsSibCall || IsChainCallConv) {
4235 if (!Subtarget->enableFlatScratch()) {
4241 RegsToPass.emplace_back(IsChainCallConv
4242 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4243 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4250 const unsigned NumSpecialInputs = RegsToPass.size();
4252 MVT PtrVT = MVT::i32;
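// Outgoing argument loop: register arguments are collected in RegsToPass; stack arguments
// (including byval copies) are stored relative to the stack pointer before the call.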
4255 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4283 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4291 int32_t Offset = LocMemOffset;
4298 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4304 ? Flags.getNonZeroByValAlign()
4331 if (Outs[i].Flags.isByVal()) {
4333 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4336 Outs[i].Flags.getNonZeroByValAlign(),
4338 nullptr, std::nullopt, DstInfo,
4344 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4350 if (!MemOpChains.empty())
4358 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4366 unsigned ArgIdx = 0;
4367 for (auto [Reg, Val] : RegsToPass) {
4368 if (ArgIdx++ >= NumSpecialInputs &&
4369 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4395 if (IsTailCall && !IsSibCall) {
4400 std::vector<SDValue> Ops({Chain});
4406 Ops.push_back(Callee);
4423 Ops.push_back(Callee);
4434 if (IsChainCallConv)
4439 for (auto &[Reg, Val] : RegsToPass)
4443 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4444 assert(Mask && "Missing call preserved mask for calling convention");
4454 MVT::Glue, GlueOps),
4459 Ops.push_back(InGlue);
4479 if (Info->isWholeWaveFunction())
4487 Chain = Call.getValue(0);
4488 InGlue = Call.getValue(1);
4490 uint64_t CalleePopBytes = NumBytes;
4511 EVT VT = Op.getValueType();
4525 "Stack grows upwards for AMDGPU");
4527 Chain = BaseAddr.getValue(1);
4529 if (Alignment > StackAlign) {
4531 << Subtarget->getWavefrontSizeLog2();
4532 uint64_t StackAlignMask = ScaledAlignment - 1;
4539 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4545 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4556 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4572 if (Op.getValueType() != MVT::i32)
4591 assert(Op.getValueType() == MVT::i32);
4600 Op.getOperand(0), IntrinID, GetRoundBothImm);
4634 SDValue RoundModeTimesNumBits =
4654 TableEntry, EnumOffset);
4670 static_cast<uint32_t>(ConstMode->getZExtValue()),
4682 if (UseReducedTable) {
4688 SDValue RoundModeTimesNumBits =
4708 SDValue RoundModeTimesNumBits =
4717 NewMode = TruncTable;
4726 ReadFirstLaneID, NewMode);
4739 IntrinID, RoundBothImm, NewMode);
4745 if (Op->isDivergent() &&
4746 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4756 if (Subtarget->hasSafeSmemPrefetch())
4764 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4773 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4774 EVT SrcVT = Src.getValueType();
4783 EVT DstVT = Op.getValueType();
4787 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4792 if (Op.getValueType() != MVT::i64)
4806 Op.getOperand(0), IntrinID, ModeHwRegImm);
4808 Op.getOperand(0), IntrinID, TrapHwRegImm);
4815 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4822 if (Op.getOperand(1).getValueType() != MVT::i64)
4834 ReadFirstLaneID, NewModeReg);
4836 ReadFirstLaneID, NewTrapReg);
4838 unsigned ModeHwReg =
4841 unsigned TrapHwReg =
4849 IntrinID, ModeHwRegImm, NewModeReg);
4852 IntrinID, TrapHwRegImm, NewTrapReg);
4861 .Case("m0", AMDGPU::M0)
4862 .Case("exec", AMDGPU::EXEC)
4863 .Case("exec_lo", AMDGPU::EXEC_LO)
4864 .Case("exec_hi", AMDGPU::EXEC_HI)
4865 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4866 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4867 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4872 if (!Subtarget->hasFlatScrRegister() &&
4873 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4875 "\" for subtarget."));
4880 case AMDGPU::EXEC_LO:
4881 case AMDGPU::EXEC_HI:
4882 case AMDGPU::FLAT_SCR_LO:
4883 case AMDGPU::FLAT_SCR_HI:
4888 case AMDGPU::FLAT_SCR:
4907 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4916 static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4938 auto Next = std::next(I);
4949 MBB.addSuccessor(LoopBB);
4951 return std::pair(LoopBB, RemainderBB);
4958 auto I = MI.getIterator();
4959 auto E = std::next(I);
4981 Src->setIsKill(false);
4991 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4997 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5000 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5024 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5025 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5035 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5036 Register NewExec = MRI.createVirtualRegister(BoolRC);
5038 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5039 Register CondReg = MRI.createVirtualRegister(BoolRC);
5047 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5054 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5058 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5066 MRI.setSimpleHint(NewExec, CondReg);
5068 if (UseGPRIdxMode) {
5070 SGPRIdxReg = CurrentIdxReg;
5072 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5073 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5083 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5114 unsigned InitResultReg, unsigned PhiReg, int Offset,
5115 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5123 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5125 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5126 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5142 InitResultReg, DstReg, PhiReg, TmpExec,
5143 Offset, UseGPRIdxMode, SGPRIdxReg);
5149 LoopBB->removeSuccessor(RemainderBB);
5151 LoopBB->addSuccessor(LandingPad);
5162 static std::pair<unsigned, int>
5166 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5171 return std::pair(AMDGPU::sub0, Offset);
5211 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5228 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5229 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5238 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5241 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5245 if (UseGPRIdxMode) {
5252 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5265 MI.eraseFromParent();
5274 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5275 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5281 UseGPRIdxMode, SGPRIdxReg);
5285 if (UseGPRIdxMode) {
5287 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5289 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5294 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5299 MI.eraseFromParent();
5316 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5326 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5328 if (Idx->getReg() == AMDGPU::NoRegister) {
5339 MI.eraseFromParent();
5344 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5348 if (UseGPRIdxMode) {
5352 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5361 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5362 TRI.getRegSizeInBits(*VecRC), 32, false);
5368 MI.eraseFromParent();
5378 Register PhiReg = MRI.createVirtualRegister(VecRC);
5382 UseGPRIdxMode, SGPRIdxReg);
5385 if (UseGPRIdxMode) {
5387 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5389 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5395 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5396 TRI.getRegSizeInBits(*VecRC), 32, false);
5397 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5403 MI.eraseFromParent();
5419 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5420 if (ST.hasScalarAddSub64()) {
5421 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5431 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5432 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5435 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5437 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5440 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5442 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5444 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5445 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5454 MI.eraseFromParent();
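// Identity (neutral) values for the wave reductions below: max for min-reductions, min for
// max-reductions, zero for add/sub/or/xor, and all-ones for and.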
5460 case AMDGPU::S_MIN_U32:
5461 return std::numeric_limits<uint32_t>::max();
5462 case AMDGPU::S_MIN_I32:
5463 return std::numeric_limits<int32_t>::max();
5464 case AMDGPU::S_MAX_U32:
5465 return std::numeric_limits<uint32_t>::min();
5466 case AMDGPU::S_MAX_I32:
5467 return std::numeric_limits<int32_t>::min();
5468 case AMDGPU::S_ADD_I32:
5469 case AMDGPU::S_SUB_I32:
5470 case AMDGPU::S_OR_B32:
5471 case AMDGPU::S_XOR_B32:
5472 return std::numeric_limits<uint32_t>::min();
5473 case AMDGPU::S_AND_B32:
5474 return std::numeric_limits<uint32_t>::max();
5477 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5483 case AMDGPU::V_CMP_LT_U64_e64:
5484 return std::numeric_limits<uint64_t>::max();
5485 case AMDGPU::V_CMP_LT_I64_e64:
5486 return std::numeric_limits<int64_t>::max();
5487 case AMDGPU::V_CMP_GT_U64_e64:
5488 return std::numeric_limits<uint64_t>::min();
5489 case AMDGPU::V_CMP_GT_I64_e64:
5490 return std::numeric_limits<int64_t>::min();
5491 case AMDGPU::S_ADD_U64_PSEUDO:
5492 case AMDGPU::S_SUB_U64_PSEUDO:
5493 case AMDGPU::S_OR_B64:
5494 case AMDGPU::S_XOR_B64:
5495 return std::numeric_limits<uint64_t>::min();
5496 case AMDGPU::S_AND_B64:
5497 return std::numeric_limits<uint64_t>::max();
5500 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5505 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5506 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5507 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5508 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5509 Opc == AMDGPU::S_XOR_B32;
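// lowerWaveReduce: a uniform (SGPR) source reduces directly, with add/sub/xor derived from a
// popcount of EXEC; a divergent source falls back to a loop that peels one active lane at a
// time via S_FF1/V_READLANE and clears it with S_BITSET0.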
5523 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5528 case AMDGPU::S_MIN_U32:
5529 case AMDGPU::S_MIN_I32:
5530 case AMDGPU::S_MAX_U32:
5531 case AMDGPU::S_MAX_I32:
5532 case AMDGPU::S_AND_B32:
5533 case AMDGPU::S_OR_B32: {
5539 case AMDGPU::V_CMP_LT_U64_e64:
5540 case AMDGPU::V_CMP_LT_I64_e64:
5541 case AMDGPU::V_CMP_GT_U64_e64:
5542 case AMDGPU::V_CMP_GT_I64_e64:
5543 case AMDGPU::S_AND_B64:
5544 case AMDGPU::S_OR_B64: {
5550 case AMDGPU::S_XOR_B32:
5551 case AMDGPU::S_XOR_B64:
5552 case AMDGPU::S_ADD_I32:
5553 case AMDGPU::S_ADD_U64_PSEUDO:
5554 case AMDGPU::S_SUB_I32:
5555 case AMDGPU::S_SUB_U64_PSEUDO: {
5558 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5560 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5562 bool IsWave32 = ST.isWave32();
5563 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5564 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5565 unsigned BitCountOpc =
5566 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5570 auto NewAccumulator =
5575 case AMDGPU::S_XOR_B32:
5576 case AMDGPU::S_XOR_B64: {
5582 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5585 .addReg(NewAccumulator->getOperand(0).getReg())
5588 if (Opc == AMDGPU::S_XOR_B32) {
5594 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5596 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5600 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5603 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5605 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5615 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5623 case AMDGPU::S_SUB_I32: {
5624 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5632 .addReg(NewAccumulator->getOperand(0).getReg());
5635 case AMDGPU::S_ADD_I32: {
5638 .addReg(NewAccumulator->getOperand(0).getReg());
5641 case AMDGPU::S_ADD_U64_PSEUDO:
5642 case AMDGPU::S_SUB_U64_PSEUDO: {
5643 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5644 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5646 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5648 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5649 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5650 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5652 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5654 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5658 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5661 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5663 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5665 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5668 .addReg(NewAccumulator->getOperand(0).getReg())
5678 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5680 : NewAccumulator->getOperand(0).getReg();
5691 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5697 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5703 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5735 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5736 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5737 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5738 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5739 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5740 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5741 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5743 bool IsWave32 = ST.isWave32();
5744 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5745 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5752 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5756 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5765 I = ComputeLoop->begin();
5767 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5771 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5775 I = ComputeLoop->end();
5778 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5782 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5791 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5793 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5794 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5797 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5799 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5801 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5803 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5807 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5811 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5812 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5818 case AMDGPU::S_OR_B64:
5819 case AMDGPU::S_AND_B64:
5820 case AMDGPU::S_XOR_B64: {
5823 .addReg(LaneValue->getOperand(0).getReg())
5827 case AMDGPU::V_CMP_GT_I64_e64:
5828 case AMDGPU::V_CMP_GT_U64_e64:
5829 case AMDGPU::V_CMP_LT_I64_e64:
5830 case AMDGPU::V_CMP_LT_U64_e64: {
5831 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5833 MRI.createVirtualRegister(WaveMaskRegClass);
5836 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5837 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5840 VregClass, AMDGPU::sub0, VSubRegClass);
5843 VregClass, AMDGPU::sub1, VSubRegClass);
5844 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5851 .addReg(LaneValue->getOperand(0).getReg())
5852 .addReg(AccumulatorVReg);
5854 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5855 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5859 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5860 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5861 .addReg(LaneValue->getOperand(0).getReg())
5865 case AMDGPU::S_ADD_U64_PSEUDO:
5866 case AMDGPU::S_SUB_U64_PSEUDO: {
5869 .addReg(LaneValue->getOperand(0).getReg());
5876 unsigned BITSETOpc =
5877 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5878 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5884 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5887 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5889 .addReg(NewActiveBitsReg)
5891 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5896 MI.eraseFromParent();
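// EmitInstrWithCustomInserter: expand the remaining pseudo instructions after instruction
// selection (wave reductions, 64-bit scalar and vector add/sub, carry pseudos, M0 init,
// indirect src/dst, and friends).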
5908 switch (MI.getOpcode()) {
5909 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5911 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5913 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5915 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5917 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5919 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5921 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5923 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
5925 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5927 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5929 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5931 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5933 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5935 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
5937 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5939 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
5941 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5943 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
5945 case AMDGPU::S_UADDO_PSEUDO:
5946 case AMDGPU::S_USUBO_PSEUDO: {
5953 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5955 : AMDGPU::S_SUB_I32;
5966 MI.eraseFromParent();
5969 case AMDGPU::S_ADD_U64_PSEUDO:
5970 case AMDGPU::S_SUB_U64_PSEUDO: {
5973 case AMDGPU::V_ADD_U64_PSEUDO:
5974 case AMDGPU::V_SUB_U64_PSEUDO: {
5980 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5986 if (ST.hasAddSubU64Insts()) {
5988 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
5989 : AMDGPU::V_SUB_U64_e64),
5994 TII->legalizeOperands(*I);
5995 MI.eraseFromParent();
5999 if (IsAdd && ST.hasLshlAddU64Inst()) {
6005 TII->legalizeOperands(*Add);
6006 MI.eraseFromParent();
6010 const auto *CarryRC = TRI->getWaveMaskRegClass();
6012 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6013 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6015 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6016 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6020 : &AMDGPU::VReg_64RegClass;
6023 : &AMDGPU::VReg_64RegClass;
6026 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6028 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6031 MI,
MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6033 MI,
MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6036 MI,
MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6038 MI,
MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6041 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6048 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6062 TII->legalizeOperands(*LoHalf);
6063 TII->legalizeOperands(*HiHalf);
6064 MI.eraseFromParent();
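    // Roughly: when the subtarget has no native 64-bit VALU add/sub and the
    // V_LSHL_ADD_U64 path above does not apply, the pseudo is split into two
    // 32-bit halves. The low half uses V_ADD_CO_U32_e64 / V_SUB_CO_U32_e64 to
    // produce a carry (or borrow) in a wave-mask register, the high half
    // consumes it via V_ADDC_U32_e64 / V_SUBB_U32_e64, and the two halves are
    // then recombined into the 64-bit destination.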
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;
    Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
    Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
    Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
    assert(WaveSize == 64 || WaveSize == 32);
    if (WaveSize == 64) {
      if (ST.hasScalarCompareEq64()) {
            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    MI.eraseFromParent();
  case AMDGPU::SI_INIT_M0: {
            TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
    MI.eraseFromParent();
  case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
            TII->get(AMDGPU::S_CMP_EQ_U32))
  case AMDGPU::GET_GROUPSTATICSIZE: {
        .add(MI.getOperand(0))
    MI.eraseFromParent();
  case AMDGPU::GET_SHADERCYCLESHILO: {
    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .add(MI.getOperand(0))
    MI.eraseFromParent();
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    Register SrcCond = MI.getOperand(3).getReg();
    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getWaveMaskRegClass();
    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
        : &AMDGPU::VReg_64RegClass;
        : &AMDGPU::VReg_64RegClass;
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
    MI.eraseFromParent();
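    // Roughly: V_CNDMASK_B64_PSEUDO is expanded into two 32-bit selects. The
    // condition is copied into a wave-mask register, each 64-bit source is
    // split into sub0/sub1 halves, the low and high halves are selected
    // independently into DstLo/DstHi, and the pieces are recombined into the
    // 64-bit result register.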
  case AMDGPU::SI_BR_UNDEF: {
        .add(MI.getOperand(0));
    MI.eraseFromParent();
  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {
  case AMDGPU::SI_CALL_ISEL: {
    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
    MI.eraseFromParent();
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {
    unsigned Opc = MI.getOpcode();
    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      NeedClampOperand = true;
    if (TII->isVOP3(*I)) {
    I.add(MI.getOperand(1)).add(MI.getOperand(2));
    if (NeedClampOperand)
    TII->legalizeOperands(*I);
    MI.eraseFromParent();
  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBBREV_U32_e32:
    TII->legalizeOperands(MI);
  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_BARRIER:
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
  case AMDGPU::S_SETREG_B32: {
    const unsigned SetMask = WidthMask << Offset;
    unsigned SetDenormOp = 0;
    unsigned SetRoundOp = 0;
      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;
      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;
    if (SetRoundOp || SetDenormOp) {
      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
        unsigned ImmVal = Def->getOperand(1).getImm();
          MI.eraseFromParent();
    MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64:
    MI.setDesc(TII->get(AMDGPU::COPY));
  case AMDGPU::ENDPGM_TRAP: {
    MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
    MI.eraseFromParent();
  case AMDGPU::SIMULATED_TRAP: {
    assert(Subtarget->hasPrivEnabledTrap2NopBug());
    TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
    MI.eraseFromParent();
  case AMDGPU::SI_TCRETURN_GFX_WholeWave:
  case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
    assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
    Register OriginalExec = Setup->getOperand(0).getReg();
    MI.getOperand(0).setReg(OriginalExec);
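    // Roughly: returns from whole-wave functions appear to locate the
    // SI_SETUP_WHOLE_WAVE_FUNC instruction that saved the entry EXEC mask and
    // feed that saved value (OriginalExec) into the return pseudo, so the
    // wave's original execution mask is restored when the function exits.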
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
  return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
  if (!Subtarget->hasMadMacF32Insts())
    return Subtarget->hasFastFMAF32();
  return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
  return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
  switch (Ty.getScalarSizeInBits()) {
  if (Ty.getScalarSizeInBits() == 16)
  if (Ty.getScalarSizeInBits() == 32)
    return Subtarget->hasMadMacF32Insts() &&
  EVT VT = N->getValueType(0);
  return Subtarget->hasMadMacF32Insts() &&
  if (VT == MVT::f16) {
    return Subtarget->hasMadF16() &&
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
  [[maybe_unused]] EVT VT = Op.getValueType();
  assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
          VT == MVT::v16i32) &&
         "Unexpected ValueType.");
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
         VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
         VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
         VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
         VT == MVT::v32bf16);
      DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
      DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
         VT == MVT::v32bf16);
      : std::pair(Op0, Op0);
      DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
      DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
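// Roughly: the split*VectorOp helpers above lower an operation on a wide
// vector type by splitting each operand into a low and a high half, emitting
// the same opcode (carrying the original node flags) on each half, and
// concatenating the two partial results back into the original type.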
  switch (Op.getOpcode()) {
    return LowerBRCOND(Op, DAG);
    return LowerRETURNADDR(Op, DAG);
    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    EVT VT = Op.getValueType();
    return lowerFSQRTF32(Op, DAG);
    return lowerFSQRTF64(Op, DAG);
    return LowerTrig(Op, DAG);
    return LowerSELECT(Op, DAG);
    return LowerFDIV(Op, DAG);
    return LowerFFREXP(Op, DAG);
  case ISD::ATOMIC_CMP_SWAP:
    return LowerATOMIC_CMP_SWAP(Op, DAG);
    return LowerSTORE(Op, DAG);
    return LowerGlobalAddress(MFI, Op, DAG);
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
    return LowerINTRINSIC_VOID(Op, DAG);
  case ISD::ADDRSPACECAST:
    return lowerADDRSPACECAST(Op, DAG);
    return lowerINSERT_SUBVECTOR(Op, DAG);
    return lowerINSERT_VECTOR_ELT(Op, DAG);
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    return lowerVECTOR_SHUFFLE(Op, DAG);
    return lowerSCALAR_TO_VECTOR(Op, DAG);
    return lowerBUILD_VECTOR(Op, DAG);
    return lowerFP_ROUND(Op, DAG);
    return lowerTRAP(Op, DAG);
  case ISD::DEBUGTRAP:
    return lowerDEBUGTRAP(Op, DAG);
    return lowerFMINNUM_FMAXNUM(Op, DAG);
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
    return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
    return lowerFMINIMUM_FMAXIMUM(Op, DAG);
    return lowerFLDEXP(Op, DAG);
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
    return lowerFCOPYSIGN(Op, DAG);
    return lowerMUL(Op, DAG);
    return lowerXMULO(Op, DAG);
    return lowerXMUL_LOHI(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
  case ISD::STACKSAVE:
  case ISD::SET_ROUNDING:
  case ISD::FP_EXTEND:
  case ISD::GET_FPENV:
  case ISD::SET_FPENV:
  EVT FittingLoadVT = LoadVT;
  return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
  return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
                                              SelectionDAG &DAG,
                                              ArrayRef<SDValue> Ops,
                                              bool IsIntrinsic) const {
  bool Unpacked = Subtarget->hasUnpackedD16VMem();
  EVT LoadVT = M->getValueType(0);
  EVT EquivLoadVT = LoadVT;
  SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
                  M->getMemoryVT(), M->getMemOperand());
  EVT LoadVT = M->getValueType(0);
  assert(M->getNumValues() == 2 || M->getNumValues() == 3);
  bool IsTFE = M->getNumValues() == 3;
    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
  return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
                             M->getMemOperand(), DAG);
  SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
                             M->getMemOperand(), DAG);
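// Roughly: adjustLoadValueType rewrites a D16 (16-bit component) memory
// intrinsic so that it loads an equivalent legal integer type. On targets
// with unpacked D16 memory ops each component occupies a full dword, so the
// value is loaded as a wider integer vector and then truncated/bitcast back
// to the 16-bit vector type the caller asked for.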
6996 EVT VT =
N->getValueType(0);
6997 unsigned CondCode =
N->getConstantOperandVal(3);
7008 EVT CmpVT =
LHS.getValueType();
7009 if (CmpVT == MVT::i16 && !TLI.
isTypeLegal(MVT::i16)) {
7010 unsigned PromoteOp =
7030 EVT VT =
N->getValueType(0);
7032 unsigned CondCode =
N->getConstantOperandVal(3);
7041 if (CmpVT == MVT::f16 && !TLI.
isTypeLegal(CmpVT)) {
7042 Src0 = DAG.
getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7043 Src1 = DAG.
getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7059 EVT VT =
N->getValueType(0);
7066 Src.getOperand(1), Src.getOperand(2));
7077 Exec = AMDGPU::EXEC_LO;
7079 Exec = AMDGPU::EXEC;
7096 EVT VT =
N->getValueType(0);
7098 unsigned IID =
N->getConstantOperandVal(0);
7099 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7100 IID == Intrinsic::amdgcn_permlanex16;
7101 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7102 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7106 unsigned SplitSize = 32;
7107 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7108 ST->hasDPALU_DPP() &&
7116 case Intrinsic::amdgcn_permlane16:
7117 case Intrinsic::amdgcn_permlanex16:
7118 case Intrinsic::amdgcn_update_dpp:
7123 case Intrinsic::amdgcn_writelane:
7126 case Intrinsic::amdgcn_readlane:
7127 case Intrinsic::amdgcn_set_inactive:
7128 case Intrinsic::amdgcn_set_inactive_chain_arg:
7129 case Intrinsic::amdgcn_mov_dpp8:
7132 case Intrinsic::amdgcn_readfirstlane:
7133 case Intrinsic::amdgcn_permlane64:
7143 if (
SDNode *GL =
N->getGluedNode()) {
7144 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7145 GL = GL->getOperand(0).getNode();
7146 Operands.push_back(DAG.
getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7155 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7156 IID == Intrinsic::amdgcn_mov_dpp8 ||
7157 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7158 Src1 =
N->getOperand(2);
7159 if (IID == Intrinsic::amdgcn_writelane ||
7160 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7161 Src2 =
N->getOperand(3);
7164 if (ValSize == SplitSize) {
7174 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7179 if (IID == Intrinsic::amdgcn_writelane) {
7184 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7186 return IsFloat ? DAG.
getBitcast(VT, Trunc) : Trunc;
7189 if (ValSize % SplitSize != 0)
7193 EVT VT =
N->getValueType(0);
7197 unsigned NumOperands =
N->getNumOperands();
7199 SDNode *GL =
N->getGluedNode();
7204 for (
unsigned i = 0; i != NE; ++i) {
7205 for (
unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7207 SDValue Operand =
N->getOperand(j);
7222 DAG.
getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7237 if (SplitSize == 32) {
7239 return unrollLaneOp(LaneOp.
getNode());
7245 unsigned SubVecNumElt =
7249 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7250 for (
unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7254 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7259 if (IID == Intrinsic::amdgcn_writelane)
7264 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7265 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7266 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7267 EltIdx += SubVecNumElt;
7281 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7284 if (IID == Intrinsic::amdgcn_writelane)
7287 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
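  // Roughly: lane intrinsics (readlane, writelane, permlane*, update_dpp,
  // set_inactive, ...) on values wider than the 32-bit (or, with DPALU_DPP,
  // 64-bit) native width are lowered by bitcasting the value to a vector of
  // SplitSize-wide pieces, applying createLaneOp to each piece or subvector,
  // and reassembling the result; values exactly one piece wide take the
  // scalar path handled earlier.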
7295 switch (
N->getOpcode()) {
7307 unsigned IID =
N->getConstantOperandVal(0);
7309 case Intrinsic::amdgcn_make_buffer_rsrc:
7310 Results.push_back(lowerPointerAsRsrcIntrin(
N, DAG));
7312 case Intrinsic::amdgcn_cvt_pkrtz: {
7318 Results.push_back(DAG.
getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7321 case Intrinsic::amdgcn_cvt_pknorm_i16:
7322 case Intrinsic::amdgcn_cvt_pknorm_u16:
7323 case Intrinsic::amdgcn_cvt_pk_i16:
7324 case Intrinsic::amdgcn_cvt_pk_u16: {
7330 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7332 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7334 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7339 EVT VT =
N->getValueType(0);
7344 Results.push_back(DAG.
getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7348 case Intrinsic::amdgcn_s_buffer_load: {
7354 if (!Subtarget->hasScalarSubwordLoads())
7360 EVT VT =
Op.getValueType();
7361 assert(VT == MVT::i8 &&
"Expected 8-bit s_buffer_load intrinsics.\n");
7373 if (!
Offset->isDivergent()) {
7392 LoadVal = handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
7397 case Intrinsic::amdgcn_dead: {
    for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
    for (unsigned I = 0; I < Res.getNumOperands(); I++) {
      Results.push_back(Res.getOperand(I));
7414 Results.push_back(Res.getValue(1));
7423 EVT VT =
N->getValueType(0);
7428 EVT SelectVT = NewVT;
7429 if (NewVT.
bitsLT(MVT::i32)) {
7432 SelectVT = MVT::i32;
7438 if (NewVT != SelectVT)
7444 if (
N->getValueType(0) != MVT::v2f16)
7448 SDValue BC = DAG.
getNode(ISD::BITCAST, SL, MVT::i32,
N->getOperand(0));
7456 if (
N->getValueType(0) != MVT::v2f16)
7460 SDValue BC = DAG.
getNode(ISD::BITCAST, SL, MVT::i32,
N->getOperand(0));
7468 if (
N->getValueType(0) != MVT::f16)
7483 if (U.get() !=
Value)
7486 if (U.getUser()->getOpcode() == Opcode)
7492unsigned SITargetLowering::isCFIntrinsic(
const SDNode *Intr)
const {
7495 case Intrinsic::amdgcn_if:
7497 case Intrinsic::amdgcn_else:
7499 case Intrinsic::amdgcn_loop:
7501 case Intrinsic::amdgcn_end_cf:
7521 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7548 SDNode *Intr = BRCOND.getOperand(1).getNode();
7561 assert(BR &&
"brcond missing unconditional branch user");
7565 unsigned CFNode = isCFIntrinsic(Intr);
7585 Ops.push_back(Target);
7608 for (
unsigned i = 1, e = Intr->
getNumValues() - 1; i != e; ++i) {
7627 MVT VT =
Op.getSimpleValueType();
7630 if (
Op.getConstantOperandVal(0) != 0)
7634 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7636 if (
Info->isEntryFunction())
7653 return Op.getValueType().bitsLE(VT)
7661 EVT DstVT =
Op.getValueType();
7668 unsigned Opc =
Op.getOpcode();
7680 EVT SrcVT = Src.getValueType();
7681 EVT DstVT =
Op.getValueType();
7684 assert(Subtarget->hasCvtPkF16F32Inst() &&
"support v_cvt_pk_f16_f32");
7687 return SrcVT == MVT::v2f32 ?
Op : splitFP_ROUNDVectorOp(
Op, DAG);
7694 if (DstVT == MVT::f16) {
7699 if (!Subtarget->has16BitInsts()) {
7702 return DAG.
getNode(ISD::BITCAST,
DL, MVT::f16, Trunc);
7704 if (
Op->getFlags().hasApproximateFuncs()) {
7711 return DAG.
getNode(ISD::BITCAST,
DL, MVT::f16, Trunc);
7715 "custom lower FP_ROUND for f16 or bf16");
7716 assert(Subtarget->hasBF16ConversionInsts() &&
"f32 -> bf16 is legal");
7729 EVT VT =
Op.getValueType();
7731 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7732 bool IsIEEEMode =
Info->getMode().IEEE;
7741 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7748SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(
SDValue Op,
7750 EVT VT =
Op.getValueType();
7752 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7753 bool IsIEEEMode =
Info->getMode().IEEE;
7758 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7766 EVT VT =
Op.getValueType();
7770 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7771 !Subtarget->hasMinimum3Maximum3F16() &&
7772 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7773 "should not need to widen f16 minimum/maximum to v2f16");
7787 DAG.
getNode(
Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7795 EVT VT =
Op.getValueType();
7799 EVT ExpVT =
Exp.getValueType();
7800 if (ExpVT == MVT::i16)
7821 {
Op.getOperand(0),
Op.getOperand(1), TruncExp});
7824 return DAG.
getNode(ISD::FLDEXP,
DL, VT,
Op.getOperand(0), TruncExp);
7828 switch (
Op->getOpcode()) {
7858 DAGCombinerInfo &DCI)
const {
7859 const unsigned Opc =
Op.getOpcode();
7867 :
Op->getOperand(0).getValueType();
7870 if (DCI.isBeforeLegalizeOps() ||
7874 auto &DAG = DCI.DAG;
7880 LHS =
Op->getOperand(1);
7881 RHS =
Op->getOperand(2);
7883 LHS =
Op->getOperand(0);
7884 RHS =
Op->getOperand(1);
7923 if (MagVT == SignVT)
7930 SDValue SignAsInt32 = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7933 SDValue SignAsHalf16 = DAG.
getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
  EVT VT = Op.getValueType();
  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
  if (Op->isDivergent())
  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
  if (Op0SignBits >= 33 && Op1SignBits >= 33)
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
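  // Roughly: for a uniform i64 multiply, the code above uses value tracking
  // on the operands' known leading zeros and sign bits. If both operands have
  // at least 32 leading zero bits it emits S_MUL_U64_U32_PSEUDO; if both have
  // at least 33 sign bits it emits S_MUL_I64_I32_PSEUDO; otherwise the node
  // is left to the default i64 multiply handling.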
7999 EVT VT =
Op.getValueType();
8006 const APInt &
C = RHSC->getAPIntValue();
8008 if (
C.isPowerOf2()) {
8010 bool UseArithShift =
isSigned && !
C.isMinSignedValue();
8037 if (
Op->isDivergent()) {
8041 if (Subtarget->hasSMulHi()) {
8052 if (!Subtarget->isTrapHandlerEnabled() ||
8054 return lowerTrapEndpgm(
Op, DAG);
8056 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(
Op, DAG)
8057 : lowerTrapHsaQueuePtr(
Op, DAG);
8067SITargetLowering::loadImplicitKernelArgument(
SelectionDAG &DAG,
MVT VT,
8069 ImplicitParameter Param)
const {
8089 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
8092 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8095 if (UserSGPR == AMDGPU::NoRegister) {
8121 if (Subtarget->hasPrivEnabledTrap2NopBug())
8134 if (!Subtarget->isTrapHandlerEnabled() ||
8138 "debugtrap handler not supported",
8149SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
8151 if (Subtarget->hasApertureRegs()) {
8153 ? AMDGPU::SRC_SHARED_BASE
8154 : AMDGPU::SRC_PRIVATE_BASE;
8155 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8156 !Subtarget->hasGloballyAddressableScratch()) &&
8157 "Cannot use src_private_base with globally addressable scratch!");
8178 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
8182 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8184 if (UserSGPR == AMDGPU::NoRegister) {
  return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
  const AMDGPUTargetMachine &TM =
  unsigned DestAS, SrcAS;
  bool IsNonNull = false;
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(0);
    DestAS = ASC->getDestAddressSpace();
           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(1);
    SrcAS = Op->getConstantOperandVal(2);
    DestAS = Op->getConstantOperandVal(3);
        Subtarget->hasGloballyAddressableScratch()) {
              AMDGPU::S_MOV_B32, SL, MVT::i32,
              DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
      unsigned NullVal = TM.getNullPointerValue(DestAS);
        Subtarget->hasGloballyAddressableScratch()) {
      if (Subtarget->isWave64())
          57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
      CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
              AMDGPU::S_MOV_B64, SL, MVT::i64,
              DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
      CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
      SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
      CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
      unsigned NullVal = TM.getNullPointerValue(SrcAS);
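      // Roughly: casts from a flat pointer to LOCAL/PRIVATE keep only the low
      // 32 bits, while casts from LOCAL/PRIVATE to flat build the high half
      // from the segment aperture (src_shared_base / src_private_base, or the
      // queue-pointer implicit argument via getSegmentAperture) and, unless
      // the amdgcn.addrspacecast.nonnull form was used, compare against the
      // source's null value so that null maps to null. The globally
      // addressable scratch path instead offsets the pointer by the flat
      // scratch base register.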
8335 Op.getValueType() == MVT::i64) {
8336 const SIMachineFunctionInfo *
Info =
8340 return DAG.
getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8344 Src.getValueType() == MVT::i64)
8364 EVT InsVT =
Ins.getValueType();
8372 assert(InsNumElts % 2 == 0 &&
"expect legal vector types");
8377 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8379 MVT::i32, InsNumElts / 2);
8381 Vec = DAG.
getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8382 Ins = DAG.
getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8384 for (
unsigned I = 0;
I != InsNumElts / 2; ++
I) {
8386 if (InsNumElts == 2) {
8396 return DAG.
getNode(ISD::BITCAST, SL, VecVT, Vec);
8399 for (
unsigned I = 0;
I != InsNumElts; ++
I) {
8422 if (NumElts == 4 && EltSize == 16 && KIdx) {
8430 SDValue LoVec = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8431 SDValue HiVec = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8433 unsigned Idx = KIdx->getZExtValue();
8434 bool InsertLo = Idx < 2;
8437 DAG.
getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8438 DAG.
getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8440 InsHalf = DAG.
getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8444 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8457 assert(VecSize <= 64 &&
"Expected target vector size to be <= 64 bits");
8485 return DAG.
getNode(ISD::BITCAST, SL, VecVT, BFI);
8492 EVT ResultVT =
Op.getValueType();
8505 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
8508 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8512 if (VecSize == 128) {
    } else if (VecSize == 256) {
      for (unsigned P = 0; P < 4; ++P) {
                                     Parts[0], Parts[1]));
                                     Parts[2], Parts[3]));
      for (unsigned P = 0; P < 8; ++P) {
                                     Parts[0], Parts[1], Parts[2], Parts[3]));
                                     Parts[4], Parts[5], Parts[6], Parts[7]));
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8582 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8584 return DAG.
getNode(ISD::BITCAST, SL, ResultVT, Result);
8592 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8597 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8598 !(Mask[Elt + 1] & 1);
8604 EVT ResultVT =
Op.getValueType();
8607 const int NewSrcNumElts = 2;
8609 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
8625 const bool ShouldUseConsecutiveExtract = EltVT.
getSizeInBits() == 16;
8647 if (ShouldUseConsecutiveExtract &&
8650 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8651 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8663 if (Idx0 >= SrcNumElts) {
8668 if (Idx1 >= SrcNumElts) {
8673 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8674 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8682 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8683 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8688 if (SubVec0 != SubVec1) {
8689 NewMaskIdx1 += NewSrcNumElts;
8696 {NewMaskIdx0, NewMaskIdx1});
8701 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8702 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8703 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8704 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8723 EVT ResultVT =
Op.getValueType();
8739 EVT VT =
Op.getValueType();
8741 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8742 assert(!Subtarget->hasVOP3PInsts() &&
"this should be legal");
8751 return DAG.
getNode(ISD::BITCAST, SL, VT, ExtLo);
8760 return DAG.
getNode(ISD::BITCAST, SL, VT, ShlHi);
8767 return DAG.
getNode(ISD::BITCAST, SL, VT,
Or);
  for (unsigned P = 0; P < NumParts; ++P) {
        PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
  return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
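  // Roughly: BUILD_VECTOR of 16-bit elements is lowered by packing element
  // pairs into 32-bit words (zero-extend the low element, shift the high
  // element left by 16 and OR them, or build two-element parts on targets
  // with packed instructions) and then bitcasting the packed words to the
  // requested vector type.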
8797 if (!Subtarget->isAmdHsaOS())
8857 EVT PtrVT =
Op.getValueType();
8859 const GlobalValue *GV = GSD->
getGlobal();
8873 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
8891 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8892 if (Subtarget->has64BitLiterals()) {
8923 MachinePointerInfo PtrInfo =
8951 SDValue Param = lowerKernargMemParameter(
8962 "non-hsa intrinsic with hsa target",
DL.getDebugLoc()));
8970 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
8978 unsigned NumElts = Elts.
size();
8980 if (NumElts <= 12) {
8989 for (
unsigned i = 0; i < Elts.
size(); ++i) {
8995 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
9005 EVT SrcVT = Src.getValueType();
9026 bool Unpacked,
bool IsD16,
int DMaskPop,
9027 int NumVDataDwords,
bool IsAtomicPacked16Bit,
9031 EVT ReqRetVT = ResultTypes[0];
9033 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9034 ? (ReqRetNumElts + 1) / 2
9037 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9048 if (DMaskPop > 0 &&
Data.getValueType() != MaskPopVT) {
9059 if (DataDwordVT.
isVector() && !IsAtomicPacked16Bit)
9061 NumDataDwords - MaskPopDwords);
9066 EVT LegalReqRetVT = ReqRetVT;
9068 if (!
Data.getValueType().isInteger())
9070 Data.getValueType().changeTypeToInteger(),
Data);
9091 if (Result->getNumValues() == 1)
9098 SDValue *LWE,
bool &IsTexFail) {
9118 unsigned DimIdx,
unsigned EndIdx,
9119 unsigned NumGradients) {
9121 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
9129 if (((
I + 1) >= EndIdx) ||
9130 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
9131 I == DimIdx + NumGradients - 1))) {
9150 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9164 int NumVDataDwords = 0;
9165 bool AdjustRetType =
false;
9166 bool IsAtomicPacked16Bit =
false;
9169 const unsigned ArgOffset = WithChain ? 2 : 1;
9172 unsigned DMaskLanes = 0;
9174 if (BaseOpcode->Atomic) {
9175 VData =
Op.getOperand(2);
9177 IsAtomicPacked16Bit =
9178 (Intr->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9179 Intr->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
9182 if (BaseOpcode->AtomicX2) {
9189 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9190 DMask = Is64Bit ? 0xf : 0x3;
9191 NumVDataDwords = Is64Bit ? 4 : 2;
9193 DMask = Is64Bit ? 0x3 : 0x1;
9194 NumVDataDwords = Is64Bit ? 2 : 1;
9197 DMask =
Op->getConstantOperandVal(ArgOffset + Intr->
DMaskIndex);
9200 if (BaseOpcode->Store) {
9201 VData =
Op.getOperand(2);
9205 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9209 VData = handleD16VData(VData, DAG,
true);
9212 NumVDataDwords = (VData.
getValueType().getSizeInBits() + 31) / 32;
9213 }
else if (!BaseOpcode->NoReturn) {
9218 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9226 (!LoadVT.
isVector() && DMaskLanes > 1))
9232 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9233 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9234 NumVDataDwords = (DMaskLanes + 1) / 2;
9236 NumVDataDwords = DMaskLanes;
9238 AdjustRetType =
true;
9242 unsigned VAddrEnd = ArgOffset + Intr->
VAddrEnd;
9249 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9250 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9252 VAddrVT =
Op.getOperand(ArgOffset + Intr->
CoordStart).getSimpleValueType();
9254 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9255 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9259 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
9265 {
Op.getOperand(ArgOffset +
I), DAG.
getPOISON(MVT::f16)});
9269 "Bias needs to be converted to 16 bit in A16 mode");
9274 if (BaseOpcode->Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
9278 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
9279 "require 16 bit args for both gradients and addresses");
9284 if (!
ST->hasA16()) {
9285 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
9286 "support 16 bit addresses\n");
9296 if (BaseOpcode->Gradients && IsG16 &&
ST->hasG16()) {
9298 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9300 IntrOpcode = G16MappingInfo->
G16;
    for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
    const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
    const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
    const bool UseNSA = ST->hasNSAEncoding() &&
                        VAddrs.size() >= ST->getNSAThreshold(MF) &&
                        (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
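    // Roughly: the image address operands are emitted with the NSA
    // (non-sequential address) encoding when the subtarget supports it, the
    // operand count reaches the NSA threshold, and the count either fits the
    // NSA limit or partial NSA is available. With partial NSA, the trailing
    // addresses beyond NSAMaxSize - 1 appear to be packed into one contiguous
    // vector register; without NSA, all addresses are packed contiguously.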
9350 if (UsePartialNSA) {
9352 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9353 }
else if (!UseNSA) {
9360 if (!BaseOpcode->Sampler) {
9363 uint64_t UnormConst =
9364 Op.getConstantOperandVal(ArgOffset + Intr->
UnormIndex);
9366 Unorm = UnormConst ? True : False;
9372 bool IsTexFail =
false;
9373 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9384 NumVDataDwords += 1;
9385 AdjustRetType =
true;
9390 if (AdjustRetType) {
9393 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9402 MVT::i32, NumVDataDwords)
9405 ResultTypes[0] = NewVT;
9406 if (ResultTypes.size() == 3) {
9410 ResultTypes.erase(&ResultTypes[1]);
9415 if (BaseOpcode->Atomic)
9422 if (BaseOpcode->Store || BaseOpcode->Atomic)
9423 Ops.push_back(VData);
9424 if (UsePartialNSA) {
9426 Ops.push_back(VAddr);
9430 Ops.push_back(VAddr);
9433 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9435 Ops.push_back(Rsrc);
9436 if (BaseOpcode->Sampler) {
9440 Ops.push_back(Samp);
9445 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9446 Ops.push_back(Unorm);
9448 Ops.push_back(IsA16 &&
9449 ST->hasFeature(AMDGPU::FeatureR128A16)
9453 Ops.push_back(IsA16 ? True : False);
9455 if (!Subtarget->hasGFX90AInsts())
9460 "TFE is not supported on this GPU",
DL.getDebugLoc()));
9463 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9466 Ops.push_back(DimInfo->
DA ? True : False);
9467 if (BaseOpcode->HasD16)
9468 Ops.push_back(IsD16 ? True : False);
9470 Ops.push_back(
Op.getOperand(0));
9472 int NumVAddrDwords =
9478 NumVDataDwords, NumVAddrDwords);
9479 }
else if (IsGFX11Plus) {
9481 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9482 : AMDGPU::MIMGEncGfx11Default,
9483 NumVDataDwords, NumVAddrDwords);
9484 }
else if (IsGFX10Plus) {
9486 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9487 : AMDGPU::MIMGEncGfx10Default,
9488 NumVDataDwords, NumVAddrDwords);
9490 if (Subtarget->hasGFX90AInsts()) {
9492 NumVDataDwords, NumVAddrDwords);
9496 "requested image instruction is not supported on this GPU",
9501 for (EVT VT : OrigResultTypes) {
9502 if (VT == MVT::Other)
9503 RetValues[Idx++] =
Op.getOperand(0);
9514 NumVDataDwords, NumVAddrDwords);
9517 NumVDataDwords, NumVAddrDwords);
9524 MachineMemOperand *MemRef = MemOp->getMemOperand();
9528 if (BaseOpcode->AtomicX2) {
9533 if (BaseOpcode->NoReturn)
9536 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9537 NumVDataDwords, IsAtomicPacked16Bit,
DL);
9550 MachinePointerInfo(),
9555 if (!
Offset->isDivergent()) {
9562 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9571 !Subtarget->hasScalarDwordx3Loads()) {
9598 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9600 return handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
9604 unsigned NumLoads = 1;
9610 if (NumElts == 8 || NumElts == 16) {
9611 NumLoads = NumElts / 4;
9615 SDVTList VTList = DAG.
getVTList({LoadVT, MVT::Other});
9620 NumLoads > 1 ?
Align(16 * NumLoads) :
Align(4));
9622 uint64_t InstOffset =
Ops[5]->getAsZExtVal();
9623 for (
unsigned i = 0; i < NumLoads; ++i) {
9629 if (NumElts == 8 || NumElts == 16)
9637 if (!Subtarget->hasArchitectedSGPRs())
9649 unsigned Width)
const {
9651 using namespace AMDGPU::Hwreg;
9653 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9692 auto *MFI = MF.
getInfo<SIMachineFunctionInfo>();
9694 EVT VT =
Op.getValueType();
9696 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
9700 switch (IntrinsicID) {
9701 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9704 return getPreloadedValue(DAG, *MFI, VT,
9707 case Intrinsic::amdgcn_dispatch_ptr:
9708 case Intrinsic::amdgcn_queue_ptr: {
9709 if (!Subtarget->isAmdHsaOrMesa(MF.
getFunction())) {
9711 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
9716 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9719 return getPreloadedValue(DAG, *MFI, VT, RegID);
9721 case Intrinsic::amdgcn_implicitarg_ptr: {
9723 return getImplicitArgPtr(DAG,
DL);
9724 return getPreloadedValue(DAG, *MFI, VT,
9727 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9733 return getPreloadedValue(DAG, *MFI, VT,
9736 case Intrinsic::amdgcn_dispatch_id: {
9739 case Intrinsic::amdgcn_rcp:
9741 case Intrinsic::amdgcn_rsq:
9743 case Intrinsic::amdgcn_rsq_legacy:
9747 case Intrinsic::amdgcn_rcp_legacy:
9751 case Intrinsic::amdgcn_rsq_clamp: {
9762 return DAG.
getNode(ISD::FMAXNUM,
DL, VT, Tmp,
9765 case Intrinsic::r600_read_ngroups_x:
9766 if (Subtarget->isAmdHsaOS())
9769 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9772 case Intrinsic::r600_read_ngroups_y:
9773 if (Subtarget->isAmdHsaOS())
9776 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9779 case Intrinsic::r600_read_ngroups_z:
9780 if (Subtarget->isAmdHsaOS())
9783 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9786 case Intrinsic::r600_read_local_size_x:
9787 if (Subtarget->isAmdHsaOS())
9790 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9792 case Intrinsic::r600_read_local_size_y:
9793 if (Subtarget->isAmdHsaOS())
9796 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9798 case Intrinsic::r600_read_local_size_z:
9799 if (Subtarget->isAmdHsaOS())
9802 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9804 case Intrinsic::amdgcn_workgroup_id_x:
9805 return lowerWorkGroupId(DAG, *MFI, VT,
9809 case Intrinsic::amdgcn_workgroup_id_y:
9810 return lowerWorkGroupId(DAG, *MFI, VT,
9814 case Intrinsic::amdgcn_workgroup_id_z:
9815 return lowerWorkGroupId(DAG, *MFI, VT,
9819 case Intrinsic::amdgcn_cluster_id_x:
9820 return Subtarget->hasClusters()
9821 ? getPreloadedValue(DAG, *MFI, VT,
9823 : DAG.getPOISON(VT);
9824 case Intrinsic::amdgcn_cluster_id_y:
9825 return Subtarget->hasClusters()
9826 ? getPreloadedValue(DAG, *MFI, VT,
9829 case Intrinsic::amdgcn_cluster_id_z:
9830 return Subtarget->hasClusters()
9831 ? getPreloadedValue(DAG, *MFI, VT,
9834 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9835 return Subtarget->hasClusters()
9836 ? getPreloadedValue(
9840 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9841 return Subtarget->hasClusters()
9842 ? getPreloadedValue(
9846 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9847 return Subtarget->hasClusters()
9848 ? getPreloadedValue(
9852 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9853 return Subtarget->hasClusters()
9856 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9857 return Subtarget->hasClusters()
9858 ? getPreloadedValue(
9862 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9863 return Subtarget->hasClusters()
9864 ? getPreloadedValue(
9868 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9869 return Subtarget->hasClusters()
9870 ? getPreloadedValue(
9874 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
9875 return Subtarget->hasClusters()
9876 ? getPreloadedValue(
9880 case Intrinsic::amdgcn_wave_id:
9881 return lowerWaveID(DAG,
Op);
9882 case Intrinsic::amdgcn_lds_kernel_id: {
9884 return getLDSKernelId(DAG,
DL);
9885 return getPreloadedValue(DAG, *MFI, VT,
9888 case Intrinsic::amdgcn_workitem_id_x:
9889 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
9890 case Intrinsic::amdgcn_workitem_id_y:
9891 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
9892 case Intrinsic::amdgcn_workitem_id_z:
9893 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
9894 case Intrinsic::amdgcn_wavefrontsize:
9896 SDLoc(
Op), MVT::i32);
9897 case Intrinsic::amdgcn_s_buffer_load: {
9898 unsigned CPol =
Op.getConstantOperandVal(3);
9905 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
9906 Op.getOperand(3), DAG);
9908 case Intrinsic::amdgcn_fdiv_fast:
9909 return lowerFDIV_FAST(
Op, DAG);
9910 case Intrinsic::amdgcn_sin:
9913 case Intrinsic::amdgcn_cos:
9916 case Intrinsic::amdgcn_mul_u24:
9919 case Intrinsic::amdgcn_mul_i24:
9923 case Intrinsic::amdgcn_log_clamp: {
9929 case Intrinsic::amdgcn_fract:
9932 case Intrinsic::amdgcn_class:
9935 case Intrinsic::amdgcn_div_fmas:
9937 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
9939 case Intrinsic::amdgcn_div_fixup:
9941 Op.getOperand(2),
Op.getOperand(3));
9943 case Intrinsic::amdgcn_div_scale: {
9956 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
9959 Denominator, Numerator);
9961 case Intrinsic::amdgcn_icmp: {
9963 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
9964 Op.getConstantOperandVal(2) == 0 &&
9969 case Intrinsic::amdgcn_fcmp: {
9972 case Intrinsic::amdgcn_ballot:
9974 case Intrinsic::amdgcn_fmed3:
9976 Op.getOperand(2),
Op.getOperand(3));
9977 case Intrinsic::amdgcn_fdot2:
9979 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
9980 case Intrinsic::amdgcn_fmul_legacy:
9983 case Intrinsic::amdgcn_sffbh:
9985 case Intrinsic::amdgcn_sbfe:
9987 Op.getOperand(2),
Op.getOperand(3));
9988 case Intrinsic::amdgcn_ubfe:
9990 Op.getOperand(2),
Op.getOperand(3));
9991 case Intrinsic::amdgcn_cvt_pkrtz:
9992 case Intrinsic::amdgcn_cvt_pknorm_i16:
9993 case Intrinsic::amdgcn_cvt_pknorm_u16:
9994 case Intrinsic::amdgcn_cvt_pk_i16:
9995 case Intrinsic::amdgcn_cvt_pk_u16: {
9997 EVT VT =
Op.getValueType();
10000 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10002 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10004 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10006 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10012 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
10015 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
10016 return DAG.
getNode(ISD::BITCAST,
DL, VT, Node);
10018 case Intrinsic::amdgcn_fmad_ftz:
10020 Op.getOperand(2),
Op.getOperand(3));
10022 case Intrinsic::amdgcn_if_break:
10024 Op->getOperand(1),
Op->getOperand(2)),
10027 case Intrinsic::amdgcn_groupstaticsize: {
10033 const GlobalValue *GV =
10039 case Intrinsic::amdgcn_is_shared:
10040 case Intrinsic::amdgcn_is_private: {
10043 DAG.
getNode(ISD::BITCAST,
DL, MVT::v2i32,
Op.getOperand(1));
10047 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10051 Subtarget->hasGloballyAddressableScratch()) {
10054 AMDGPU::S_MOV_B32,
DL, MVT::i32,
10055 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10064 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10067 case Intrinsic::amdgcn_perm:
10069 Op.getOperand(2),
Op.getOperand(3));
10070 case Intrinsic::amdgcn_reloc_constant: {
10080 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10081 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10082 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10083 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10084 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10085 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10086 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10087 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10088 if (
Op.getOperand(4).getValueType() == MVT::i32)
10094 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10095 Op.getOperand(3), IndexKeyi32);
10097 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10098 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10099 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10100 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10101 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10102 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10103 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10104 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10105 if (
Op.getOperand(4).getValueType() == MVT::i64)
10111 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10112 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10113 Op.getOperand(6)});
10115 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10116 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10117 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10118 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10119 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10120 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10121 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10124 if (
Op.getOperand(6).getValueType() == IndexKeyTy)
10130 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10131 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10132 IndexKey, Op.getOperand(7),
10133 Op.getOperand(8)});
10135 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10136 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10137 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10138 if (
Op.getOperand(6).getValueType() == MVT::i32)
10144 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10145 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10146 IndexKeyi32, Op.getOperand(7)});
10148 case Intrinsic::amdgcn_addrspacecast_nonnull:
10149 return lowerADDRSPACECAST(
Op, DAG);
10150 case Intrinsic::amdgcn_readlane:
10151 case Intrinsic::amdgcn_readfirstlane:
10152 case Intrinsic::amdgcn_writelane:
10153 case Intrinsic::amdgcn_permlane16:
10154 case Intrinsic::amdgcn_permlanex16:
10155 case Intrinsic::amdgcn_permlane64:
10156 case Intrinsic::amdgcn_set_inactive:
10157 case Intrinsic::amdgcn_set_inactive_chain_arg:
10158 case Intrinsic::amdgcn_mov_dpp8:
10159 case Intrinsic::amdgcn_update_dpp:
10161 case Intrinsic::amdgcn_dead: {
10163 for (
const EVT ValTy :
Op.getNode()->values())
10168 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10170 return lowerImage(
Op, ImageDimIntr, DAG,
false);
10181 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10187 unsigned NewOpcode)
const {
10191 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10192 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10210 M->getMemOperand());
10215 unsigned NewOpcode)
const {
10219 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10220 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10238 M->getMemOperand());
10243 unsigned IntrID =
Op.getConstantOperandVal(1);
10247 case Intrinsic::amdgcn_ds_ordered_add:
10248 case Intrinsic::amdgcn_ds_ordered_swap: {
10253 unsigned IndexOperand =
M->getConstantOperandVal(7);
10254 unsigned WaveRelease =
M->getConstantOperandVal(8);
10255 unsigned WaveDone =
M->getConstantOperandVal(9);
10257 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10258 IndexOperand &= ~0x3f;
10259 unsigned CountDw = 0;
10262 CountDw = (IndexOperand >> 24) & 0xf;
10263 IndexOperand &= ~(0xf << 24);
10265 if (CountDw < 1 || CountDw > 4) {
10268 Fn,
"ds_ordered_count: dword count must be between 1 and 4",
10269 DL.getDebugLoc()));
10274 if (IndexOperand) {
10277 Fn,
"ds_ordered_count: bad index operand",
DL.getDebugLoc()));
10280 if (WaveDone && !WaveRelease) {
10284 Fn,
"ds_ordered_count: wave_done requires wave_release",
10285 DL.getDebugLoc()));
10288 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10289 unsigned ShaderType =
10291 unsigned Offset0 = OrderedCountIndex << 2;
10292 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10295 Offset1 |= (CountDw - 1) << 6;
10298 Offset1 |= ShaderType << 2;
10300 unsigned Offset = Offset0 | (Offset1 << 8);
10307 M->getVTList(),
Ops,
M->getMemoryVT(),
10308 M->getMemOperand());
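    // Roughly: ds_ordered_add / ds_ordered_swap are encoded by packing the
    // ordered-count index, the wave_release / wave_done bits, the instruction
    // selector, the dword count, and the shader type into the 16-bit offset
    // field (Offset0 in the low byte, Offset1 in the high byte) of the
    // DS_ORDERED_COUNT node built above, after checking that the dword count
    // is between 1 and 4 and that wave_done implies wave_release.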
10310 case Intrinsic::amdgcn_raw_buffer_load:
10311 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10312 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10313 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10314 case Intrinsic::amdgcn_raw_buffer_load_format:
10315 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10316 const bool IsFormat =
10317 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10318 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10320 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10321 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10335 return lowerIntrinsicLoad(M, IsFormat, DAG,
Ops);
10337 case Intrinsic::amdgcn_struct_buffer_load:
10338 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10339 case Intrinsic::amdgcn_struct_buffer_load_format:
10340 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10341 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10342 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10343 const bool IsFormat =
10344 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10345 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10347 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10348 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10363 case Intrinsic::amdgcn_raw_tbuffer_load:
10364 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10366 EVT LoadVT =
Op.getValueType();
10367 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10368 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10387 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10390 case Intrinsic::amdgcn_struct_tbuffer_load:
10391 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10393 EVT LoadVT =
Op.getValueType();
10394 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10395 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10414 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10417 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10418 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10420 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10421 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10422 return lowerStructBufferAtomicIntrin(
Op, DAG,
10424 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10425 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10427 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10428 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10429 return lowerStructBufferAtomicIntrin(
Op, DAG,
10431 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10432 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10434 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10435 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10436 return lowerStructBufferAtomicIntrin(
Op, DAG,
10438 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10439 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10441 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10442 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10444 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10445 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10447 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10448 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10450 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10451 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10453 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10454 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10456 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10457 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10459 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10460 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10462 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10463 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10465 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10466 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10468 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10469 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10471 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10472 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10474 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10475 return lowerRawBufferAtomicIntrin(
Op, DAG,
10477 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10478 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10479 return lowerStructBufferAtomicIntrin(
Op, DAG,
10481 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10482 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10484 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10485 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10487 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10488 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10489 return lowerStructBufferAtomicIntrin(
Op, DAG,
10491 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10492 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10493 return lowerStructBufferAtomicIntrin(
Op, DAG,
10495 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10496 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10497 return lowerStructBufferAtomicIntrin(
Op, DAG,
10499 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10500 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10501 return lowerStructBufferAtomicIntrin(
Op, DAG,
10503 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10504 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10506 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10507 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10509 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10510 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10512 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10513 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10515 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10516 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10518 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10519 return lowerStructBufferAtomicIntrin(
Op, DAG,
10522 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10523 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10524 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
10525 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10539 EVT VT =
Op.getValueType();
10543 Op->getVTList(),
Ops, VT,
10544 M->getMemOperand());
10546 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10547 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10548 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
10549 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(6), DAG);
10563 EVT VT =
Op.getValueType();
10567 Op->getVTList(),
Ops, VT,
10568 M->getMemOperand());
10570 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10571 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10573 SDValue NodePtr =
M->getOperand(2);
10574 SDValue RayExtent =
M->getOperand(3);
10575 SDValue InstanceMask =
M->getOperand(4);
10576 SDValue RayOrigin =
M->getOperand(5);
10577 SDValue RayDir =
M->getOperand(6);
10579 SDValue TDescr =
M->getOperand(8);
10584 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10589 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10590 const unsigned NumVDataDwords = 10;
10591 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10593 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10594 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10595 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10599 Ops.push_back(NodePtr);
10602 {DAG.getBitcast(MVT::i32, RayExtent),
10603 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10604 Ops.push_back(RayOrigin);
10605 Ops.push_back(RayDir);
10606 Ops.push_back(Offsets);
10607 Ops.push_back(TDescr);
10608 Ops.push_back(
M->getChain());
10611 MachineMemOperand *MemRef =
M->getMemOperand();
10615 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10617 SDValue NodePtr =
M->getOperand(2);
10618 SDValue RayExtent =
M->getOperand(3);
10619 SDValue RayOrigin =
M->getOperand(4);
10620 SDValue RayDir =
M->getOperand(5);
10621 SDValue RayInvDir =
M->getOperand(6);
10622 SDValue TDescr =
M->getOperand(7);
10629 if (!Subtarget->hasGFX10_AEncoding()) {
10639 const unsigned NumVDataDwords = 4;
10640 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10641 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10642 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10645 const unsigned BaseOpcodes[2][2] = {
10646 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10647 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10648 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10652 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10653 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10654 : AMDGPU::MIMGEncGfx10NSA,
10655 NumVDataDwords, NumVAddrDwords);
10659 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10660 : AMDGPU::MIMGEncGfx10Default,
10661 NumVDataDwords, NumVAddrDwords);
    auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
      if (Lanes[0].getValueSizeInBits() == 32) {
        for (unsigned I = 0; I < 3; ++I)
        Ops.push_back(Lanes[2]);
    if (UseNSA && IsGFX11Plus) {
      Ops.push_back(NodePtr);
      Ops.push_back(RayOrigin);
      for (unsigned I = 0; I < 3; ++I) {
10701 {DirLanes[I], InvDirLanes[I]})));
10705 Ops.push_back(RayDir);
10706 Ops.push_back(RayInvDir);
10713 Ops.push_back(NodePtr);
10716 packLanes(RayOrigin,
true);
10717 packLanes(RayDir,
true);
10718 packLanes(RayInvDir,
false);
10723 if (NumVAddrDwords > 12) {
10725 Ops.append(16 -
Ops.size(), Undef);
10731 Ops.push_back(MergedOps);
10734 Ops.push_back(TDescr);
10736 Ops.push_back(
M->getChain());
10739 MachineMemOperand *MemRef =
M->getMemOperand();
10743 case Intrinsic::amdgcn_global_atomic_fmin_num:
10744 case Intrinsic::amdgcn_global_atomic_fmax_num:
10745 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10746 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10753 unsigned Opcode = 0;
10755 case Intrinsic::amdgcn_global_atomic_fmin_num:
10756 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10757 Opcode = ISD::ATOMIC_LOAD_FMIN;
10760 case Intrinsic::amdgcn_global_atomic_fmax_num:
10761 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10762 Opcode = ISD::ATOMIC_LOAD_FMAX;
10768 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10769 Ops, M->getMemOperand());
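// The global/flat fmin_num/fmax_num intrinsics are simply re-expressed as the
// generic ISD::ATOMIC_LOAD_FMIN / ISD::ATOMIC_LOAD_FMAX nodes, reusing the
// intrinsic's memory VT, value types and MachineMemOperand, so the ordinary
// atomic selection patterns can pick them up.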
10771 case Intrinsic::amdgcn_s_get_barrier_state:
10772 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10779 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10780 BarID = (BarID >> 4) & 0x3F;
10781 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10784 Ops.push_back(Chain);
10786 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10787 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10795 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10803 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10804 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10805 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10809 EVT VT = Op->getValueType(0);
10815 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10817 return lowerImage(Op, ImageDimIntr, DAG, true);
10825SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10832 EVT VT = VTList.VTs[0];
10835 bool IsTFE = VTList.NumVTs == 3;
10838 unsigned NumOpDWords = NumValueDWords + 1;
10840 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10841 MachineMemOperand *OpDWordsMMO =
10843 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10844 OpDWordsVT, OpDWordsMMO, DAG);
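// Sketch of what the recursion above appears to do, as far as it can be read
// from the surviving fragments: when the requested result does not fill a
// whole number of dwords, the load is first emitted with a dword-rounded type
// (plus one extra dword when TFE requests a status result), and the wide value
// is then trimmed back down by the caller, e.g. an i16 result becomes an i32
// load followed by a truncate.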
10849 NumValueDWords == 1
10858 if (!Subtarget->hasDwordx3LoadStores() &&
10859 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10863 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10865 WidenedMemVT, WidenedMMO);
10875 bool ImageStore) const {
10885 if (Subtarget->hasUnpackedD16VMem()) {
10899 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10910 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
10916 if ((NumElements % 2) == 1) {
10918 unsigned I = Elts.size() / 2;
10934 if (NumElements == 3) {
10944 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
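// Roughly: handleD16VData re-packs 16-bit store data into the layout the
// hardware expects. On targets with unpacked D16 VMEM each component is
// widened to 32 bits; otherwise pairs of elements are packed into dwords, an
// odd element count is padded, and 3-element cases are widened before the
// final bitcast to the widened store type above.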
10955 unsigned IntrinsicID = Op.getConstantOperandVal(1);
10958 switch (IntrinsicID) {
10959 case Intrinsic::amdgcn_exp_compr: {
10960 if (!Subtarget->hasCompressedExport()) {
10963 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
10975 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32, Src0),
10976 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32, Src1),
10985 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10989 case Intrinsic::amdgcn_struct_tbuffer_store:
10990 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10992 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10994 VData = handleD16VData(VData, DAG);
10995 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10996 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11014 M->getMemoryVT(), M->getMemOperand());
11017 case Intrinsic::amdgcn_raw_tbuffer_store:
11018 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11020 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11022 VData = handleD16VData(VData, DAG);
11023 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11024 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11042 M->getMemoryVT(), M->getMemOperand());
11045 case Intrinsic::amdgcn_raw_buffer_store:
11046 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11047 case Intrinsic::amdgcn_raw_buffer_store_format:
11048 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11049 const bool IsFormat =
11050 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11051 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11058 VData = handleD16VData(VData, DAG);
11068 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11069 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11089 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11092 M->getMemoryVT(), M->getMemOperand());
11095 case Intrinsic::amdgcn_struct_buffer_store:
11096 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11097 case Intrinsic::amdgcn_struct_buffer_store_format:
11098 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11099 const bool IsFormat =
11100 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11101 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11109 VData = handleD16VData(VData, DAG);
11119 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11120 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11139 EVT VDataType = VData.getValueType().getScalarType();
11141 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11144 M->getMemoryVT(), M->getMemOperand());
11146 case Intrinsic::amdgcn_raw_buffer_load_lds:
11147 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11148 case Intrinsic::amdgcn_struct_buffer_load_lds:
11149 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11150 if (!Subtarget->hasVMemToLDSLoad())
11154 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11155 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11156 unsigned OpOffset = HasVIndex ? 1 : 0;
11157 SDValue VOffset = Op.getOperand(5 + OpOffset);
11159 unsigned Size = Op->getConstantOperandVal(4);
11165 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11166 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11167 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11168 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11171 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11172 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11173 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11174 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11177 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11178 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11179 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11180 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11183 if (!Subtarget->hasLDSLoadB96_B128())
11185 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11186 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11187 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11188 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11191 if (!Subtarget->hasLDSLoadB96_B128())
11193 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11194 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11195 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11196 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
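// The opcode choice above depends only on the load width and on which offsets
// are present: *_LDS_BOTHEN when both a vindex and a voffset are supplied,
// *_LDS_IDXEN for vindex only, *_LDS_OFFEN for voffset only, and *_LDS_OFFSET
// when the address is purely the SGPR/immediate offset.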
11204 if (HasVIndex && HasVOffset)
11208 else if (HasVIndex)
11209 Ops.push_back(Op.getOperand(5));
11210 else if (HasVOffset)
11211 Ops.push_back(VOffset);
11213 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11214 Ops.push_back(Rsrc);
11215 Ops.push_back(Op.getOperand(6 + OpOffset));
11216 Ops.push_back(Op.getOperand(7 + OpOffset));
11218 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11231 MachineMemOperand *LoadMMO = M->getMemOperand();
11236 MachinePointerInfo StorePtrI = LoadPtrI;
11260 case Intrinsic::amdgcn_load_to_lds:
11261 case Intrinsic::amdgcn_global_load_lds: {
11262 if (!Subtarget->hasVMemToLDSLoad())
11266 unsigned Size = Op->getConstantOperandVal(4);
11271 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11274 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11277 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11280 if (!Subtarget->hasLDSLoadB96_B128())
11282 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11285 if (!Subtarget->hasLDSLoadB96_B128())
11287 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11303 if (LHS->isDivergent())
11307 RHS.getOperand(0).getValueType() == MVT::i32) {
11310 VOffset = RHS.getOperand(0);
11314 Ops.push_back(Addr);
11322 Ops.push_back(VOffset);
11325 Ops.push_back(Op.getOperand(5));
11326 Ops.push_back(Op.getOperand(6));
11331 MachineMemOperand *LoadMMO = M->getMemOperand();
11333 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11334 MachinePointerInfo StorePtrI = LoadPtrI;
11353 case Intrinsic::amdgcn_end_cf:
11355 Op->getOperand(2), Chain),
11357 case Intrinsic::amdgcn_s_barrier_init:
11358 case Intrinsic::amdgcn_s_barrier_signal_var: {
11365 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11366 ? AMDGPU::S_BARRIER_INIT_M0
11367 : AMDGPU::S_BARRIER_SIGNAL_M0;
11382 constexpr unsigned ShAmt = 16;
11389 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11394 case Intrinsic::amdgcn_s_barrier_join: {
11403 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11406 unsigned BarID = (BarVal >> 4) & 0x3F;
11409 Ops.push_back(Chain);
11411 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11421 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11427 case Intrinsic::amdgcn_s_prefetch_data: {
11430 return Op.getOperand(0);
11433 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11435 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11442 Op->getVTList(), Ops, M->getMemoryVT(),
11443 M->getMemOperand());
11445 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11446 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11447 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11456 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11458 return lowerImage(Op, ImageDimIntr, DAG, true);
11483std::pair<SDValue, SDValue>
11513 unsigned Overflow = ImmOffset & ~MaxImm;
11514 ImmOffset -= Overflow;
11515 if ((int32_t)Overflow < 0) {
11516 Overflow += ImmOffset;
11521 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11540void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11542 Align Alignment) const {
11544 SDLoc DL(CombinedOffset);
11546 uint32_t Imm = C->getZExtValue();
11547 uint32_t SOffset, ImmOffset;
11548 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11558 uint32_t SOffset, ImmOffset;
11561 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11569 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11578SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11581 return MaybePointer;
11595 SDValue NumRecords = Op->getOperand(3);
11598 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11601 std::optional<uint32_t> ConstStride = std::nullopt;
11603 ConstStride = ConstNode->getZExtValue();
11606 if (!ConstStride || *ConstStride != 0) {
11609 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
11620 NewHighHalf, NumRecords, Flags);
11621 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
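// In outline, the resource is assembled from the split pointer: the low dword
// of the base address is used as-is, a non-zero stride (if any) is shifted
// left by 16 and merged into the high dword, and NumRecords plus the flags
// word complete the four dwords, which are finally bitcast to the i128 buffer
// fat-pointer value above.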
11630 bool IsTFE) const {
11639 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11654 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11658 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11668 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11672 Ops[1] = BufferStoreExt;
11677 M->getMemOperand());
11702 DAGCombinerInfo &DCI) const {
11703 SelectionDAG &DAG = DCI.DAG;
11718 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11725 "unexpected vector extload");
11738 "unexpected fp extload");
11756 DCI.AddToWorklist(Cvt.getNode());
11761 DCI.AddToWorklist(Cvt.getNode());
11764 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11772 if (Info.isEntryFunction())
11773 return Info.getUserSGPRInfo().hasFlatScratchInit();
11781 EVT MemVT = Load->getMemoryVT();
11782 MachineMemOperand *MMO = Load->getMemOperand();
11794 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11822 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
11823 "Custom lowering for non-i32 vectors hasn't been implemented.");
11826 unsigned AS = Load->getAddressSpace();
11833 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11837 !Subtarget->hasMultiDwordFlatScratchAddressing())
11847 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
11850 Alignment >= Align(4) && NumElements < 32) {
11852 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11864 if (NumElements > 4)
11867 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11877 switch (Subtarget->getMaxPrivateElementSize()) {
11883 if (NumElements > 2)
11888 if (NumElements > 4)
11891 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11900 auto Flags = Load->getMemOperand()->getFlags();
11902 Load->getAlign(), Flags, &Fast) &&
11911 MemVT, *Load->getMemOperand())) {
11920 EVT VT = Op.getValueType();
11947 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
11957 EVT VT = Op.getValueType();
11958 const SDNodeFlags Flags = Op->getFlags();
11960 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
11966 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11969 if (CLHS->isExactlyValue(1.0)) {
11986 if (CLHS->isExactlyValue(-1.0)) {
11995 if (!AllowInaccurateRcp &&
11996 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12010 EVT VT = Op.getValueType();
12011 const SDNodeFlags Flags = Op->getFlags();
12013 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12014 if (!AllowInaccurateDiv)
12035 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12049 return DAG.getNode(Opcode, SL, VTList,
12058 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12072 return DAG.getNode(Opcode, SL, VTList,
12078 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12079 return FastLowered;
12082 EVT VT = Op.getValueType();
12089 if (VT == MVT::bf16) {
12112 unsigned FMADOpCode =
12114 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12119 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12121 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
12122 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12128 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12138 SDNodeFlags Flags = Op->getFlags();
12145 const APFloat K0Val(0x1p+96f);
12148 const APFloat K1Val(0x1p-32f);
12175 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12176 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12177 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
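// The FP_DENORM field packs two 2-bit controls: bits [1:0] for f32 and bits
// [3:2] for f64/f16, which is why the f32 value is OR'd with the default
// f64/f16 value shifted left by 2 above. For example, enabling f32 denormals
// (3) while keeping a default f64/f16 setting of 3 yields 3 | (3 << 2) = 0xf.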
12182 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12183 return FastLowered;
12189 SDNodeFlags Flags = Op->getFlags();
12190 Flags.setNoFPExcept(true);
12198 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
12209 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12211 using namespace AMDGPU::Hwreg;
12212 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12216 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12217 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12220 const bool HasDynamicDenormals =
12226 if (!PreservesDenormals) {
12231 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12234 if (HasDynamicDenormals) {
12238 SavedDenormMode = SDValue(GetReg, 0);
12244 SDNode *EnableDenorm;
12245 if (Subtarget->hasDenormModeInst()) {
12246 const SDValue EnableDenormValue =
12253 const SDValue EnableDenormValue =
12255 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12256 {EnableDenormValue, BitField, Glue});
12266 ApproxRcp, One, NegDivScale0, Flags);
12269 ApproxRcp, Fma0, Flags);
12275 NumeratorScaled, Mul, Flags);
12281 NumeratorScaled, Fma3, Flags);
12283 if (!PreservesDenormals) {
12284 SDNode *DisableDenorm;
12285 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12289 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12295 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12296 const SDValue DisableDenormValue =
12297 HasDynamicDenormals
12302 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12313 {Fma4, Fma1, Fma3, Scale}, Flags);
12319 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12320 return FastLowered;
12328 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12332 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12352 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12361 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12362 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12388 EVT VT = Op.getValueType();
12390 if (VT == MVT::f32)
12391 return LowerFDIV32(Op, DAG);
12393 if (VT == MVT::f64)
12394 return LowerFDIV64(Op, DAG);
12396 if (VT == MVT::f16 || VT == MVT::bf16)
12397 return LowerFDIV16(Op, DAG);
12406 EVT ResultExpVT = Op->getValueType(1);
12407 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12417 if (Subtarget->hasFractBug()) {
12435 EVT VT = Store->getMemoryVT();
12437 if (VT == MVT::i1) {
12441 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12445 Store->getValue().getValueType().getScalarType() == MVT::i32);
12447 unsigned AS = Store->getAddressSpace();
12455 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12459 !Subtarget->hasMultiDwordFlatScratchAddressing())
12466 if (NumElements > 4)
12469 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12473 VT, *Store->getMemOperand()))
12479 switch (Subtarget->getMaxPrivateElementSize()) {
12483 if (NumElements > 2)
12487 if (NumElements > 4 ||
12488 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12496 auto Flags = Store->getMemOperand()->getFlags();
12515 assert(!Subtarget->has16BitInsts());
12516 SDNodeFlags Flags = Op->getFlags();
12518 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12530 SDNodeFlags Flags = Op->getFlags();
12531 MVT VT = Op.getValueType().getSimpleVT();
12561 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12564 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12573 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12639 SDNodeFlags Flags = Op->getFlags();
12685 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12702 EVT VT = Op.getValueType();
12712 if (Subtarget->hasTrigReducedRange()) {
12719 switch (Op.getOpcode()) {
12746 EVT VT = Op.getValueType();
12754 Op->getVTList(), Ops, VT,
12763SITargetLowering::performUCharToFloatCombine(SDNode *N,
12764 DAGCombinerInfo &DCI) const {
12765 EVT VT = N->getValueType(0);
12767 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12770 SelectionDAG &DAG = DCI.DAG;
12774 EVT SrcVT = Src.getValueType();
12780 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12783 DCI.AddToWorklist(Cvt.getNode());
12786 if (ScalarVT != MVT::f32) {
12798 DAGCombinerInfo &DCI) const {
12805 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
12809 SelectionDAG &DAG = DCI.DAG;
12828 for (unsigned I = 0; I != NumElts; ++I) {
12852 if (NewElts.size() == 1)
12874 for (unsigned I = 0; I != NumElts; ++I) {
12909SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12911 DAGCombinerInfo &DCI) const {
12929 SelectionDAG &DAG = DCI.DAG;
12942 AM.BaseOffs = Offset.getSExtValue();
12947 EVT VT = N->getValueType(0);
12953 Flags.setNoUnsignedWrap(
12954 N->getFlags().hasNoUnsignedWrap() &&
12964 switch (N->getOpcode()) {
12975 DAGCombinerInfo &DCI) const {
12976 SelectionDAG &DAG = DCI.DAG;
12983 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
12984 N->getMemoryVT(), DCI);
12988 NewOps[PtrIdx] = NewPtr;
12997 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
12998 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13007SDValue SITargetLowering::splitBinaryBitConstantOp(
13011 uint32_t ValLo = Lo_32(Val);
13012 uint32_t ValHi = Hi_32(Val);
13019 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13033 if (V.getValueType() != MVT::i1)
13035 switch (V.getOpcode()) {
13052 return V.getResNo() == 1;
13054 unsigned IntrinsicID = V.getConstantOperandVal(0);
13055 switch (IntrinsicID) {
13056 case Intrinsic::amdgcn_is_shared:
13057 case Intrinsic::amdgcn_is_private:
13074 if (!(C & 0x000000ff))
13075 ZeroByteMask |= 0x000000ff;
13076 if (!(C & 0x0000ff00))
13077 ZeroByteMask |= 0x0000ff00;
13078 if (!(C & 0x00ff0000))
13079 ZeroByteMask |= 0x00ff0000;
13080 if (!(C & 0xff000000))
13081 ZeroByteMask |= 0xff000000;
13082 uint32_t NonZeroByteMask = ~ZeroByteMask;
13083 if ((NonZeroByteMask & C) != NonZeroByteMask)
13096 assert(V.getValueSizeInBits() == 32);
13098 if (V.getNumOperands() != 2)
13107 switch (V.getOpcode()) {
13112 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13117 return (0x03020100 & ~ConstMask) | ConstMask;
13124 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13130 return uint32_t(0x0c0c0c0c03020100ull >> C);
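// As used throughout these combines, a V_PERM_B32 selector encodes one source
// byte per result byte: values 0-3 pick bytes from one 32-bit source, 4-7 from
// the other, and 0x0c produces a constant zero byte (hence the 0x0c0c0c0c
// masks). For example, the identity selector for a single dword is 0x03020100,
// and getPermuteMask above derives the selector that a shift or AND of the
// value is equivalent to.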
13137 DAGCombinerInfo &DCI) const {
13138 if (DCI.isBeforeLegalize())
13141 SelectionDAG &DAG = DCI.DAG;
13142 EVT VT = N->getValueType(0);
13147 if (VT == MVT::i64 && CRHS) {
13149 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
13153 if (CRHS && VT == MVT::i32) {
13163 unsigned Shift = CShift->getZExtValue();
13165 unsigned Offset = NB + Shift;
13166 if ((Offset & (Bits - 1)) == 0) {
13190 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13205 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
13210 if (X != LHS.getOperand(1))
13214 const ConstantFPSDNode *C1 =
13248 (RHS.getOperand(0) == LHS.getOperand(0) &&
13249 LHS.getOperand(0) == LHS.getOperand(1))) {
13251 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13252 : Mask->getZExtValue() & OrdMask;
13273 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13276 if (LHSMask != ~0u && RHSMask != ~0u) {
13279 if (LHSMask > RHSMask) {
13286 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13287 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13290 if (!(LHSUsedLanes & RHSUsedLanes) &&
13293 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13299 uint32_t Mask = LHSMask & RHSMask;
13300 for (unsigned I = 0; I < 32; I += 8) {
13301 uint32_t ByteSel = 0xff << I;
13302 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13303 Mask &= (0x0c << I) & 0xffffffff;
13308 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13361static const std::optional<ByteProvider<SDValue>>
13363 unsigned Depth = 0) {
13366 return std::nullopt;
13368 if (Op.getValueSizeInBits() < 8)
13369 return std::nullopt;
13371 if (Op.getValueType().isVector())
13374 switch (Op->getOpcode()) {
13386 NarrowVT = VTSign->getVT();
13389 return std::nullopt;
13392 if (SrcIndex >= NarrowByteWidth)
13393 return std::nullopt;
13401 return std::nullopt;
13403 uint64_t BitShift = ShiftOp->getZExtValue();
13405 if (BitShift % 8 != 0)
13406 return std::nullopt;
13408 SrcIndex += BitShift / 8;
13426static const std::optional<ByteProvider<SDValue>>
13428 unsigned StartingIndex = 0) {
13432 return std::nullopt;
13434 unsigned BitWidth = Op.getScalarValueSizeInBits();
13436 return std::nullopt;
13438 return std::nullopt;
13440 bool IsVec = Op.getValueType().isVector();
13441 switch (Op.getOpcode()) {
13444 return std::nullopt;
13449 return std::nullopt;
13453 return std::nullopt;
13456 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13457 return std::nullopt;
13458 if (!LHS || LHS->isConstantZero())
13460 if (!RHS || RHS->isConstantZero())
13462 return std::nullopt;
13467 return std::nullopt;
13471 return std::nullopt;
13473 uint32_t BitMask = BitMaskOp->getZExtValue();
13475 uint32_t IndexMask = 0xFF << (Index * 8);
13477 if ((IndexMask & BitMask) != IndexMask) {
13480 if (IndexMask & BitMask)
13481 return std::nullopt;
13490 return std::nullopt;
13494 if (!ShiftOp || Op.getValueType().isVector())
13495 return std::nullopt;
13497 uint64_t BitsProvided = Op.getValueSizeInBits();
13498 if (BitsProvided % 8 != 0)
13499 return std::nullopt;
13501 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13503 return std::nullopt;
13505 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13506 uint64_t ByteShift = BitShift / 8;
13508 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13509 uint64_t BytesProvided = BitsProvided / 8;
13510 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13511 NewIndex %= BytesProvided;
13518 return std::nullopt;
13522 return std::nullopt;
13524 uint64_t BitShift = ShiftOp->getZExtValue();
13526 return std::nullopt;
13528 auto BitsProvided = Op.getScalarValueSizeInBits();
13529 if (BitsProvided % 8 != 0)
13530 return std::nullopt;
13532 uint64_t BytesProvided = BitsProvided / 8;
13533 uint64_t ByteShift = BitShift / 8;
13538 return BytesProvided - ByteShift > Index
13546 return std::nullopt;
13550 return std::nullopt;
13552 uint64_t BitShift = ShiftOp->getZExtValue();
13553 if (BitShift % 8 != 0)
13554 return std::nullopt;
13555 uint64_t ByteShift = BitShift / 8;
13561 return Index < ByteShift
13564 Depth + 1, StartingIndex);
13573 return std::nullopt;
13581 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13583 if (NarrowBitWidth % 8 != 0)
13584 return std::nullopt;
13585 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13587 if (Index >= NarrowByteWidth)
13589 ? std::optional<ByteProvider<SDValue>>(
13597 return std::nullopt;
13601 if (NarrowByteWidth >= Index) {
13606 return std::nullopt;
13613 return std::nullopt;
13619 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13620 if (NarrowBitWidth % 8 != 0)
13621 return std::nullopt;
13622 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13627 if (Index >= NarrowByteWidth) {
13629 ? std::optional<ByteProvider<SDValue>>(
13634 if (NarrowByteWidth > Index) {
13638 return std::nullopt;
13643 return std::nullopt;
13646 Depth + 1, StartingIndex);
13652 return std::nullopt;
13653 auto VecIdx = IdxOp->getZExtValue();
13654 auto ScalarSize = Op.getScalarValueSizeInBits();
13655 if (ScalarSize < 32)
13656 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13658 StartingIndex, Index);
13663 return std::nullopt;
13667 return std::nullopt;
13670 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13671 if (IdxMask > 0x07 && IdxMask != 0x0c)
13672 return std::nullopt;
13674 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13675 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13677 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13683 return std::nullopt;
13698 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13705 auto MemVT = L->getMemoryVT();
13708 return L->getMemoryVT().getSizeInBits() == 16;
13718 int Low8 = Mask & 0xff;
13719 int Hi8 = (Mask & 0xff00) >> 8;
13721 assert(Low8 < 8 && Hi8 < 8);
13723 bool IsConsecutive = (Hi8 - Low8 == 1);
13728 bool Is16Aligned = !(Low8 % 2);
13730 return IsConsecutive && Is16Aligned;
13738 int Low16 = PermMask & 0xffff;
13739 int Hi16 = (PermMask & 0xffff0000) >> 16;
13749 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13751 if (!OtherOpIs16Bit)
13759 unsigned DWordOffset) {
13764 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13769 if (Src.getValueType().isVector()) {
13770 auto ScalarTySize = Src.getScalarValueSizeInBits();
13771 auto ScalarTy = Src.getValueType().getScalarType();
13772 if (ScalarTySize == 32) {
13776 if (ScalarTySize > 32) {
13779 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13780 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13787 assert(ScalarTySize < 32);
13788 auto NumElements = TypeSize / ScalarTySize;
13789 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13790 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13791 auto NumElementsIn32 = 32 / ScalarTySize;
13792 auto NumAvailElements = DWordOffset < Trunc32Elements
13794 : NumElements - NormalizedTrunc;
13807 auto ShiftVal = 32 * DWordOffset;
13815 [[maybe_unused]] EVT VT = N->getValueType(0);
13820 for (int i = 0; i < 4; i++) {
13822 std::optional<ByteProvider<SDValue>> P =
13825 if (!P || P->isConstantZero())
13830 if (PermNodes.size() != 4)
13833 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13834 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13836 for (size_t i = 0; i < PermNodes.size(); i++) {
13837 auto PermOp = PermNodes[i];
13840 int SrcByteAdjust = 4;
13844 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13845 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13847 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13848 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13852 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13853 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13856 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13858 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13861 SDValue Op = *PermNodes[FirstSrc.first].Src;
13863 assert(Op.getValueSizeInBits() == 32);
13867 int Low16 = PermMask & 0xffff;
13868 int Hi16 = (PermMask & 0xffff0000) >> 16;
13870 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13871 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13874 if (WellFormedLow && WellFormedHi)
13878 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13887 assert(Op.getValueType().isByteSized() &&
13905 DAGCombinerInfo &DCI) const {
13906 SelectionDAG &DAG = DCI.DAG;
13910 EVT VT = N->getValueType(0);
13911 if (VT == MVT::i1) {
13916 if (Src != RHS.getOperand(0))
13921 if (!CLHS || !CRHS)
13925 static const uint32_t MaxMask = 0x3ff;
13945 Sel |= LHS.getConstantOperandVal(2);
13954 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13958 auto usesCombinedOperand = [](SDNode *OrUse) {
13960 if (OrUse->getOpcode() != ISD::BITCAST ||
13961 !OrUse->getValueType(0).isVector())
13965 for (auto *VUser : OrUse->users()) {
13966 if (!VUser->getValueType(0).isVector())
13973 if (VUser->getOpcode() == VectorwiseOp)
13979 if (!any_of(N->users(), usesCombinedOperand))
13985 if (LHSMask != ~0u && RHSMask != ~0u) {
13988 if (LHSMask > RHSMask) {
13995 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13996 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13999 if (!(LHSUsedLanes & RHSUsedLanes) &&
14002 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14004 LHSMask &= ~RHSUsedLanes;
14005 RHSMask &= ~LHSUsedLanes;
14007 LHSMask |= LHSUsedLanes & 0x04040404;
14009 uint32_t Sel = LHSMask | RHSMask;
14017 if (LHSMask == ~0u || RHSMask == ~0u) {
14058 return IdentitySrc;
14064 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14079 if (SrcVT == MVT::i32) {
14084 DCI.AddToWorklist(LowOr.getNode());
14085 DCI.AddToWorklist(HiBits.getNode());
14089 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14096 N->getOperand(0), CRHS))
14104 DAGCombinerInfo &DCI) const {
14105 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14112 SelectionDAG &DAG = DCI.DAG;
14114 EVT VT = N->getValueType(0);
14115 if (CRHS && VT == MVT::i64) {
14117 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14124 unsigned Opc = LHS.getOpcode();
14148 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14150 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14154 LHS->getOperand(0), FNegLHS, FNegRHS);
14155 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14163 DAGCombinerInfo &DCI) const {
14164 if (!Subtarget->has16BitInsts() ||
14168 EVT VT = N->getValueType(0);
14169 if (VT != MVT::i32)
14173 if (Src.getValueType() != MVT::i16)
14180SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14181 DAGCombinerInfo &DCI) const {
14188 VTSign->getVT() == MVT::i8) ||
14190 VTSign->getVT() == MVT::i16))) {
14191 assert(Subtarget->hasScalarSubwordLoads() &&
14192 "s_buffer_load_{u8, i8} are supported "
14193 "in GFX12 (or newer) architectures.");
14194 EVT VT = Src.getValueType();
14199 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14206 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14207 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14212 VTSign->getVT() == MVT::i8) ||
14214 VTSign->getVT() == MVT::i16)) &&
14223 Src.getOperand(6), Src.getOperand(7)};
14226 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14230 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14231 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14232 return DCI.DAG.getMergeValues(
14233 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
14239 DAGCombinerInfo &DCI) const {
14240 SelectionDAG &DAG = DCI.DAG;
14247 if (N->getOperand(0).isUndef())
14254 DAGCombinerInfo &DCI) const {
14255 EVT VT = N->getValueType(0);
14270 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14280 unsigned MaxDepth) const {
14281 unsigned Opcode = Op.getOpcode();
14286 const auto &F = CFP->getValueAPF();
14287 if (F.isNaN() && F.isSignaling())
14289 if (!F.isDenormal())
14315 case ISD::FP_EXTEND:
14316 case ISD::FP16_TO_FP:
14317 case ISD::FP_TO_FP16:
14318 case ISD::BF16_TO_FP:
14319 case ISD::FP_TO_BF16:
14352 if (Op.getValueType() == MVT::i32) {
14358 if (RHS->getZExtValue() == 0xffff0000) {
14368 return Op.getValueType().getScalarType() != MVT::f16;
14372 case ISD::FMINNUM_IEEE:
14373 case ISD::FMAXNUM_IEEE:
14374 case ISD::FMINIMUM:
14375 case ISD::FMAXIMUM:
14376 case ISD::FMINIMUMNUM:
14377 case ISD::FMAXIMUMNUM:
14389 if (Subtarget->supportsMinMaxDenormModes() ||
14399 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14411 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14438 if (Op.getValueType() == MVT::i16) {
14441 TruncSrc.getOpcode() == ISD::BITCAST &&
14449 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14451 switch (IntrinsicID) {
14452 case Intrinsic::amdgcn_cvt_pkrtz:
14453 case Intrinsic::amdgcn_cubeid:
14454 case Intrinsic::amdgcn_frexp_mant:
14455 case Intrinsic::amdgcn_fdot2:
14456 case Intrinsic::amdgcn_rcp:
14457 case Intrinsic::amdgcn_rsq:
14458 case Intrinsic::amdgcn_rsq_clamp:
14459 case Intrinsic::amdgcn_rcp_legacy:
14460 case Intrinsic::amdgcn_rsq_legacy:
14461 case Intrinsic::amdgcn_trig_preop:
14462 case Intrinsic::amdgcn_tanh:
14463 case Intrinsic::amdgcn_log:
14464 case Intrinsic::amdgcn_exp2:
14465 case Intrinsic::amdgcn_sqrt:
14483 unsigned MaxDepth) const {
14486 unsigned Opcode = MI->getOpcode();
14488 if (Opcode == AMDGPU::G_FCANONICALIZE)
14491 std::optional<FPValueAndVReg> FCR;
14494 if (FCR->Value.isSignaling())
14496 if (!FCR->Value.isDenormal())
14507 case AMDGPU::G_FADD:
14508 case AMDGPU::G_FSUB:
14509 case AMDGPU::G_FMUL:
14510 case AMDGPU::G_FCEIL:
14511 case AMDGPU::G_FFLOOR:
14512 case AMDGPU::G_FRINT:
14513 case AMDGPU::G_FNEARBYINT:
14514 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14515 case AMDGPU::G_INTRINSIC_TRUNC:
14516 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14517 case AMDGPU::G_FMA:
14518 case AMDGPU::G_FMAD:
14519 case AMDGPU::G_FSQRT:
14520 case AMDGPU::G_FDIV:
14521 case AMDGPU::G_FREM:
14522 case AMDGPU::G_FPOW:
14523 case AMDGPU::G_FPEXT:
14524 case AMDGPU::G_FLOG:
14525 case AMDGPU::G_FLOG2:
14526 case AMDGPU::G_FLOG10:
14527 case AMDGPU::G_FPTRUNC:
14528 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14529 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14530 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14531 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14532 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14534 case AMDGPU::G_FNEG:
14535 case AMDGPU::G_FABS:
14536 case AMDGPU::G_FCOPYSIGN:
14538 case AMDGPU::G_FMINNUM:
14539 case AMDGPU::G_FMAXNUM:
14540 case AMDGPU::G_FMINNUM_IEEE:
14541 case AMDGPU::G_FMAXNUM_IEEE:
14542 case AMDGPU::G_FMINIMUM:
14543 case AMDGPU::G_FMAXIMUM:
14544 case AMDGPU::G_FMINIMUMNUM:
14545 case AMDGPU::G_FMAXIMUMNUM: {
14546 if (Subtarget->supportsMinMaxDenormModes() ||
14553 case AMDGPU::G_BUILD_VECTOR:
14558 case AMDGPU::G_INTRINSIC:
14559 case AMDGPU::G_INTRINSIC_CONVERGENT:
14561 case Intrinsic::amdgcn_fmul_legacy:
14562 case Intrinsic::amdgcn_fmad_ftz:
14563 case Intrinsic::amdgcn_sqrt:
14564 case Intrinsic::amdgcn_fmed3:
14565 case Intrinsic::amdgcn_sin:
14566 case Intrinsic::amdgcn_cos:
14567 case Intrinsic::amdgcn_log:
14568 case Intrinsic::amdgcn_exp2:
14569 case Intrinsic::amdgcn_log_clamp:
14570 case Intrinsic::amdgcn_rcp:
14571 case Intrinsic::amdgcn_rcp_legacy:
14572 case Intrinsic::amdgcn_rsq:
14573 case Intrinsic::amdgcn_rsq_clamp:
14574 case Intrinsic::amdgcn_rsq_legacy:
14575 case Intrinsic::amdgcn_div_scale:
14576 case Intrinsic::amdgcn_div_fmas:
14577 case Intrinsic::amdgcn_div_fixup:
14578 case Intrinsic::amdgcn_fract:
14579 case Intrinsic::amdgcn_cvt_pkrtz:
14580 case Intrinsic::amdgcn_cubeid:
14581 case Intrinsic::amdgcn_cubema:
14582 case Intrinsic::amdgcn_cubesc:
14583 case Intrinsic::amdgcn_cubetc:
14584 case Intrinsic::amdgcn_frexp_mant:
14585 case Intrinsic::amdgcn_fdot2:
14586 case Intrinsic::amdgcn_trig_preop:
14587 case Intrinsic::amdgcn_tanh:
14606 if (C.isDenormal()) {
14620 if (C.isSignaling()) {
14643SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14644 DAGCombinerInfo &DCI) const {
14645 SelectionDAG &DAG = DCI.DAG;
14647 EVT VT = N->getValueType(0);
14656 EVT VT = N->getValueType(0);
14657 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14673 EVT EltVT = Lo.getValueType();
14676 for (unsigned I = 0; I != 2; ++I) {
14680 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14681 } else if (Op.isUndef()) {
14715 case ISD::FMAXNUM_IEEE:
14716 case ISD::FMAXIMUMNUM:
14718 case ISD::FMAXIMUM:
14725 case ISD::FMINNUM_IEEE:
14726 case ISD::FMINIMUMNUM:
14728 case ISD::FMINIMUM:
14754 if (!MinK || !MaxK)
14767 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14768 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
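// performIntMed3ImmCombine turns a min(max(x, K0), K1) clamp with constant
// bounds into a single med3 when that is legal for the type, e.g.
//   smin(smax(x, -5), 17)  ->  v_med3_i32 x, -5, 17
// assuming K0 <= K1; the FP path below does the same thing with fmed3.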
14827 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14833 if (Info->getMode().DX10Clamp) {
14842 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14870 case ISD::FMINNUM_IEEE:
14871 case ISD::FMAXNUM_IEEE:
14872 case ISD::FMINIMUMNUM:
14873 case ISD::FMAXIMUMNUM:
14876 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
14878 case ISD::FMINIMUM:
14879 case ISD::FMAXIMUM:
14887 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
14896 DAGCombinerInfo &DCI) const {
14897 SelectionDAG &DAG = DCI.DAG;
14929 if (SDValue Med3 = performIntMed3ImmCombine(
14934 if (SDValue Med3 = performIntMed3ImmCombine(
14940 if (SDValue Med3 = performIntMed3ImmCombine(
14945 if (SDValue Med3 = performIntMed3ImmCombine(
14955 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14956 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14957 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14960 (VT == MVT::f32 || VT == MVT::f64 ||
14961 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14962 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14963 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14964 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14966 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
14973 const SDNodeFlags Flags = N->getFlags();
14974 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
14975 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
14977 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
14978 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
14988 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
14989 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
14998 DAGCombinerInfo &DCI) const {
14999 EVT VT = N->getValueType(0);
15003 SelectionDAG &DAG = DCI.DAG;
15018 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15022 if (Info->getMode().DX10Clamp) {
15042 DAGCombinerInfo &DCI) const {
15046 return DCI.DAG.getUNDEF(N->getValueType(0));
15054 bool IsDivergentIdx,
15059 unsigned VecSize = EltSize * NumElem;
15062 if (VecSize <= 64 && EltSize < 32)
15071 if (IsDivergentIdx)
15075 unsigned NumInsts = NumElem +
15076 ((EltSize + 31) / 32) * NumElem;
15080 if (Subtarget->useVGPRIndexMode())
15081 return NumInsts <= 16;
15085 if (Subtarget->hasMovrel())
15086 return NumInsts <= 15;
15092 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15107SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15108 DAGCombinerInfo &DCI) const {
15114 EVT ResVT = N->getValueType(0);
15138 if (!C || C->getZExtValue() != 0x1f)
15154 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15173 case ISD::FMAXNUM_IEEE:
15174 case ISD::FMINNUM_IEEE:
15175 case ISD::FMAXIMUM:
15176 case ISD::FMINIMUM: {
15182 DCI.AddToWorklist(Elt0.getNode());
15183 DCI.AddToWorklist(Elt1.getNode());
15205 if (!DCI.isBeforeLegalize())
15213 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15216 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15217 unsigned EltIdx = BitIndex / 32;
15218 unsigned LeftoverBitIdx = BitIndex % 32;
15222 DCI.AddToWorklist(Cast.getNode());
15226 DCI.AddToWorklist(Elt.getNode());
15229 DCI.AddToWorklist(Srl.getNode());
15233 DCI.AddToWorklist(Trunc.getNode());
15235 if (VecEltVT == ResVT) {
15236 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15247SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15248 DAGCombinerInfo &DCI) const {
15259 SelectionDAG &DAG = DCI.DAG;
15278 if (Src.getOpcode() == ISD::FP_EXTEND &&
15279 Src.getOperand(0).getValueType() == MVT::f16) {
15280 return Src.getOperand(0);
15284 APFloat Val = CFP->getValueAPF();
15285 bool LosesInfo = true;
15295 DAGCombinerInfo &DCI) const {
15296 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15297 "combine only useful on gfx8");
15299 SDValue TruncSrc = N->getOperand(0);
15300 EVT VT = N->getValueType(0);
15301 if (VT != MVT::f16)
15308 SelectionDAG &DAG = DCI.DAG;
15336 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15339unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15341 const SDNode *N1) const {
15346 if (((VT == MVT::f32 &&
15348 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15368 EVT VT = N->getValueType(0);
15369 if (VT != MVT::i32 && VT != MVT::i64)
15375 unsigned Opc = N->getOpcode();
15430 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15449 DAGCombinerInfo &DCI) const {
15452 SelectionDAG &DAG = DCI.DAG;
15453 EVT VT = N->getValueType(0);
15463 if (!N->isDivergent() && Subtarget->hasSMulHi())
15467 if (NumBits <= 32 || NumBits > 64)
15478 if (!Subtarget->hasFullRate64Ops()) {
15479 unsigned NumUsers = 0;
15480 for (SDNode *User : LHS->users()) {
15483 if (!User->isAnyAdd())
15507 bool MulSignedLo = false;
15508 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15517 if (VT != MVT::i64) {
15540 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15542 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15543 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15545 if (!MulLHSUnsigned32) {
15552 if (!MulRHSUnsigned32) {
15563 if (VT != MVT::i64)
15569SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15570 DAGCombinerInfo &DCI) const {
15580 SelectionDAG &DAG = DCI.DAG;
15595 unsigned Opcode = N->getOpcode();
15596 if (Opcode == ISD::PTRADD)
15599 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15610static std::optional<ByteProvider<SDValue>>
15613 if (!Byte0 || Byte0->isConstantZero()) {
15614 return std::nullopt;
15617 if (Byte1 && !Byte1->isConstantZero()) {
15618 return std::nullopt;
15624 unsigned FirstCs = First & 0x0c0c0c0c;
15625 unsigned SecondCs = Second & 0x0c0c0c0c;
15626 unsigned FirstNoCs = First & ~0x0c0c0c0c;
15627 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15629 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15630 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15631 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15632 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15634 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
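// addPermMasks merges two per-byte selectors that cover disjoint bytes: the
// 0x0c (zero) lanes of one mask are overwritten by the real selections of the
// other, e.g. combining 0x0c0c0100 with 0x05040c0c yields 0x05040100. The
// asserts above check that, for every byte, at least one of the two inputs is
// the zero selector.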
15658 for (int BPI = 0; BPI < 2; BPI++) {
15661 BPP = {Src1, Src0};
15663 unsigned ZeroMask = 0x0c0c0c0c;
15664 unsigned FMask = 0xFF << (8 * (3 - Step));
15666 unsigned FirstMask =
15667 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15668 unsigned SecondMask =
15669 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15673 int FirstGroup = -1;
15674 for (int I = 0; I < 2; I++) {
15676 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15677 return IterElt.SrcOp == *BPP.first.Src &&
15678 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15682 if (Match != Srcs.end()) {
15683 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15688 if (FirstGroup != -1) {
15690 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15691 return IterElt.SrcOp == *BPP.second.Src &&
15692 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15695 if (Match != Srcs.end()) {
15696 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15698 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15706 unsigned ZeroMask = 0x0c0c0c0c;
15707 unsigned FMask = 0xFF << (8 * (3 - Step));
15711 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15715 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15724 if (Srcs.size() == 1) {
15725 auto *Elt = Srcs.begin();
15729 if (Elt->PermMask == 0x3020100)
15736 auto *FirstElt = Srcs.begin();
15737 auto *SecondElt = std::next(FirstElt);
15744 auto FirstMask = FirstElt->PermMask;
15745 auto SecondMask = SecondElt->PermMask;
15747 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15748 unsigned FirstPlusFour = FirstMask | 0x04040404;
15751 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15763 FirstElt = std::next(SecondElt);
15764 if (FirstElt == Srcs.end())
15767 SecondElt = std::next(FirstElt);
15770 if (SecondElt == Srcs.end()) {
15776 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15782 return Perms.size() == 2
15788 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15789 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15790 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15791 EntryMask += ZeroMask;
15796 auto Opcode = Op.getOpcode();
15802static std::optional<bool>
15813 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15816 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15818 assert(!(S0IsUnsigned && S0IsSigned));
15819 assert(!(S1IsUnsigned && S1IsSigned));
15827 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15833 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15834 return std::nullopt;
15846 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15847 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15852 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15858 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15859 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15860 return std::nullopt;
15866 DAGCombinerInfo &DCI) const {
15867 SelectionDAG &DAG = DCI.DAG;
15868 EVT VT = N->getValueType(0);
15874 if (Subtarget->hasMad64_32()) {
15875 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15880 if (SDValue V = reassociateScalarOps(N, DAG)) {
15884 if (VT == MVT::i64) {
15885 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15890 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
15892 std::optional<bool> IsSigned;
15898 int ChainLength = 0;
15899 for (int I = 0; I < 4; I++) {
15903 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15906 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15911 TempNode->getOperand(MulIdx), *Src0, *Src1,
15912 TempNode->getOperand(MulIdx)->getOperand(0),
15913 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15917 IsSigned = *IterIsSigned;
15918 if (*IterIsSigned != *IsSigned)
15921 auto AddIdx = 1 - MulIdx;
15924 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
15925 Src2s.push_back(TempNode->getOperand(AddIdx));
15935 TempNode->getOperand(AddIdx), *Src0, *Src1,
15936 TempNode->getOperand(AddIdx)->getOperand(0),
15937 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15941 if (*IterIsSigned != *IsSigned)
15945 ChainLength = I + 2;
15949 TempNode = TempNode->getOperand(AddIdx);
15951 ChainLength = I + 1;
15952 if (TempNode->getNumOperands() < 2)
15954 LHS = TempNode->getOperand(0);
15955 RHS = TempNode->getOperand(1);
15958 if (ChainLength < 2)
15964 if (ChainLength < 4) {
15974 bool UseOriginalSrc = false;
15975 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
15976 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
15977 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
15978 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
15979 SmallVector<unsigned, 4> SrcBytes;
15980 auto Src0Mask = Src0s.begin()->PermMask;
15981 SrcBytes.push_back(Src0Mask & 0xFF000000);
15982 bool UniqueEntries = true;
15983 for (auto I = 1; I < 4; I++) {
15984 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
15987 UniqueEntries = false;
15993 if (UniqueEntries) {
15994 UseOriginalSrc = true;
15996 auto *FirstElt = Src0s.begin();
16000 auto *SecondElt = Src1s.begin();
16002 SecondElt->DWordOffset);
16011 if (!UseOriginalSrc) {
16018 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16021 : Intrinsic::amdgcn_udot4,
16031 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16036 unsigned Opc = LHS.getOpcode();
16048 auto Cond = RHS.getOperand(0);
16053 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16070 DAGCombinerInfo &DCI) const {
16071 SelectionDAG &DAG = DCI.DAG;
16073 EVT VT = N->getValueType(0);
16086 SDNodeFlags ShlFlags = N1->getFlags();
16090 SDNodeFlags NewShlFlags =
16095 DCI.AddToWorklist(Inner.getNode());
16102 if (Subtarget->hasMad64_32()) {
16103 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16112 if (VT == MVT::i64) {
16113 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16121 if (const GlobalAddressSDNode *GA =
16126 SDNodeFlags Flags =
16129 DCI.AddToWorklist(Inner.getNode());
16157 SDNodeFlags ReassocFlags =
16160 if (ZIsConstant != YIsConstant) {
16164 DCI.AddToWorklist(Inner.getNode());
16172 assert(!YIsConstant && !ZIsConstant);
16174 if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
16183 if (Y->isDivergent())
16186 DCI.AddToWorklist(UniformInner.getNode());
16194 DAGCombinerInfo &DCI) const {
16195 SelectionDAG &DAG = DCI.DAG;
16196 EVT VT = N->getValueType(0);
16198 if (VT == MVT::i64) {
16199 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16203 if (VT != MVT::i32)
16212 unsigned Opc = RHS.getOpcode();
16219 auto Cond = RHS.getOperand(0);
16224 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16242SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16243 DAGCombinerInfo &DCI) const {
16245 if (N->getValueType(0) != MVT::i32)
16251 SelectionDAG &DAG = DCI.DAG;
16256 unsigned LHSOpc = LHS.getOpcode();
16257 unsigned Opc = N->getOpcode();
16261 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16267 DAGCombinerInfo &DCI) const {
16271 SelectionDAG &DAG = DCI.DAG;
16272 EVT VT = N->getValueType(0);
16284 if (A == LHS.getOperand(1)) {
16285 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16286 if (FusedOp != 0) {
16288 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16296 if (A == RHS.getOperand(1)) {
16297 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16298 if (FusedOp != 0) {
16300 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16309 DAGCombinerInfo &DCI) const {
16313 SelectionDAG &DAG = DCI.DAG;
16315 EVT VT = N->getValueType(0);
16328 if (A == LHS.getOperand(1)) {
16329 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16330 if (FusedOp != 0) {
16334 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16343 if (A == RHS.getOperand(1)) {
16344 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16345 if (FusedOp != 0) {
16347 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16356 DAGCombinerInfo &DCI) const {
16357 SelectionDAG &DAG = DCI.DAG;
16359 EVT VT = N->getValueType(0);
16360 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
16366 SDNodeFlags Flags = N->getFlags();
16367 SDNodeFlags RHSFlags = RHS->getFlags();
16373 bool IsNegative = false;
16374 if (CLHS->isExactlyValue(1.0) ||
16375 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16378 if (RHS.getOpcode() == ISD::FSQRT) {
16382 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16391 DAGCombinerInfo &DCI) const {
16392 SelectionDAG &DAG = DCI.DAG;
16393 EVT VT = N->getValueType(0);
16397 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16398 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16413 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16418 const ConstantFPSDNode *FalseNode =
16428 if (ScalarVT == MVT::f32 &&
16434 if (TrueNodeExpVal == INT_MIN)
16437 if (FalseNodeExpVal == INT_MIN)
16450 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
16457 DAGCombinerInfo &DCI) const {
16458 SelectionDAG &DAG = DCI.DAG;
16459 EVT VT = N->getValueType(0);
16462 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16480 (N->getFlags().hasAllowContract() &&
16481 FMA->getFlags().hasAllowContract())) {
16496 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16515 if (Vec1 == Vec2 || Vec3 == Vec4)
16521 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16530 DAGCombinerInfo &DCI) const {
16531 SelectionDAG &DAG = DCI.DAG;
16536 EVT VT = LHS.getValueType();
16565 return LHS.getOperand(0);
16573 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16580 const APInt &CT = LHS.getConstantOperandAPInt(1);
16581 const APInt &CF = LHS.getConstantOperandAPInt(2);
16589 return LHS.getOperand(0);
16593 if (VT != MVT::f32 && VT != MVT::f64 &&
16594 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16602 LHS.getOpcode() == ISD::FABS) {
16609 const unsigned IsInfMask =
16611 const unsigned IsFiniteMask =
16625SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16626 DAGCombinerInfo &DCI) const {
16627 SelectionDAG &DAG = DCI.DAG;
16648 unsigned ShiftOffset = 8 * Offset;
16650 ShiftOffset -= C->getZExtValue();
16652 ShiftOffset += C->getZExtValue();
16654 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16656 MVT::f32, Shifted);
16667 DCI.AddToWorklist(N);
16674 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16680 DAGCombinerInfo &DCI) const {
16685 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16689 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16690 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16693 APFloat One(F.getSemantics(), "1.0");
16695 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16701 DAGCombinerInfo &DCI) const {
16722 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16723 bool isInteger = LHS.getValueType().isInteger();
16726 if (!isFloatingPoint && !isInteger)
16731 if (!isEquality && !isNonEquality)
16748 if (isFloatingPoint) {
16750 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16761 if (!(isEquality && TrueVal == ConstVal) &&
16762 !(isNonEquality && FalseVal == ConstVal))
16769 SelectLHS, SelectRHS);
16774 switch (N->getOpcode()) {
16790 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16800 switch (N->getOpcode()) {
16802 return performAddCombine(N, DCI);
16804 return performPtrAddCombine(N, DCI);
16806 return performSubCombine(N, DCI);
16809 return performAddCarrySubCarryCombine(N, DCI);
16811 return performFAddCombine(N, DCI);
16813 return performFSubCombine(N, DCI);
16815 return performFDivCombine(N, DCI);
16817 return performFMulCombine(N, DCI);
16819 return performSetCCCombine(N, DCI);
16821 if (auto Res = performSelectCombine(N, DCI))
16826 case ISD::FMAXNUM_IEEE:
16827 case ISD::FMINNUM_IEEE:
16828 case ISD::FMAXIMUM:
16829 case ISD::FMINIMUM:
16830 case ISD::FMAXIMUMNUM:
16831 case ISD::FMINIMUMNUM:
16838 return performMinMaxCombine(N, DCI);
16840 return performFMACombine(N, DCI);
16842 return performAndCombine(N, DCI);
16844 return performOrCombine(N, DCI);
16847 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
16848 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16854 return performXorCombine(N, DCI);
16856 return performZeroExtendCombine(N, DCI);
16858 return performSignExtendInRegCombine(N, DCI);
16860 return performClassCombine(N, DCI);
16862 return performFCanonicalizeCombine(N, DCI);
16864 return performRcpCombine(N, DCI);
16879 return performUCharToFloatCombine(N, DCI);
16881 return performFCopySignCombine(N, DCI);
16886 return performCvtF32UByteNCombine(N, DCI);
16888 return performFMed3Combine(N, DCI);
16890 return performCvtPkRTZCombine(N, DCI);
16892 return performClampCombine(N, DCI);
16895 EVT VT = N->getValueType(0);
16898 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16901 EVT EltVT = Src.getValueType();
16902 if (EltVT != MVT::i16)
16903 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
16906 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
16912 return performExtractVectorEltCombine(N, DCI);
16914 return performInsertVectorEltCombine(N, DCI);
16916 return performFPRoundCombine(N, DCI);
16925 return performMemSDNodeCombine(MemNode, DCI);
16956   unsigned Opcode = Node->getMachineOpcode();
16959   int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16960   if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
16963   SDNode *Users[5] = {nullptr};
16965   unsigned DmaskIdx =
16966       AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16967   unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
16968   unsigned NewDmask = 0;
16969   unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16970   unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16971   bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
16972                  (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
16973   unsigned TFCLane = 0;
16974   bool HasChain = Node->getNumValues() > 1;
16976   if (OldDmask == 0) {
16984     TFCLane = OldBitsSet;
16988   for (SDUse &Use : Node->uses()) {
16991     if (Use.getResNo() != 0)
16994     SDNode *User = Use.getUser();
16997     if (!User->isMachineOpcode() ||
16998         User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17010     if (UsesTFC && Lane == TFCLane) {
17015       for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17017         Dmask &= ~(1 << Comp);
17025     NewDmask |= 1 << Comp;
17030   bool NoChannels = !NewDmask;
17037     if (OldBitsSet == 1)
17043   if (NewDmask == OldDmask)
17052   unsigned NewChannels = BitsSet + UsesTFC;
17056   assert(NewOpcode != -1 &&
17057          NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17058          "failed to find equivalent MIMG op");
17066   MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17068   MVT ResultVT = NewChannels == 1
17071                      : NewChannels == 5 ? 8
17073   SDVTList NewVTList =
17076   MachineSDNode *NewNode =
17085   if (NewChannels == 1) {
17095   for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17100     if (i || !NoChannels)
17105     if (NewUser != User) {
17115       Idx = AMDGPU::sub1;
17118       Idx = AMDGPU::sub2;
17121       Idx = AMDGPU::sub3;
17124       Idx = AMDGPU::sub4;
17135     Op = Op.getOperand(0);
17156       MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17160                                    Node->getOperand(0), SL, VReg, SrcVal,
17166   return ToResultReg.getNode();
17171   for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17173     Ops.push_back(Node->getOperand(i));
17179                               Node->getOperand(i).getValueType(),
17180                               Node->getOperand(i)),
17192   unsigned Opcode = Node->getMachineOpcode();
17194   if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17195       !TII->isGather4(Opcode) &&
17197     return adjustWritemask(Node, DAG);
17200 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17206 case AMDGPU::V_DIV_SCALE_F32_e64:
17207 case AMDGPU::V_DIV_SCALE_F64_e64: {
17217 (Src0 == Src1 || Src0 == Src2))
17273       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17274   unsigned InitIdx = 0;
17276   if (TII->isImage(MI)) {
17284     unsigned TFEVal = TFE ? TFE->getImm() : 0;
17285     unsigned LWEVal = LWE ? LWE->getImm() : 0;
17286     unsigned D16Val = D16 ? D16->getImm() : 0;
17288     if (!TFEVal && !LWEVal)
17299     assert(MO_Dmask && "Expected dmask operand in instruction");
17301     unsigned dmask = MO_Dmask->getImm();
17306     bool Packed = !Subtarget->hasUnpackedD16VMem();
17308     InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17314         TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17315     if (DstSize < InitIdx)
17318     InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17326   Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17327   unsigned NewDst = 0;
17332   unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17333   unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17336   for (; SizeLeft; SizeLeft--, CurrIdx++) {
17337     NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17357   MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17370   if (TII->isVOP3(MI.getOpcode())) {
17372     TII->legalizeOperandsVOP3(MRI, MI);
17377   if (!MI.getDesc().operands().empty()) {
17378     unsigned Opc = MI.getOpcode();
17379     bool HasAGPRs = Info->mayNeedAGPRs();
17381     int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
17383          {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
17384           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
17387       if ((I == Src2Idx) && (HasAGPRs))
17390       if (!Op.isReg() || !Op.getReg().isVirtual())
17392       auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
17393       if (!TRI->hasAGPRs(RC))
17395       auto *Src = MRI.getUniqueVRegDef(Op.getReg());
17396       if (!Src || !Src->isCopy() ||
17397           !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
17399       auto *NewRC = TRI->getEquivalentVGPRClass(RC);
17403       MRI.setRegClass(Op.getReg(), NewRC);
17406     if (TII->isMAI(MI)) {
17411       int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17412                                                AMDGPU::OpName::scale_src0);
17413       if (Src0Idx != -1) {
17414         int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17415                                                  AMDGPU::OpName::scale_src1);
17416         if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17417             TII->usesConstantBus(MRI, MI, Src1Idx))
17418           TII->legalizeOpWithMove(MI, Src1Idx);
17426     if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
17427       if (Src2->isReg() && Src2->getReg().isVirtual()) {
17428         auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
17429         if (TRI->isVectorSuperClass(RC)) {
17430           auto *NewRC = TRI->getEquivalentAGPRClass(RC);
17431           MRI.setRegClass(Src2->getReg(), NewRC);
17432           if (Src2->isTied())
17433             MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
17442   if (TII->isImage(MI))
17443     TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17517 std::pair<unsigned, const TargetRegisterClass *>
17524   if (Constraint.size() == 1) {
17528     if (VT == MVT::Other)
17531     switch (Constraint[0]) {
17538         RC = &AMDGPU::SReg_32RegClass;
17541         RC = &AMDGPU::SGPR_64RegClass;
17546         return std::pair(0U, nullptr);
17553         RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17554                                              : &AMDGPU::VGPR_32_Lo256RegClass;
17557         RC = Subtarget->has1024AddressableVGPRs()
17558                  ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17561         return std::pair(0U, nullptr);
17566       if (!Subtarget->hasMAIInsts())
17570         RC = &AMDGPU::AGPR_32RegClass;
17575         return std::pair(0U, nullptr);
17580   } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17584       RC = &AMDGPU::AV_32RegClass;
17587       RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17589       return std::pair(0U, nullptr);
17598     return std::pair(0U, RC);
17601   if (Kind != '\0') {
17603       RC = &AMDGPU::VGPR_32_Lo256RegClass;
17604     } else if (Kind == 's') {
17605       RC = &AMDGPU::SGPR_32RegClass;
17606     } else if (Kind == 'a') {
17607       RC = &AMDGPU::AGPR_32RegClass;
17613       return std::pair(0U, nullptr);
17619       return std::pair(0U, nullptr);
17623       RC = TRI->getVGPRClassForBitWidth(Width);
17625       RC = TRI->getSGPRClassForBitWidth(Width);
17627       RC = TRI->getAGPRClassForBitWidth(Width);
17629     Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17634       return std::pair(0U, nullptr);
17636     return std::pair(Reg, RC);
17642     return std::pair(0U, nullptr);
17643   if (Idx < RC->getNumRegs())
17645   return std::pair(0U, nullptr);
17651     Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17657   if (Constraint.size() == 1) {
17658     switch (Constraint[0]) {
17668   } else if (Constraint == "DA" || Constraint == "DB") {
17676   if (Constraint.size() == 1) {
17677     switch (Constraint[0]) {
17685   } else if (Constraint.size() == 2) {
17686     if (Constraint == "VA")
17704                                                      std::vector<SDValue> &Ops,
17719   unsigned Size = Op.getScalarValueSizeInBits();
17723   if (Size == 16 && !Subtarget->has16BitInsts())
17727       Val = C->getSExtValue();
17731       Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17735     if (Size != 16 || Op.getNumOperands() != 2)
17737     if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17740       Val = C->getSExtValue();
17744       Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17754   if (Constraint.size() == 1) {
17755     switch (Constraint[0]) {
17770   } else if (Constraint.size() == 2) {
17771     if (Constraint == "DA") {
17772       int64_t HiBits = static_cast<int32_t>(Val >> 32);
17773       int64_t LoBits = static_cast<int32_t>(Val);
17777     if (Constraint == "DB") {
17785                                               unsigned MaxSize) const {
17786   unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17787   bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17789   MVT VT = Op.getSimpleValueType();
17814 switch (UnalignedClassID) {
17815 case AMDGPU::VReg_64RegClassID:
17816 return AMDGPU::VReg_64_Align2RegClassID;
17817 case AMDGPU::VReg_96RegClassID:
17818 return AMDGPU::VReg_96_Align2RegClassID;
17819 case AMDGPU::VReg_128RegClassID:
17820 return AMDGPU::VReg_128_Align2RegClassID;
17821 case AMDGPU::VReg_160RegClassID:
17822 return AMDGPU::VReg_160_Align2RegClassID;
17823 case AMDGPU::VReg_192RegClassID:
17824 return AMDGPU::VReg_192_Align2RegClassID;
17825 case AMDGPU::VReg_224RegClassID:
17826 return AMDGPU::VReg_224_Align2RegClassID;
17827 case AMDGPU::VReg_256RegClassID:
17828 return AMDGPU::VReg_256_Align2RegClassID;
17829 case AMDGPU::VReg_288RegClassID:
17830 return AMDGPU::VReg_288_Align2RegClassID;
17831 case AMDGPU::VReg_320RegClassID:
17832 return AMDGPU::VReg_320_Align2RegClassID;
17833 case AMDGPU::VReg_352RegClassID:
17834 return AMDGPU::VReg_352_Align2RegClassID;
17835 case AMDGPU::VReg_384RegClassID:
17836 return AMDGPU::VReg_384_Align2RegClassID;
17837 case AMDGPU::VReg_512RegClassID:
17838 return AMDGPU::VReg_512_Align2RegClassID;
17839 case AMDGPU::VReg_1024RegClassID:
17840 return AMDGPU::VReg_1024_Align2RegClassID;
17841 case AMDGPU::AReg_64RegClassID:
17842 return AMDGPU::AReg_64_Align2RegClassID;
17843 case AMDGPU::AReg_96RegClassID:
17844 return AMDGPU::AReg_96_Align2RegClassID;
17845 case AMDGPU::AReg_128RegClassID:
17846 return AMDGPU::AReg_128_Align2RegClassID;
17847 case AMDGPU::AReg_160RegClassID:
17848 return AMDGPU::AReg_160_Align2RegClassID;
17849 case AMDGPU::AReg_192RegClassID:
17850 return AMDGPU::AReg_192_Align2RegClassID;
17851 case AMDGPU::AReg_256RegClassID:
17852 return AMDGPU::AReg_256_Align2RegClassID;
17853 case AMDGPU::AReg_512RegClassID:
17854 return AMDGPU::AReg_512_Align2RegClassID;
17855 case AMDGPU::AReg_1024RegClassID:
17856 return AMDGPU::AReg_1024_Align2RegClassID;
17872 if (Info->isEntryFunction()) {
17879 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17881                          ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17882                          : TRI->getAlignedHighSGPRForRC(MF, 2,
17883                                                         &AMDGPU::SGPR_64RegClass);
17884     Info->setSGPRForEXECCopy(SReg);
17886     assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17887                                Info->getStackPtrOffsetReg()));
17888     if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17889       MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17893     if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17894       MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17896     if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17897       MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17899     Info->limitOccupancy(MF);
17901   if (ST.isWave32() && !MF.empty()) {
17902     for (auto &MBB : MF) {
17903       for (auto &MI : MBB) {
17904         TII->fixImplicitOperands(MI);
17914   if (ST.needsAlignedVGPRs()) {
17915     for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17921       if (NewClassID != -1)
17922         MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17931                                                     const APInt &DemandedElts,
17933                                                     unsigned Depth) const {
17935   unsigned Opc = Op.getOpcode();
17938     unsigned IID = Op.getConstantOperandVal(0);
17940     case Intrinsic::amdgcn_mbcnt_lo:
17941     case Intrinsic::amdgcn_mbcnt_hi: {
17947           IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17957                                                    Op, Known, DemandedElts, DAG, Depth);
17973   unsigned MaxValue =
17980                              unsigned BFEWidth, bool SExt, unsigned Depth) {
17984   unsigned Src1Cst = 0;
17985   if (Src1.isImm()) {
17986     Src1Cst = Src1.getImm();
17987   } else if (Src1.isReg()) {
17991     Src1Cst = Cst->Value.getZExtValue();
18002   if (Width >= BFEWidth)
18011     Known = Known.sext(BFEWidth);
18013     Known = Known.zext(BFEWidth);
18019                                                       unsigned Depth) const {
18022   switch (MI->getOpcode()) {
18023   case AMDGPU::S_BFE_I32:
18026   case AMDGPU::S_BFE_U32:
18029   case AMDGPU::S_BFE_I64:
18032   case AMDGPU::S_BFE_U64:
18035   case AMDGPU::G_INTRINSIC:
18036   case AMDGPU::G_INTRINSIC_CONVERGENT: {
18039     case Intrinsic::amdgcn_workitem_id_x:
18042     case Intrinsic::amdgcn_workitem_id_y:
18045     case Intrinsic::amdgcn_workitem_id_z:
18048     case Intrinsic::amdgcn_mbcnt_lo:
18049     case Intrinsic::amdgcn_mbcnt_hi: {
18061     case Intrinsic::amdgcn_groupstaticsize: {
18072   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18075   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18078   case AMDGPU::G_AMDGPU_SMED3:
18079   case AMDGPU::G_AMDGPU_UMED3: {
18080     auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18107                                                 unsigned Depth) const {
18114   AttributeList Attrs =
18116   if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18143   if (Header->getAlignment() != PrefAlign)
18144     return Header->getAlignment();
18146   unsigned LoopSize = 0;
18151     LoopSize += MBB->getAlignment().value() / 2;
18154       LoopSize += TII->getInstSizeInBytes(MI);
18155       if (LoopSize > 192)
18160   if (LoopSize <= 64)
18163   if (LoopSize <= 128)
18164     return CacheLineAlign;
18170     auto I = Exit->getFirstNonDebugInstr();
18171     if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18172       return CacheLineAlign;
18181     if (PreTerm == Pre->begin() ||
18182         std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18186     auto ExitHead = Exit->getFirstNonDebugInstr();
18187     if (ExitHead == Exit->end() ||
18188         ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18193   return CacheLineAlign;
18201     N = N->getOperand(0).getNode();
18202   if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18211   switch (N->getOpcode()) {
18219     if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18220       return !TRI->isSGPRReg(MRI, Reg);
18226     return !TRI->isSGPRReg(MRI, Reg);
18230     unsigned AS = L->getAddressSpace();
18234   case ISD::CALLSEQ_END:
18263     return A->readMem() && A->writeMem();
18284 switch (Ty.getScalarSizeInBits()) {
18296                                        const APInt &DemandedElts,
18299                                        unsigned Depth) const {
18304   if (Info->getMode().DX10Clamp)
18316   if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18336          << "Hardware instruction generated for atomic "
18338          << " operation at memory scope " << MemScope;
18343   Type *EltTy = VT->getElementType();
18344   return VT->getNumElements() == 2 &&
18364     unsigned BW = IT->getBitWidth();
18365     return BW == 32 || BW == 64;
18379     unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18380     return BW == 32 || BW == 64;
18383   if (Ty->isFloatTy() || Ty->isDoubleTy())
18387   return VT->getNumElements() == 2 &&
18388          VT->getElementType()->getPrimitiveSizeInBits() == 16;
18398                                          bool HasSystemScope) {
18405   if (HasSystemScope) {
18414   return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18427   const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18453       DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18466   bool HasSystemScope =
18492   if (Subtarget->hasEmulatedSystemScopeAtomics())
18508   if (!HasSystemScope &&
18509       Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18521     if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18529         ConstVal && ConstVal->isNullValue())
18567 if (Ty->isFloatTy()) {
18572 if (Ty->isDoubleTy()) {
18593 if (Ty->isFloatTy() &&
18594 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18607     if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18611     if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18615     if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18620     if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18625     if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18629     if (Ty->isFloatTy()) {
18632       if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18635       if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18640           Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18648     if (Subtarget->hasFlatAtomicFaddF32Inst())
18657     if (Subtarget->hasLDSFPAtomicAddF32()) {
18658       if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18660       if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18688     if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18690     if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18694     if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18696     if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18749   if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18750     return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18751                                  : &AMDGPU::SReg_32RegClass;
18752   if (!TRI->isSGPRClass(RC) && !isDivergent)
18753     return TRI->getEquivalentSGPRClass(RC);
18754   if (TRI->isSGPRClass(RC) && isDivergent)
18755     return TRI->getEquivalentVGPRClass(RC);
18767                       unsigned WaveSize) {
18772   if (!IT || IT->getBitWidth() != WaveSize)
18777   if (!Visited.insert(V).second)
18779   bool Result = false;
18780   for (const auto *U : V->users()) {
18782       if (V == U->getOperand(1)) {
18787         case Intrinsic::amdgcn_if_break:
18788         case Intrinsic::amdgcn_if:
18789         case Intrinsic::amdgcn_else:
18794       if (V == U->getOperand(0)) {
18799         case Intrinsic::amdgcn_end_cf:
18800         case Intrinsic::amdgcn_loop:
18806       Result = hasCFUser(U, Visited, WaveSize);
18815                                                const Value *V) const {
18817     if (CI->isInlineAsm()) {
18826       for (auto &TC : TargetConstraints) {
18840   return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18868   return MRI.hasOneNonDBGUse(N0);
18875   if (I.getMetadata("amdgpu.noclobber"))
18877   if (I.getMetadata("amdgpu.last.use"))
18887   if (!Def->isMachineOpcode())
18897   if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18898     PhysReg = AMDGPU::SCC;
18900         TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18966 Alignment = RMW->getAlign();
18979 bool FullFlatEmulation =
18981 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18982 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
18983 RMW->getType()->isDoubleTy()));
18986   bool ReturnValueIsUsed = !AI->use_empty();
18995   if (FullFlatEmulation) {
19006   std::prev(BB->end())->eraseFromParent();
19007   Builder.SetInsertPoint(BB);
19009   Value *LoadedShared = nullptr;
19010   if (FullFlatEmulation) {
19011     CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19012                                                  {Addr}, nullptr, "is.shared");
19013     Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19014     Builder.SetInsertPoint(SharedBB);
19015     Value *CastToLocal = Builder.CreateAddrSpaceCast(
19021     LoadedShared = Clone;
19023     Builder.CreateBr(PhiBB);
19024     Builder.SetInsertPoint(CheckPrivateBB);
19027   CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19028                                                 {Addr}, nullptr, "is.private");
19029   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19031   Builder.SetInsertPoint(PrivateBB);
19033   Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19036   Value *LoadedPrivate;
19038     LoadedPrivate = Builder.CreateAlignedLoad(
19039         RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19042                                LoadedPrivate, RMW->getValOperand());
19044     Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19046     auto [ResultLoad, Equal] =
19052     LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19055   Builder.CreateBr(PhiBB);
19057   Builder.SetInsertPoint(GlobalBB);
19061   if (FullFlatEmulation) {
19062     Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19071   if (!FullFlatEmulation) {
19076     MDNode *RangeNotPrivate =
19079     LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19083   Builder.CreateBr(PhiBB);
19085   Builder.SetInsertPoint(PhiBB);
19087   if (ReturnValueIsUsed) {
19090     if (FullFlatEmulation)
19097   Builder.CreateBr(ExitBB);
19101                                               unsigned PtrOpIdx) {
19102   Value *PtrOp = I->getOperand(PtrOpIdx);
19109   I->setOperand(PtrOpIdx, ASCast);
19121       ConstVal && ConstVal->isNullValue()) {
19151          "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19159          "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19174   LoadInst *LI = Builder.CreateAlignedLoad(
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
iv Induction Variable Users
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static cl::opt< bool > UseSelectionDAGPTRADD("amdgpu-use-sdag-ptradd", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " "SelectionDAG ISel"), cl::init(false))
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned AndSaveExecOpc
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
const Function * getParent() const
Return the enclosing method, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
A parsed version of the target data layout string in and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
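A hedged sketch of CreateFixedObject, as argument lowering might use it; the 4-byte size, offset 0, and the helper name are arbitrary choices.

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
using namespace llvm;

// Hedged sketch: reserve a fixed, immutable 4-byte slot at SP offset 0,
// as a target would for an incoming by-value stack argument.
static int createIncomingArgSlot(MachineFunction &MF) {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.CreateFixedObject(/*Size=*/4, /*SPOffset=*/0, /*IsImmutable=*/true);
}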
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
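A hedged sketch of addLiveIn from the MachineFunction entries above; the wrapper name is illustrative, and the physical register and register class are assumed to be supplied by the caller (e.g. an argument SGPR and its SGPR class in an AMDGPU lowering).

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
using namespace llvm;

// Hedged sketch: declare PReg as a function live-in and obtain the virtual
// register that carries its value, as formal-argument lowering typically does.
static Register addLiveInSketch(MachineFunction &MF, MCRegister PReg,
                                const TargetRegisterClass *RC) {
  return MF.addLiveIn(PReg, RC);
}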
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
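A hedged sketch of the MachineInstrBuilder interface via BuildMI; the opcode, registers, immediate, and successor block are placeholders that a custom inserter would already have on hand.

#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

// Hedged sketch: operands are appended in order with the chained add* calls.
static void buildMISketch(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                          const DebugLoc &DL, const TargetInstrInfo *TII,
                          unsigned Opcode, Register DstReg, Register SrcReg,
                          MachineBasicBlock *Succ) {
  BuildMI(MBB, I, DL, TII->get(Opcode), DstReg)
      .addReg(SrcReg)   // register operand
      .addImm(0)        // immediate operand
      .addMBB(Succ);    // basic-block operand
}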
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
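A hedged sketch combining the Flags above with getMachineMemOperand: describe a dereferenceable 4-byte load from a frame index. The helper name is illustrative, and FI is assumed to be a valid frame index in MF.

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;

// Hedged sketch: flags may be or'd together, and the LLT overload from the
// signature above describes the in-memory type of the access.
static MachineMemOperand *makeStackLoadMMO(MachineFunction &MF, int FI) {
  return MF.getMachineMemOperand(
      MachinePointerInfo::getFixedStack(MF, FI),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable,
      LLT::scalar(32), Align(4));
}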
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
Get the SDNode which holds the desired result.
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
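A hedged sketch using the SDValue accessors above to pattern-match an integer add with a constant right-hand side; the function name is illustrative.

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Hedged sketch: true when Op is (add x, C), reporting C through Imm.
static bool matchAddWithConstRHS(SDValue Op, uint64_t &Imm) {
  if (Op.getOpcode() != ISD::ADD || !isa<ConstantSDNode>(Op.getOperand(1)))
    return false;
  Imm = Op.getConstantOperandVal(1);  // zero-extended constant value
  return true;
}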
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the given node can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0-terminated array of rounding control registers that can be attached to a strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
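A hedged sketch of getSetCC together with getSetCCResultType, as a lowering helper might use them; the "is negative" predicate and the function name are examples only.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hedged sketch: build (setcc X, 0, setlt) with the target's boolean type.
static SDValue emitIsNegative(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  EVT VT = X.getValueType();
  EVT CCVT = DAG.getTargetLoweringInfo().getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  return DAG.getSetCC(DL, CCVT, X, Zero, ISD::SETLT);
}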
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
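A hedged sketch of getLoad; the address space parameter and 4-byte alignment are illustrative, and Chain/Ptr are assumed to come from the surrounding lowering code.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hedged sketch: load an i32 through Ptr, threading the given Chain.
static SDValue loadI32(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
                       SDValue Ptr, unsigned AddrSpace) {
  return DAG.getLoad(MVT::i32, DL, Chain, Ptr,
                     MachinePointerInfo(AddrSpace), Align(4));
}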
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
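A hedged sketch of the common pattern behind the split*VectorOp helpers declared earlier: split the operand, redo the operation on each half, and concatenate. This mirrors, but does not claim to be, the in-tree implementation.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hedged sketch: apply a unary vector opcode to each half of its operand.
static SDValue splitUnaryVectorOpSketch(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
  SDValue OpLo = DAG.getNode(Op.getOpcode(), DL, Lo.getValueType(), Lo);
  SDValue OpHi = DAG.getNode(Op.getOpcode(), DL, Hi.getValueType(), Hi);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, Op.getValueType(), OpLo, OpHi);
}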
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
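A hedged sketch of the known-bits queries above; the "low bit known zero" predicate and the function name are just examples.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hedged sketch: is Op known to be even (low bit zero) per value tracking?
// Equivalent via computeKnownBits: DAG.computeKnownBits(Op).Zero[0].
static bool isKnownEven(const SelectionDAG &DAG, SDValue Op) {
  unsigned BitWidth = Op.getScalarValueSizeInBits();
  return DAG.MaskedValueIsZero(Op, APInt::getLowBitsSet(BitWidth, 1));
}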
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
StringRef - Represent a constant reference to a string, i.e.
constexpr size_t size() const
size - Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
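A hedged sketch of StringSwitch, the kind of name-to-value mapping a getRegisterByName-style hook uses; the names and numeric values here are illustrative only.

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

// Hedged sketch: map a textual register name to an illustrative ID.
static unsigned lookupNamedReg(StringRef Name) {
  return StringSwitch<unsigned>(Name)
      .Case("sp", 1)
      .Case("fp", 2)
      .Default(0);   // unknown names fall through to 0
}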
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparisons with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isVoidTy() const
Return true if this is 'void'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
constexpr bool isZero() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ TC_RETURN_GFX_WholeWave
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ SSUBO
Same for subtraction.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest, ties to even, 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ SMULO
RESULT, BOOL = [SU]MULO(LHS, RHS) - Overflow-aware nodes for multiplication.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
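A minimal sketch of the GlobalISel matcher, assuming a register and MachineRegisterInfo supplied by surrounding combiner code (the helper name is hypothetical):
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include <optional>
using namespace llvm;
using namespace llvm::MIPatternMatch;

// Returns true if Reg is defined as (G_SHL Src, <constant>), binding Src and
// the constant shift amount on success.
static bool isShiftByConstant(Register Reg, const MachineRegisterInfo &MRI,
                              Register &Src,
                              std::optional<ValueAndVReg> &Amt) {
  return mi_match(Reg, MRI, m_GShl(m_Reg(Src), m_GCst(Amt)));
}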
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
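A minimal sketch of the SelectionDAG matcher, assuming a node handed in by a DAG combine (the helper name is hypothetical):
#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
using namespace llvm::SDPatternMatch;

// Returns true if N is (add X, (shl Y, Z)), binding the three operands.
static bool isAddOfShl(SDNode *N, const SelectionDAG &DAG, SDValue &X,
                       SDValue &Y, SDValue &Z) {
  return sd_match(N, &DAG, m_Add(m_Value(X), m_Shl(m_Value(Y), m_Value(Z))));
}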
@ System
Synchronized with respect to all concurrently executing threads.
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
NodeAddr< NodeBase * > Node
friend class Instruction
Iterator for Instructions in a BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
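A sketch of the builder in the style this file uses when expanding pseudo instructions; the helper name is hypothetical and assumes the caller provides the block, insertion point, and SIInstrInfo, with the AMDGPU backend headers this file already pulls in:
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;

// Emits "s_mov_b32 <new vreg>, Imm" before iterator I and returns the result.
static Register emitSMovImm(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator I, const DebugLoc &DL,
                            const SIInstrInfo *TII, MachineRegisterInfo &MRI,
                            int64_t Imm) {
  Register Dst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
  return Dst;
}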
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
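A standalone sketch of the signed/unsigned width checks (isUInt, minIntN, and maxIntN appear further down in this list):
#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::isInt<16>(-32768) && !llvm::isInt<16>(32768)); // signed range check
  assert(llvm::isUInt<12>(4095) && !llvm::isUInt<12>(4096));  // unsigned range check
  assert(llvm::minIntN(8) == -128 && llvm::maxIntN(8) == 127);
  return 0;
}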
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
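A small self-contained sketch of the isa/dyn_cast pattern on an IR constant:
#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include <cassert>

int main() {
  llvm::LLVMContext Ctx;
  llvm::Constant *C = llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 42);
  if (auto *CI = llvm::dyn_cast<llvm::ConstantInt>(C)) // null if the cast fails
    assert(CI->getZExtValue() == 42);
  assert(llvm::isa<llvm::ConstantInt>(C));
  return 0;
}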
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
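A standalone sketch of alignDown together with alignTo (listed further down):
#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::alignDown(37, 16) == 32);             // round down
  assert(llvm::alignTo(37, llvm::Align(16)) == 48);  // round up
  assert(llvm::alignDown(37, 16, /*Skew=*/4) == 36); // largest value <= 37 that is 4 mod 16
  return 0;
}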
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count the number of zero bits from the least significant bit to the most significant, stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
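A standalone sketch of the bit utilities listed just above (PowerOf2Ceil, countr_zero, isShiftedMask_64):
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::PowerOf2Ceil(17) == 32);
  assert(llvm::countr_zero(0x40u) == 6);  // six trailing zero bits
  assert(llvm::isShiftedMask_64(0x0ff0)); // one contiguous run of ones
  assert(!llvm::isShiftedMask_64(0x0f0f)); // two runs, so not a shifted mask
  return 0;
}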
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
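A standalone sketch of any_of and Log2_32:
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  int Widths[] = {8, 16, 32, 64};
  assert(llvm::any_of(Widths, [](int W) { return W > 32; }));
  assert(llvm::Log2_32(32) == 5 && llvm::Log2_32(33) == 5); // floor of log2
  return 0;
}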
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count the number of zero bits from the most significant bit to the least significant, stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
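A standalone sketch of Hi_32/Lo_32 (and isPowerOf2_32 from above), the kind of splitting used when legalizing 64-bit operations:
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  uint64_t V = 0x123456789abcdef0ULL;
  assert(llvm::Hi_32(V) == 0x12345678u);
  assert(llvm::Lo_32(V) == 0x9abcdef0u);
  assert(llvm::isPowerOf2_32(64) && !llvm::isPowerOf2_32(48));
  return 0;
}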
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
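For example, computing how many 32-bit registers a value of a given bit width needs:
#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::divideCeil(96, 32) == 3);
  assert(llvm::divideCeil(97, 32) == 4); // any remainder rounds up
  return 0;
}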
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
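A standalone sketch of find_if and is_contained:
#include "llvm/ADT/STLExtras.h"
#include <cassert>
#include <iterator>

int main() {
  unsigned Regs[] = {1, 2, 3, 4};
  auto It = llvm::find_if(Regs, [](unsigned R) { return R > 2; });
  assert(It != std::end(Regs) && *It == 3);
  assert(llvm::is_contained(Regs, 4u) && !llvm::is_contained(Regs, 7u));
  return 0;
}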
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
int popcount(T Value) noexcept
Count the number of set bits in a value.
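A standalone sketch of maskTrailingOnes and popcount:
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  assert(llvm::maskTrailingOnes<uint32_t>(5) == 0x1fu);
  assert(llvm::popcount(0xf0f0u) == 8);
  return 0;
}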
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the vector's number of elements is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
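A standalone sketch of the EVT queries listed above:
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

int main() {
  llvm::LLVMContext Ctx;
  llvm::EVT I16 = llvm::EVT::getIntegerVT(Ctx, 16);
  llvm::EVT V4I16 = llvm::EVT::getVectorVT(Ctx, I16, 4);
  assert(V4I16.isVector() && V4I16.getVectorNumElements() == 4);
  assert(V4I16.getScalarSizeInBits() == 16);
  assert(V4I16.getSizeInBits() == 64 && V4I16.getStoreSize() == 8); // bits vs. bytes
  assert(V4I16.getVectorElementType() == I16 && V4I16.isInteger());
  return 0;
}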
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
void resetAll()
Resets the known state of all bits.
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
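A standalone sketch of the KnownBits operations listed above, mirroring what the known-bits hooks in this file compute:
#include "llvm/Support/KnownBits.h"
#include <cassert>

int main() {
  llvm::KnownBits LHS(32), RHS(32);
  LHS.Zero.setHighBits(16); // both operands known to fit in 16 bits
  RHS.Zero.setHighBits(16);
  llvm::KnownBits Sum = llvm::KnownBits::add(LHS, RHS);
  assert(Sum.countMinLeadingZeros() >= 15); // a 16-bit + 16-bit sum fits in 17 bits
  llvm::KnownBits Wide = Sum.zext(64);
  assert(Wide.getBitWidth() == 64 && !Wide.isUnknown());
  return 0;
}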
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const