#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"

    cl::desc("Do not align and prefetch loops"),

    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
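// Register-class setup in the SITargetLowering constructor: each legal MVT is
// registered with the widest vector register superclass for its bit width
// (VGPRs, or the combined VGPR/AGPR class where AGPRs exist).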
                   TRI->getDefaultVectorSuperClassForBitWidth(32);
                   TRI->getDefaultVectorSuperClassForBitWidth(64);
                   TRI->getDefaultVectorSuperClassForBitWidth(320));
                   TRI->getDefaultVectorSuperClassForBitWidth(352));
                   TRI->getDefaultVectorSuperClassForBitWidth(384));
                   TRI->getDefaultVectorSuperClassForBitWidth(512));
                   TRI->getDefaultVectorSuperClassForBitWidth(1024));
  if (Subtarget->has16BitInsts()) {
    if (Subtarget->useRealTrue16Insts()) {
                     TRI->getDefaultVectorSuperClassForBitWidth(1024));
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},
                      ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
                      ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
                      ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
                      ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
                      ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},
                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
       {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
        MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
        MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
        MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
        MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
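    // For each wide vector type in the list above, essentially every generic
    // operation is marked Expand, keeping only a small allowlist
    // (load/store/build_vector/bitcast/element insert-extract and the like)
    // and custom-lowering the extract/insert-subvector and concat cases.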
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
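  // Each 64-bit-element vector type above appears to be handled by promoting
  // its build/insert/extract-style operations to the vector of twice as many
  // 32-bit lanes (v2i64 as v4i32, v3i64 as v6i32, and so on).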
                     {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
                      MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
  if (Subtarget->hasPkMovB32()) {
                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
  if (Subtarget->hasSMemRealTime() ||
  if (Subtarget->has16BitInsts()) {
  if (Subtarget->hasMadMacF32Insts())
  if (!Subtarget->hasBFI())
  if (!Subtarget->hasBCNT(32))
  if (!Subtarget->hasBCNT(64))
  if (Subtarget->hasFFBH())
  if (Subtarget->hasFFBL())
  if (Subtarget->hasBFE())
  if (Subtarget->hasIntClamp())
  if (Subtarget->hasAddNoCarry())
      {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
      {MVT::f32, MVT::f64}, Custom);
                     {MVT::f32, MVT::f64}, Legal);
  if (Subtarget->haveRoundOpsF64())
  if (Subtarget->has16BitInsts()) {
                        ISD::FSIN, ISD::FROUND},
    if (Subtarget->hasBF16TransInsts())
         {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
          MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
          MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
        {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
         MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
         MVT::v32f16, MVT::v32bf16},
        {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
        {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
        {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
         {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
          MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
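  // VOP3P (packed 16-bit) support: two-element 16-bit vectors get direct
  // handling, while the wider 16-bit vector types are split or custom-lowered
  // down to those packed pairs.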
  if (Subtarget->hasVOP3PInsts()) {
                       {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
                       {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
                        MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
                        MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
        {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
        {MVT::v2f16, MVT::v4f16}, Custom);
    if (Subtarget->hasBF16PackedInsts()) {
      for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
    if (Subtarget->hasPackedFP32Ops()) {
                         {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
  if (Subtarget->has16BitInsts()) {
                     {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                      MVT::v32f16, MVT::v32bf16},
  if (Subtarget->hasVectorMulU64())
  else if (Subtarget->hasScalarSMulU64())
  if (Subtarget->hasMad64_32())
  if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
  if (Subtarget->hasIEEEMinimumMaximumInsts()) {
                       {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
  if (Subtarget->hasMinimum3Maximum3F32())
  if (Subtarget->hasMinimum3Maximum3PKF16()) {
    if (!Subtarget->hasMinimum3Maximum3F16())
  if (Subtarget->hasVOP3PInsts()) {
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
  if (Subtarget->hasIntMinMax64())
                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
                     {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
                      MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
                      MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
                      MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
                      MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
  if (Subtarget->hasBF16ConversionInsts()) {
  if (Subtarget->hasBF16PackedInsts()) {
  if (Subtarget->hasBF16TransInsts()) {
  if (Subtarget->hasCvtPkF16F32Inst()) {
                       {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
  if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
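  // The atomic opcodes below are part of the node list registered for target
  // DAG combining (setTargetDAGCombine), so the AMDGPU combines also get a
  // chance to inspect atomic operations.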
                       ISD::ATOMIC_CMP_SWAP,
                       ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
                       ISD::ATOMIC_LOAD_ADD,
                       ISD::ATOMIC_LOAD_SUB,
                       ISD::ATOMIC_LOAD_AND,
                       ISD::ATOMIC_LOAD_OR,
                       ISD::ATOMIC_LOAD_XOR,
                       ISD::ATOMIC_LOAD_NAND,
                       ISD::ATOMIC_LOAD_MIN,
                       ISD::ATOMIC_LOAD_MAX,
                       ISD::ATOMIC_LOAD_UMIN,
                       ISD::ATOMIC_LOAD_UMAX,
                       ISD::ATOMIC_LOAD_FADD,
                       ISD::ATOMIC_LOAD_FMIN,
                       ISD::ATOMIC_LOAD_FMAX,
                       ISD::ATOMIC_LOAD_UINC_WRAP,
                       ISD::ATOMIC_LOAD_UDEC_WRAP,
                       ISD::ATOMIC_LOAD_USUB_COND,
                       ISD::ATOMIC_LOAD_USUB_SAT,

  static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
                                         EVT DestVT, EVT SrcVT) const {
          ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
             (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
           (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
                                         LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
         SrcTy.getScalarSizeInBits() == 16 &&
  if (Subtarget->has16BitInsts()) {
      return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
    return VT.isInteger() ? MVT::i32 : MVT::f32;
  return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
    if (Size == 16 && Subtarget->has16BitInsts())
      return (NumElts + 1) / 2;
    return NumElts * ((Size + 31) / 32);
                                      unsigned &NumIntermediates, MVT &RegisterVT) const {
    if (Size == 16 && Subtarget->has16BitInsts()) {
      if (ScalarVT == MVT::bf16) {
        RegisterVT = MVT::i32;
        IntermediateVT = MVT::v2bf16;
        RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
        IntermediateVT = RegisterVT;
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts * ((Size + 31) / 32);
      return NumIntermediates;
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
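// For image/buffer load intrinsics the reported memory VT is narrowed to the
// number of lanes actually used: min(MaxNumLanes, NumElements) elements of the
// result vector type.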
                                 unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
                                  unsigned MaxNumLanes) {
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));
    return MVT::amdgpuBufferFatPointer;
      DL.getPointerSizeInBits(AS) == 192)
    return MVT::amdgpuBufferStridedPointer;
          DL.getPointerSizeInBits(AS) == 160) ||
         DL.getPointerSizeInBits(AS) == 192))
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
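// The cases above group the async LDS transfer and cooperative atomic
// intrinsics by access width (b8 / b32+32x4B / b64+16x8B / b128+8x16B), which
// appears to feed the memory-width classification used by the lowering below.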
                                          unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))
  if (RsrcIntr->IsImage) {
    Info.ptrVal = RsrcArg;
  bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
    if (RsrcIntr->IsImage) {
      unsigned MaxNumLanes = 4;
          std::numeric_limits<unsigned>::max());
    if (RsrcIntr->IsImage) {
    if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
      Info.memVT = MVT::i32;
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
        std::numeric_limits<unsigned>::max());
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
    Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
    Info.memVT = MVT::i64;
  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
        MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
                       ->getElementType(0));
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64: {
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.memVT = MVT::i32;
    Info.align = Align(4);
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds: {
  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
    Info.memVT = MVT::i32;
    Info.align = Align(4);
  case Intrinsic::amdgcn_s_prefetch_data:
  case Intrinsic::amdgcn_flat_prefetch:
  case Intrinsic::amdgcn_global_prefetch: {
  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
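// getAddrModeArguments: report the pointer operand (and its pointee type) of
// the AMDGPU memory intrinsics listed below so that generic addressing-mode
// optimizations such as LSR can reason about and fold offsets into them.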
                                            Type *&AccessTy) const {
  Value *Ptr = nullptr;
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
    Ptr = II->getArgOperand(0);
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds:
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
    Ptr = II->getArgOperand(1);
  AccessTy = II->getType();
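// FLAT addressing legality: no scaled indices are supported, and an immediate
// base offset is only legal when the subtarget's flat instructions accept an
// offset for the given address space and flat variant.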
                                                   unsigned AddrSpace) const {
  if (!Subtarget->hasFlatInstOffsets()) {
  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
                                  AM.BaseOffs, AddrSpace, FlatVariant));
  if (Subtarget->hasFlatGlobalInsts())
  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
    return isLegalMUBUFAddressingMode(AM);
bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
  if (AM.HasBaseReg) {
    return isLegalMUBUFAddressingMode(AM);
  if (!Subtarget->hasScalarSubwordLoads()) {
    if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
  return Subtarget->enableFlatScratch()
             : isLegalMUBUFAddressingMode(AM);
    unsigned Size, unsigned AddrSpace, Align Alignment,
    if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
    Align RequiredAlignment(
    if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
        Alignment < RequiredAlignment)
      if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
      RequiredAlignment = Align(4);
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 64
                    : (Alignment < Align(4))         ? 32
      if (!Subtarget->hasDS96AndDS128())
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 96
                    : (Alignment < Align(4))         ? 32
      if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
      RequiredAlignment = Align(8);
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 128
                    : (Alignment < Align(4))         ? 32
      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
    return Alignment >= RequiredAlignment ||
           Subtarget->hasUnalignedDSAccessEnabled();
    bool AlignedBy4 = Alignment >= Align(4);
    if (Subtarget->hasUnalignedScratchAccessEnabled()) {
        *IsFast = AlignedBy4 ? Size : 1;
      *IsFast = AlignedBy4;
    return Alignment >= Align(4) ||
           Subtarget->hasUnalignedBufferAccessEnabled();
  if (!Subtarget->hasRelaxedBufferOOBMode() &&
  return Size >= 32 && Alignment >= Align(4);
                                                    unsigned *IsFast) const {
                                            Alignment, Flags, IsFast);
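// getOptimalMemOpType: memcpy/memset expansion prefers 16-byte chunks when the
// size allows it, falling back to 8-byte chunks when the destination is at
// least 4-byte aligned, and otherwise leaves the choice to the generic code.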
    const AttributeList &FuncAttributes) const {
  if (Op.size() >= 16 &&
  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
                                                  unsigned DestAS) const {
      Subtarget->hasGloballyAddressableScratch()) {
                                                       unsigned Index) const {
  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
  auto [InputPtrReg, RC, ArgTy] =
      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
                                         const SDLoc &SL) const {
                                         const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())
    Val = getFPExtOrFPRound(DAG, Val, SL, VT);
SDValue SITargetLowering::lowerKernargMemParameter(
  MachinePointerInfo PtrInfo =
    int64_t OffsetDiff = Offset - AlignDownOffset;
    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
                                              const SDLoc &SL) const {
  return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
      ExtType, SL, VA.getLocVT(), Chain, FIN,
  SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
  if (ConvertedVal == ArgValue)
    return ConvertedVal;
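// lowerWorkGroupId: without cluster support this is simply the preloaded
// workgroup-ID value; with clusters the global workgroup ID appears to be
// reconstructed from the cluster ID, the per-cluster maximum workgroup ID, and
// the workgroup's ID within its cluster (read via S_GETREG when it is not
// preloaded).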
SDValue SITargetLowering::lowerWorkGroupId(
  if (!Subtarget->hasClusters())
    return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
  SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
  SDLoc SL(ClusterIdXYZ);
  SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
  SDValue ClusterWorkGroupIdXYZ =
      getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
    return ClusterIdXYZ;
  using namespace AMDGPU::Hwreg;
      DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
SDValue SITargetLowering::getPreloadedValue(
  const ArgDescriptor *Reg = nullptr;
  const TargetRegisterClass *RC;
  const ArgDescriptor WorkGroupIDX =
  const ArgDescriptor WorkGroupIDZ =
  const ArgDescriptor ClusterWorkGroupIDX =
  const ArgDescriptor ClusterWorkGroupIDY =
  const ArgDescriptor ClusterWorkGroupIDZ =
  const ArgDescriptor ClusterWorkGroupMaxIDX =
  const ArgDescriptor ClusterWorkGroupMaxIDY =
  const ArgDescriptor ClusterWorkGroupMaxIDZ =
  const ArgDescriptor ClusterWorkGroupMaxFlatID =
  auto LoadConstant = [&](unsigned N) {
  if (Subtarget->hasArchitectedSGPRs() &&
      Reg = &WorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &WorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &WorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;
      if (HasFixedDims && ClusterDims.getDims()[0] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;
      if (HasFixedDims && ClusterDims.getDims()[1] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;
      if (HasFixedDims && ClusterDims.getDims()[2] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;
        return LoadConstant(ClusterDims.getDims()[0] - 1);
      Reg = &ClusterWorkGroupMaxIDX;
      RC = &AMDGPU::SReg_32RegClass;
        return LoadConstant(ClusterDims.getDims()[1] - 1);
      Reg = &ClusterWorkGroupMaxIDY;
      RC = &AMDGPU::SReg_32RegClass;
        return LoadConstant(ClusterDims.getDims()[2] - 1);
      Reg = &ClusterWorkGroupMaxIDZ;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &ClusterWorkGroupMaxFlatID;
      RC = &AMDGPU::SReg_32RegClass;
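// Pixel-shader input allocation: walk the incoming arguments, skip inputs that
// are neither used nor already force-enabled, and record which PS input slots
// are allocated/enabled so the hardware input configuration matches what the
// shader actually reads.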
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
           "vector type argument should have been split");
      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
             "unexpected vector split in ps argument type");
      Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);
  if (Info.hasWorkItemIDX()) {
        (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
  if (Info.hasWorkItemIDY()) {
    assert(Info.hasWorkItemIDX());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDY(
      unsigned Reg = AMDGPU::VGPR1;
  if (Info.hasWorkItemIDZ()) {
    assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDZ(
      unsigned Reg = AMDGPU::VGPR2;
  if (RegIdx == ArgVGPRs.size()) {
  unsigned Reg = ArgVGPRs[RegIdx];
                             unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())
  unsigned Reg = ArgSGPRs[RegIdx];
  const unsigned Mask = 0x3ff;
  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);
  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);
  if (Info.hasWorkItemIDZ())
  const unsigned Mask = 0x3ff;
  auto &ArgInfo = Info.getArgInfo();
  if (Info.hasImplicitArgPtr())
  if (Info.hasWorkGroupIDX())
  if (Info.hasWorkGroupIDY())
  if (Info.hasWorkGroupIDZ())
  if (Info.hasLDSKernelId())
    Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
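// Kernarg preloading: walk the formal arguments marked inreg and, while the
// preload sequence is unbroken, assign them user SGPRs. Hidden (implicit)
// arguments first realign the running offset to the implicit-arg pointer
// alignment, and sub-dword arguments reuse the previous argument's SGPR.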
  unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
  bool InPreloadSequence = true;
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())
    unsigned ArgIdx = Arg.getArgNo();
    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
                      Subtarget->getAlignmentForImplicitArgPtr()) -
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        ArgOffset += ImplicitArgOffset;
      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
        InPreloadSequence = false;
          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {
      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
  if (Info.hasLDSKernelId()) {
    Register Reg = Info.addLDSKernelId();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
                                           bool IsShader) const {
  bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
  if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      Register Reg = Info.addReservedUserSGPR();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      Register Reg = Info.addWorkGroupIDX();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    if (Info.hasWorkGroupIDY()) {
      Register Reg = Info.addWorkGroupIDY();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    if (Info.hasWorkGroupIDZ()) {
      Register Reg = Info.addWorkGroupIDZ();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
  if (Info.hasWorkGroupInfo()) {
    Register Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
  assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
         Info.getNumPreloadedSGPRs() >= 16);
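// Scratch setup: when flat scratch is not in use, either claim the preloaded
// private segment buffer or reserve a buffer register as the scratch resource
// descriptor, then pick a stack pointer SGPR (defaulting to SGPR32) that is
// not already live-in, and SGPR33 as the frame pointer when one is needed.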
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);
    HasStackObjects = true;
  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      Info.setScratchRSrcReg(ReservedBufferReg);
    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);
      if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;
    Entry->addLiveIn(*I);
  for (auto *Exit : Exits)
            TII->get(TargetOpcode::COPY), *I)
  bool IsError = false;
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
          !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
          !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
  if (!Subtarget->enableFlatScratch())
      !Subtarget->hasArchitectedSGPRs())
    assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
           !Info->hasWorkGroupIDZ());
  bool IsWholeWaveFunc = Info->isWholeWaveFunction();
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    if (Subtarget->isAmdPalOS()) {
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
  } else if (IsKernel) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
    if (IsKernel && Subtarget->hasKernargPreload())
  } else if (!IsGraphics) {
    if (!Subtarget->enableFlatScratch())
    Info->setNumWaveDispatchSGPRs(
    Info->setNumWaveDispatchVGPRs(
  } else if (Info->getNumKernargPreloadedSGPRs()) {
    Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
  if (IsWholeWaveFunc) {
                                {MVT::i1, MVT::Other}, Chain);
  for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
    if (IsEntryFunc && VA.isMemLoc()) {
      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
          int64_t OffsetDiff = Offset - AlignDownOffset;
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
            ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
                                    TRI->getRegSizeInBits(*RC)));
            for (auto Reg : PreloadRegs) {
                                         PreloadRegs.size()),
          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);
            "hidden argument in kernel signature was not preloaded",
            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
      if (AMDGPU::VGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::VGPR_32RegClass;
      else if (AMDGPU::SGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::SGPR_32RegClass;
    Val = convertABITypeToValueType(DAG, Val, VA, DL);
    auto &ArgUsageInfo =
  } else if (auto *MFAM = DAG.getMFAM()) {
    auto *ArgUsageInfo =
            .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
      ArgUsageInfo->setFuncArgInfo(Fn, Info->getArgInfo());
  Info->setBytesInStackArgArea(StackArgSize);
  return Chains.empty() ? Chain
                                      const Type *RetTy) const {
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
  unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
  unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
    if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
  Info->setIfReturnsVoid(Outs.empty());
  bool IsWaveEnd = Info->returnsVoid() && IsShader;
  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    SDValue Arg = OutVals[RealRVLocIdx];
                        ReadFirstLane, Arg);
  if (!Info->isEntryFunction()) {
      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))
  unsigned Opc = AMDGPUISD::ENDPGM;
    Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
          : IsShader                  ? AMDGPUISD::RETURN_TO_EPILOG
                                      : AMDGPUISD::RET_GLUE;
    auto &ArgUsageInfo =
        &ArgUsageInfo.getArgUsageInfo().lookupFuncArgInfo(*CalleeFunc);
  } else if (auto *MFAM = DAG.getMFAM()) {
    auto *ArgUsageInfo =
      CalleeArgInfo = &ArgUsageInfo->lookupFuncArgInfo(*CalleeFunc);
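// passSpecialInputs: for each implicit input the callee needs (workgroup IDs,
// dispatch/queue pointers, workitem IDs, ...), look up where the caller holds
// the value and either copy it into the callee's expected register or store it
// to the reserved outgoing-argument stack slot.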
    const auto [OutgoingArg, ArgRC, ArgTy] =
    const auto [IncomingArg, IncomingArgRC, Ty] =
    assert(IncomingArgRC == ArgRC);
    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
      InputReg = getImplicitArgPtr(DAG, DL);
      std::optional<uint32_t> Id =
      if (Id.has_value()) {
    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      unsigned SpecialArgOffset =
  auto [OutgoingArg, ArgRC, Ty] =
    std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =
  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
  if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
      NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
      NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
                         : IncomingArgY ? *IncomingArgY
  if (OutgoingArg->isRegister()) {
    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
  if (Callee->isDivergent())
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CallerPreserved)
  bool CCMatch = CallerCC == CalleeCC;
    if (Arg.hasByValAttr())
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
    CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
    for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
      if (!CCVA.isRegLoc())
      if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
        dbgs() << "Cannot tail call due to divergent outgoing argument in "
enum ChainCallArgIdx {
  bool UsesDynamicVGPRs = false;
  if (IsChainCallConv) {
    auto RequestedExecIt =
          return Arg.OrigArgIndex == 2;
    assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
    size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
    CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
           "Haven't popped all the special args");
        CLI.Args[ChainCallArgIdx::Exec];
    if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
            ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
        ChainCallSpecialArgs.push_back(Arg.Node);
    PushNodeOrTargetConstant(RequestedExecArg);
      if (FlagsValue.isZero()) {
        if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
                             "no additional args allowed if flags == 0");
        if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
        if (!Subtarget->isWave32()) {
              CLI, InVals, "dynamic VGPR mode is only supported for wave32");
        UsesDynamicVGPRs = true;
        std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
                      CLI.Args.end(), PushNodeOrTargetConstant);
  bool IsSibCall = false;
                        "unsupported call to variadic function ");
          "unsupported required tail call to function ");
        Outs, OutVals, Ins, DAG);
           "site marked musttail or on llvm.amdgcn.cs.chain");
  if (!TailCallOpt && IsTailCall)
  auto *TRI = Subtarget->getRegisterInfo();
  if (!IsSibCall || IsChainCallConv) {
    if (!Subtarget->enableFlatScratch()) {
      RegsToPass.emplace_back(IsChainCallConv
                                  ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                  : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
  const unsigned NumSpecialInputs = RegsToPass.size();
  MVT PtrVT = MVT::i32;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
      int32_t Offset = LocMemOffset;
      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
                              ? Flags.getNonZeroByValAlign()
      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
            Outs[i].Flags.getNonZeroByValAlign(),
            nullptr, std::nullopt, DstInfo,
            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
  if (!MemOpChains.empty())
    TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
  if (IsTailCall && !IsSibCall) {
  std::vector<SDValue> Ops({Chain});
    Ops.push_back(Callee);
    Ops.push_back(Callee);
    if (IsChainCallConv)
  for (auto &[Reg, Val] : RegsToPass)
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
                                      MVT::Glue, GlueOps),
    Ops.push_back(InGlue);
    unsigned OPC = AMDGPUISD::TC_RETURN;
      OPC = AMDGPUISD::TC_RETURN_GFX;
      OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
                             : AMDGPUISD::TC_RETURN_CHAIN;
    if (Info->isWholeWaveFunction())
      OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);
  uint64_t CalleePopBytes = NumBytes;
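// LowerDYNAMIC_STACKALLOC: the scratch stack pointer is swizzled per lane, so
// both the requested size and any over-alignment are scaled by the wavefront
// size before the SP is bumped. Roughly (a sketch, not the exact node order):
//   ScaledSize = Size << WavefrontSizeLog2;
//   NewSP      = alignTo(BaseAddr, Alignment << WavefrontSizeLog2) + ScaledSize;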
  EVT VT = Op.getValueType();
         "Stack grows upwards for AMDGPU");
  Chain = BaseAddr.getValue(1);
  if (Alignment > StackAlign) {
        << Subtarget->getWavefrontSizeLog2();
    uint64_t StackAlignMask = ScaledAlignment - 1;
  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
  if (Op.getValueType() != MVT::i32)
  assert(Op.getValueType() == MVT::i32);
                              Op.getOperand(0), IntrinID, GetRoundBothImm);
  SDValue RoundModeTimesNumBits =
                                  TableEntry, EnumOffset);
        static_cast<uint32_t>(ConstMode->getZExtValue()),
  if (UseReducedTable) {
    SDValue RoundModeTimesNumBits =
    SDValue RoundModeTimesNumBits =
    NewMode = TruncTable;
                               ReadFirstLaneID, NewMode);
                        IntrinID, RoundBothImm, NewMode);
  if (Op->isDivergent() &&
      (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
  if (Subtarget->hasSafeSmemPrefetch())
  if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();
    return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
  if (Op.getValueType() != MVT::i64)
                            Op.getOperand(0), IntrinID, ModeHwRegImm);
                            Op.getOperand(0), IntrinID, TrapHwRegImm);
  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
  if (Op.getOperand(1).getValueType() != MVT::i64)
                                ReadFirstLaneID, NewModeReg);
                                ReadFirstLaneID, NewTrapReg);
  unsigned ModeHwReg =
  unsigned TrapHwReg =
                          IntrinID, ModeHwRegImm, NewModeReg);
                          IntrinID, TrapHwRegImm, NewTrapReg);
                        .Case("m0", AMDGPU::M0)
                        .Case("exec", AMDGPU::EXEC)
                        .Case("exec_lo", AMDGPU::EXEC_LO)
                        .Case("exec_hi", AMDGPU::EXEC_HI)
                        .Case("flat_scratch", AMDGPU::FLAT_SCR)
                        .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
                        .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
  if (!Subtarget->hasFlatScrRegister() &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
                               "\" for subtarget."));
  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
  case AMDGPU::FLAT_SCR:
  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
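// splitBlockForLoop: split the block at the given instruction into a loop
// block and a remainder block and wire up their successors; used by the
// indirect-indexing and GWS lowering below.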
static std::pair<MachineBasicBlock *, MachineBasicBlock *>
  auto Next = std::next(I);
  MBB.addSuccessor(LoopBB);
  return std::pair(LoopBB, RemainderBB);
  auto I = MI.getIterator();
  auto E = std::next(I);
  Src->setIsKill(false);
  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
                           unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
                           unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
      MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);
  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
  MRI.setSimpleHint(NewExec, CondReg);
  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;
      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
                                      unsigned InitResultReg, unsigned PhiReg, int Offset,
                                      bool UseGPRIdxMode, Register &SGPRIdxReg) {
  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);
  LoopBB->removeSuccessor(RemainderBB);
  LoopBB->addSuccessor(LandingPad);
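// The code above builds the classic waterfall loop: read one lane's index with
// V_READFIRSTLANE, compare it against every lane, restrict EXEC to the
// matching lanes, perform the indexed access, clear those lanes, and repeat
// until EXEC is empty. Roughly (opcodes depend on the wave size and whether
// GPR-index mode is used):
//   loop:
//     v_readfirstlane_b32 s_idx, v_idx
//     v_cmp_eq_u32        vcc, s_idx, v_idx
//     s_and_saveexec_b64  s_save, vcc
//     s_mov_b32           m0, s_idx        ; or s_set_gpr_idx_on
//     <indexed access>
//     s_xor_b64           exec, exec, s_save
//     s_cbranch_execnz    loop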
static std::pair<unsigned, int>
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
    return std::pair(AMDGPU::sub0, Offset);
  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    MI.eraseFromParent();
  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
                              UseGPRIdxMode, SGPRIdxReg);
  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
  MI.eraseFromParent();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
  if (Idx->getReg() == AMDGPU::NoRegister) {
    MI.eraseFromParent();
  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);
    MI.eraseFromParent();
  Register PhiReg = MRI.createVirtualRegister(VecRC);
                              UseGPRIdxMode, SGPRIdxReg);
  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
  MI.eraseFromParent();
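// Expansion of the scalar 64-bit add/sub pseudos: use S_ADD_U64 / S_SUB_U64
// directly when the subtarget has them, otherwise split the operands into
// 32-bit halves and emit a low add/sub that produces carry (S_ADD_U32 /
// S_SUB_U32) followed by a high add/sub that consumes it (S_ADDC_U32 /
// S_SUBB_U32).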
  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
  if (ST.hasScalarAddSub64()) {
    unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
  MI.eraseFromParent();
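// Identity elements used to seed the wave-reduction accumulator: min
// reductions start from the type's maximum, max reductions from the minimum,
// add/sub/or/xor from zero, and AND from all ones, in both the 32-bit and
// 64-bit variants below.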
  case AMDGPU::S_MIN_U32:
    return std::numeric_limits<uint32_t>::max();
  case AMDGPU::S_MIN_I32:
    return std::numeric_limits<int32_t>::max();
  case AMDGPU::S_MAX_U32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_MAX_I32:
    return std::numeric_limits<int32_t>::min();
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_SUB_F32_e64:
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_AND_B32:
    return std::numeric_limits<uint32_t>::max();
  case AMDGPU::V_MIN_F32_e64:
  case AMDGPU::V_MAX_F32_e64:
                   "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
  case AMDGPU::V_CMP_LT_U64_e64:
    return std::numeric_limits<uint64_t>::max();
  case AMDGPU::V_CMP_LT_I64_e64:
    return std::numeric_limits<int64_t>::max();
  case AMDGPU::V_CMP_GT_U64_e64:
    return std::numeric_limits<uint64_t>::min();
  case AMDGPU::V_CMP_GT_I64_e64:
    return std::numeric_limits<int64_t>::min();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
    return std::numeric_limits<uint64_t>::min();
  case AMDGPU::S_AND_B64:
    return std::numeric_limits<uint64_t>::max();
                   "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
  return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
         Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
         Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
         Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
         Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
         Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
         Opc == AMDGPU::V_SUB_F32_e64;
  return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
         Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64;
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
    case AMDGPU::S_MIN_U32:
    case AMDGPU::S_MIN_I32:
    case AMDGPU::V_MIN_F32_e64:
    case AMDGPU::S_MAX_U32:
    case AMDGPU::S_MAX_I32:
    case AMDGPU::V_MAX_F32_e64:
    case AMDGPU::S_AND_B32:
    case AMDGPU::S_OR_B32: {
    case AMDGPU::V_CMP_LT_U64_e64:
    case AMDGPU::V_CMP_LT_I64_e64:
    case AMDGPU::V_CMP_GT_U64_e64:
    case AMDGPU::V_CMP_GT_I64_e64:
    case AMDGPU::S_AND_B64:
    case AMDGPU::S_OR_B64: {
    case AMDGPU::S_XOR_B32:
    case AMDGPU::S_XOR_B64:
    case AMDGPU::S_ADD_I32:
    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::V_ADD_F32_e64:
    case AMDGPU::S_SUB_I32:
    case AMDGPU::S_SUB_U64_PSEUDO:
    case AMDGPU::V_SUB_F32_e64: {
      Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      bool IsWave32 = ST.isWave32();
      unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
      unsigned BitCountOpc =
          IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
      auto NewAccumulator =
      case AMDGPU::S_XOR_B32:
      case AMDGPU::S_XOR_B64: {
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            .addReg(NewAccumulator->getOperand(0).getReg())
        if (Opc == AMDGPU::S_XOR_B32) {
              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
              TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
          BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
      case AMDGPU::S_SUB_I32: {
        Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
            .addReg(NewAccumulator->getOperand(0).getReg());
      case AMDGPU::S_ADD_I32: {
            .addReg(NewAccumulator->getOperand(0).getReg());
      case AMDGPU::S_ADD_U64_PSEUDO:
      case AMDGPU::S_SUB_U64_PSEUDO: {
        Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
              .addReg(NewAccumulator->getOperand(0).getReg())
        Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
                                 : NewAccumulator->getOperand(0).getReg();
        Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
        BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
      case AMDGPU::V_ADD_F32_e64:
      case AMDGPU::V_SUB_F32_e64: {
            MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
        Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
            .addReg(NewAccumulator->getOperand(0).getReg())
        unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;
        BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
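// Divergent source: build a ComputeLoop block that starts from the identity
// value and, one active lane at a time (S_FF1 to pick the lane, V_READLANE to
// fetch its value), folds the lane's value into the accumulator, clears the
// lane's bit from the active mask with S_BITSET0, and branches back until no
// active lanes remain.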
5809 Register LoopIterator =
MRI.createVirtualRegister(WaveMaskRegClass);
5810 Register IdentityValReg =
MRI.createVirtualRegister(DstRegClass);
5811 Register AccumulatorReg =
MRI.createVirtualRegister(DstRegClass);
5812 Register ActiveBitsReg =
MRI.createVirtualRegister(WaveMaskRegClass);
5813 Register NewActiveBitsReg =
MRI.createVirtualRegister(WaveMaskRegClass);
5814 Register FF1Reg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5815 Register LaneValueReg =
MRI.createVirtualRegister(DstRegClass);
5817 bool IsWave32 = ST.isWave32();
5818 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5819 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5826 BuildMI(BB,
I,
DL,
TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5830 BuildMI(BB,
I,
DL,
TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5839 I = ComputeLoop->begin();
5841 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::PHI), AccumulatorReg)
5845 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::PHI), ActiveBitsReg)
5849 I = ComputeLoop->end();
5852 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5856 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
5862 MRI.createVirtualRegister(
MRI.getRegClass(SrcReg));
5863 Register DstVreg =
MRI.createVirtualRegister(
MRI.getRegClass(SrcReg));
5865 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_MOV_B32_e32),
5875 NewAccumulator =
BuildMI(*ComputeLoop,
I,
DL,
5876 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5885 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5887 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5888 Register LaneValReg =
MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5891 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5893 MI,
MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5895 MI,
MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5897 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
5901 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
5905 auto LaneValue =
BuildMI(*ComputeLoop,
I,
DL,
5906 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5912 case AMDGPU::S_OR_B64:
5913 case AMDGPU::S_AND_B64:
5914 case AMDGPU::S_XOR_B64: {
5917 .
addReg(LaneValue->getOperand(0).getReg())
5921 case AMDGPU::V_CMP_GT_I64_e64:
5922 case AMDGPU::V_CMP_GT_U64_e64:
5923 case AMDGPU::V_CMP_LT_I64_e64:
5924 case AMDGPU::V_CMP_LT_U64_e64: {
5925 Register LaneMaskReg =
MRI.createVirtualRegister(WaveMaskRegClass);
5927 MRI.createVirtualRegister(WaveMaskRegClass);
5930 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5931 Register AccumulatorVReg =
MRI.createVirtualRegister(VregClass);
5934 VregClass, AMDGPU::sub0, VSubRegClass);
5937 VregClass, AMDGPU::sub1, VSubRegClass);
5938 BuildMI(*ComputeLoop,
I,
DL,
TII->get(TargetOpcode::REG_SEQUENCE),
5945 .
addReg(LaneValue->getOperand(0).getReg())
5946 .
addReg(AccumulatorVReg);
5948 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5949 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5953 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5954 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5955 .addReg(LaneValue->getOperand(0).getReg())
5959 case AMDGPU::S_ADD_U64_PSEUDO:
5960 case AMDGPU::S_SUB_U64_PSEUDO: {
5963 .addReg(LaneValue->getOperand(0).getReg());
5970 unsigned BITSETOpc =
5971 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5972 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5978 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5981 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5983 .addReg(NewActiveBitsReg)
5985 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5990 MI.eraseFromParent();
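// Sketch, not part of the original file: the block above expands the
// WAVE_REDUCE_* pseudos into a scalar loop. S_FF1 finds the lowest set bit of
// the live-lane mask, V_READLANE_B32 reads that lane's value, the value is
// folded into the accumulator, S_BITSET0 clears the bit, and S_CMP_LG plus
// S_CBRANCH_SCC1 repeat until the mask is empty. A minimal host-side model of
// that bit-walking reduction, with hypothetical helper names:
#include <bit>
#include <cstdint>
#include <vector>

uint32_t reduceActiveLanes(uint64_t ExecMask,
                           const std::vector<uint32_t> &LaneValues,
                           uint32_t Identity,
                           uint32_t (*Combine)(uint32_t, uint32_t)) {
  uint32_t Acc = Identity;                      // move of the identity value
  while (ExecMask != 0) {                       // S_CMP_LG + S_CBRANCH_SCC1
    unsigned Lane = std::countr_zero(ExecMask); // S_FF1: lowest active lane
    Acc = Combine(Acc, LaneValues[Lane]);       // V_READLANE + combine op
    ExecMask &= ExecMask - 1;                   // S_BITSET0 on that bit
  }
  return Acc;                                   // readfirstlane of the result
}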
6005 switch (MI.getOpcode()) {
6006 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
6008 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
6010 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
6012 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
6014 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6016 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6018 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6020 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6022 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6024 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6026 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6028 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6030 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6032 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6034 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6036 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6038 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6040 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6042 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6044 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6046 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6048 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6050 case AMDGPU::S_UADDO_PSEUDO:
6051 case AMDGPU::S_USUBO_PSEUDO: {
6057 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO) ? AMDGPU::S_ADD_U32
6059 : AMDGPU::S_SUB_U32;
6067 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6070 MI.eraseFromParent();
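// Sketch, not the builder code above: S_UADDO_PSEUDO / S_USUBO_PSEUDO become an
// S_ADD_U32 / S_SUB_U32 plus an S_CSELECT that turns the carry or borrow in SCC
// into a 0/1 result. The carry condition itself is the usual unsigned wrap test:
#include <cstdint>
#include <utility>

std::pair<uint32_t, bool> uaddo32(uint32_t A, uint32_t B) {
  uint32_t Sum = A + B;    // S_ADD_U32
  return {Sum, Sum < A};   // carry out iff the result wrapped
}

std::pair<uint32_t, bool> usubo32(uint32_t A, uint32_t B) {
  return {A - B, A < B};   // S_SUB_U32; borrow iff A < B
}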
6073 case AMDGPU::S_ADD_U64_PSEUDO:
6074 case AMDGPU::S_SUB_U64_PSEUDO: {
6077 case AMDGPU::V_ADD_U64_PSEUDO:
6078 case AMDGPU::V_SUB_U64_PSEUDO: {
6079 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6085 if (ST.hasAddSubU64Insts()) {
6087 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6088 : AMDGPU::V_SUB_U64_e64),
6093 TII->legalizeOperands(*I);
6094 MI.eraseFromParent();
6098 if (IsAdd && ST.hasLshlAddU64Inst()) {
6104 TII->legalizeOperands(*Add);
6105 MI.eraseFromParent();
6109 const auto *CarryRC = TRI->getWaveMaskRegClass();
6111 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6112 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6114 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6115 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6119 : &AMDGPU::VReg_64RegClass;
6122 : &AMDGPU::VReg_64RegClass;
6125 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6127 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6130 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6132 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6135 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6137 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6140 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6147 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6161 TII->legalizeOperands(*LoHalf);
6162 TII->legalizeOperands(*HiHalf);
6163 MI.eraseFromParent();
6166 case AMDGPU::S_ADD_CO_PSEUDO:
6167 case AMDGPU::S_SUB_CO_PSEUDO: {
6178 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6179 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6184 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6185 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6189 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6191 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6196 if (ST.isWave64()) {
6197 if (ST.hasScalarCompareEq64()) {
6204 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6206 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6208 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6209 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6211 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6225 unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6226 ? AMDGPU::S_ADDC_U32
6227 : AMDGPU::S_SUBB_U32;
6232 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6238 MI.eraseFromParent();
6241 case AMDGPU::SI_INIT_M0: {
6244 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6247 MI.eraseFromParent();
6250 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6253 TII->get(AMDGPU::S_CMP_EQ_U32))
6258 case AMDGPU::GET_GROUPSTATICSIZE: {
6262 .add(MI.getOperand(0))
6264 MI.eraseFromParent();
6267 case AMDGPU::GET_SHADERCYCLESHILO: {
6280 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6282 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6283 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6285 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6286 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6288 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6292 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6297 .add(MI.getOperand(0))
6302 MI.eraseFromParent();
6305 case AMDGPU::SI_INDIRECT_SRC_V1:
6306 case AMDGPU::SI_INDIRECT_SRC_V2:
6307 case AMDGPU::SI_INDIRECT_SRC_V4:
6308 case AMDGPU::SI_INDIRECT_SRC_V8:
6309 case AMDGPU::SI_INDIRECT_SRC_V9:
6310 case AMDGPU::SI_INDIRECT_SRC_V10:
6311 case AMDGPU::SI_INDIRECT_SRC_V11:
6312 case AMDGPU::SI_INDIRECT_SRC_V12:
6313 case AMDGPU::SI_INDIRECT_SRC_V16:
6314 case AMDGPU::SI_INDIRECT_SRC_V32:
6316 case AMDGPU::SI_INDIRECT_DST_V1:
6317 case AMDGPU::SI_INDIRECT_DST_V2:
6318 case AMDGPU::SI_INDIRECT_DST_V4:
6319 case AMDGPU::SI_INDIRECT_DST_V8:
6320 case AMDGPU::SI_INDIRECT_DST_V9:
6321 case AMDGPU::SI_INDIRECT_DST_V10:
6322 case AMDGPU::SI_INDIRECT_DST_V11:
6323 case AMDGPU::SI_INDIRECT_DST_V12:
6324 case AMDGPU::SI_INDIRECT_DST_V16:
6325 case AMDGPU::SI_INDIRECT_DST_V32:
6327 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6328 case AMDGPU::SI_KILL_I1_PSEUDO:
6330 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6334 Register SrcCond = MI.getOperand(3).getReg();
6336 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6337 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6338 const auto *CondRC = TRI->getWaveMaskRegClass();
6339 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6343 : &AMDGPU::VReg_64RegClass;
6346 : &AMDGPU::VReg_64RegClass;
6349 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6351 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6354 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6356 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6359 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6361 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6382 MI.eraseFromParent();
6385 case AMDGPU::SI_BR_UNDEF: {
6387 .add(MI.getOperand(0));
6389 MI.eraseFromParent();
6392 case AMDGPU::ADJCALLSTACKUP:
6393 case AMDGPU::ADJCALLSTACKDOWN: {
6400 case AMDGPU::SI_CALL_ISEL: {
6401 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6404 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6410 MI.eraseFromParent();
6413 case AMDGPU::V_ADD_CO_U32_e32:
6414 case AMDGPU::V_SUB_CO_U32_e32:
6415 case AMDGPU::V_SUBREV_CO_U32_e32: {
6417 unsigned Opc = MI.getOpcode();
6419 bool NeedClampOperand = false;
6420 if (TII->pseudoToMCOpcode(Opc) == -1) {
6422 NeedClampOperand = true;
6426 if (TII->isVOP3(*I)) {
6429 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6430 if (NeedClampOperand)
6433 TII->legalizeOperands(*I);
6435 MI.eraseFromParent();
6438 case AMDGPU::V_ADDC_U32_e32:
6439 case AMDGPU::V_SUBB_U32_e32:
6440 case AMDGPU::V_SUBBREV_U32_e32:
6443 TII->legalizeOperands(MI);
6445 case AMDGPU::DS_GWS_INIT:
6446 case AMDGPU::DS_GWS_SEMA_BR:
6447 case AMDGPU::DS_GWS_BARRIER:
6448 case AMDGPU::DS_GWS_SEMA_V:
6449 case AMDGPU::DS_GWS_SEMA_P:
6450 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6458 case AMDGPU::S_SETREG_B32: {
6474 const unsigned SetMask = WidthMask << Offset;
6477 unsigned SetDenormOp = 0;
6478 unsigned SetRoundOp = 0;
6486 SetRoundOp = AMDGPU::S_ROUND_MODE;
6487 SetDenormOp = AMDGPU::S_DENORM_MODE;
6489 SetRoundOp = AMDGPU::S_ROUND_MODE;
6491 SetDenormOp = AMDGPU::S_DENORM_MODE;
6494 if (SetRoundOp || SetDenormOp) {
6496 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6497 unsigned ImmVal = Def->getOperand(1).getImm();
6511 MI.eraseFromParent();
6520 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
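// Sketch, with an assumed field layout: the S_SETREG_B32 case peels off writes
// to the FP round/denorm bits of the MODE register. The (offset, width) pair
// selects a bitfield; if the written value is a known immediate, the field is
// extracted and re-emitted as S_ROUND_MODE / S_DENORM_MODE, otherwise the
// instruction is kept as S_SETREG_B32_mode. The bitfield selection, modelled:
#include <cstdint>

uint32_t extractSetRegField(uint32_t ImmVal, unsigned Offset, unsigned Width) {
  uint32_t WidthMask = (Width == 32) ? ~0u : ((1u << Width) - 1);
  uint32_t SetMask = WidthMask << Offset;  // bits the setreg write touches
  return (ImmVal & SetMask) >> Offset;     // value landing in that field
}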
6524 case AMDGPU::S_INVERSE_BALLOT_U32:
6525 case AMDGPU::S_INVERSE_BALLOT_U64:
6528 MI.setDesc(TII->get(AMDGPU::COPY));
6530 case AMDGPU::ENDPGM_TRAP: {
6532 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6552 MI.eraseFromParent();
6555 case AMDGPU::SIMULATED_TRAP: {
6556 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6558 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6559 MI.eraseFromParent();
6562 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6563 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6569 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6570 Register OriginalExec = Setup->getOperand(0).getReg();
6572 MI.getOperand(0).setReg(OriginalExec);
6609 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6613 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6640 if (!Subtarget->hasMadMacF32Insts())
6641 return Subtarget->hasFastFMAF32();
6647 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6650 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6666 switch (Ty.getScalarSizeInBits()) {
6684 if (Ty.getScalarSizeInBits() == 16)
6686 if (Ty.getScalarSizeInBits() == 32)
6687 return Subtarget->hasMadMacF32Insts() &&
6697 EVT VT = N->getValueType(0);
6699 return Subtarget->hasMadMacF32Insts() &&
6701 if (VT == MVT::f16) {
6702 return Subtarget->hasMadF16() &&
6717 unsigned Opc = Op.getOpcode();
6718 EVT VT = Op.getValueType();
6719 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6720 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6721 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6722 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6723 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6724 VT == MVT::v32bf16);
6740 [[maybe_unused]] EVT VT = Op.getValueType();
6742 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6743 VT == MVT::v16i32) &&
6744 "Unexpected ValueType.");
6753 unsigned Opc = Op.getOpcode();
6754 EVT VT = Op.getValueType();
6755 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6756 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6757 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6758 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6759 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6760 VT == MVT::v32bf16);
6768 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6770 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6777 unsigned Opc = Op.getOpcode();
6778 EVT VT = Op.getValueType();
6779 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6780 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6781 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6782 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6783 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6784 VT == MVT::v32bf16);
6789 : std::pair(Op0, Op0);
6798 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6800 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
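// Sketch, hypothetical helper name: splitBinaryVectorOp / splitTernaryVectorOp
// handle wide vector types that have no single instruction by splitting the
// operands into low and high halves, applying the same opcode to each half,
// and concatenating the results. The same shape on plain arrays:
#include <array>
#include <cstddef>

template <typename T, std::size_t N, typename Op>
std::array<T, N> splitBinaryOp(const std::array<T, N> &A,
                               const std::array<T, N> &B, Op Combine) {
  static_assert(N % 2 == 0, "expects an evenly splittable vector");
  std::array<T, N> Result{};
  for (std::size_t I = 0; I < N / 2; ++I)  // low half
    Result[I] = Combine(A[I], B[I]);
  for (std::size_t I = N / 2; I < N; ++I)  // high half
    Result[I] = Combine(A[I], B[I]);
  return Result;                           // concatenation of both halves
}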
6806 switch (Op.getOpcode()) {
6810 return LowerBRCOND(Op, DAG);
6812 return LowerRETURNADDR(Op, DAG);
6815 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6816 "Load should return a value and a chain");
6820 EVT VT = Op.getValueType();
6822 return lowerFSQRTF32(Op, DAG);
6824 return lowerFSQRTF64(Op, DAG);
6829 return LowerTrig(Op, DAG);
6831 return LowerSELECT(Op, DAG);
6833 return LowerFDIV(Op, DAG);
6835 return LowerFFREXP(Op, DAG);
6836 case ISD::ATOMIC_CMP_SWAP:
6837 return LowerATOMIC_CMP_SWAP(Op, DAG);
6839 return LowerSTORE(Op, DAG);
6843 return LowerGlobalAddress(MFI, Op, DAG);
6846 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6848 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6850 return LowerINTRINSIC_VOID(Op, DAG);
6851 case ISD::ADDRSPACECAST:
6852 return lowerADDRSPACECAST(Op, DAG);
6854 return lowerINSERT_SUBVECTOR(Op, DAG);
6856 return lowerINSERT_VECTOR_ELT(Op, DAG);
6858 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6860 return lowerVECTOR_SHUFFLE(Op, DAG);
6862 return lowerSCALAR_TO_VECTOR(Op, DAG);
6864 return lowerBUILD_VECTOR(Op, DAG);
6867 return lowerFP_ROUND(Op, DAG);
6869 return lowerTRAP(Op, DAG);
6870 case ISD::DEBUGTRAP:
6871 return lowerDEBUGTRAP(Op, DAG);
6880 return lowerFMINNUM_FMAXNUM(Op, DAG);
6881 case ISD::FMINIMUMNUM:
6882 case ISD::FMAXIMUMNUM:
6883 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6886 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6889 return lowerFLDEXP(Op, DAG);
6895 Op.getValueType() == MVT::i16 &&
6896 Op.getOperand(0).getValueType() == MVT::f32) {
6912 case ISD::FMINNUM_IEEE:
6913 case ISD::FMAXNUM_IEEE:
6920 return lowerFCOPYSIGN(Op, DAG);
6922 return lowerMUL(Op, DAG);
6925 return lowerXMULO(Op, DAG);
6928 return lowerXMUL_LOHI(Op, DAG);
6929 case ISD::DYNAMIC_STACKALLOC:
6931 case ISD::STACKSAVE:
6935 case ISD::SET_ROUNDING:
6939 case ISD::FP_EXTEND:
6942 case ISD::GET_FPENV:
6944 case ISD::SET_FPENV:
6963 EVT FittingLoadVT = LoadVT;
6988 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6992 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6995 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6998 bool IsIntrinsic) const {
7001 bool Unpacked = Subtarget->hasUnpackedD16VMem();
7002 EVT LoadVT = M->getValueType(0);
7004 EVT EquivLoadVT = LoadVT;
7018 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
7022 M->getMemoryVT(), M->getMemOperand());
7033 EVT LoadVT = M->getValueType(0);
7039 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
7040 bool IsTFE = M->getNumValues() == 3;
7042 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7043 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7044 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7045 : AMDGPUISD::BUFFER_LOAD;
7048 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG,
Ops);
7053 return handleByteShortBufferLoads(DAG, LoadVT,
DL,
Ops,
M->getMemOperand(),
7057 return getMemIntrinsicNode(
Opc,
DL,
M->getVTList(),
Ops, IntVT,
7058 M->getMemOperand(), DAG);
7062 SDVTList VTList = DAG.
getVTList(CastVT, MVT::Other);
7064 M->getMemOperand(), DAG);
7072 EVT VT =
N->getValueType(0);
7073 unsigned CondCode =
N->getConstantOperandVal(3);
7084 EVT CmpVT =
LHS.getValueType();
7085 if (CmpVT == MVT::i16 && !TLI.
isTypeLegal(MVT::i16)) {
7086 unsigned PromoteOp =
7106 EVT VT =
N->getValueType(0);
7108 unsigned CondCode =
N->getConstantOperandVal(3);
7117 if (CmpVT == MVT::f16 && !TLI.
isTypeLegal(CmpVT)) {
7118 Src0 = DAG.
getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7119 Src1 = DAG.
getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7126 SDValue SetCC = DAG.
getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7135 EVT VT =
N->getValueType(0);
7144 Op0 = DAG.
getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
7145 Op1 = DAG.
getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
7159 Exec = AMDGPU::EXEC_LO;
7161 Exec = AMDGPU::EXEC;
7178 EVT VT =
N->getValueType(0);
7180 unsigned IID =
N->getConstantOperandVal(0);
7181 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7182 IID == Intrinsic::amdgcn_permlanex16;
7183 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7184 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7188 unsigned SplitSize = 32;
7189 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7190 ST->hasDPALU_DPP() &&
7198 case Intrinsic::amdgcn_permlane16:
7199 case Intrinsic::amdgcn_permlanex16:
7200 case Intrinsic::amdgcn_update_dpp:
7205 case Intrinsic::amdgcn_writelane:
7208 case Intrinsic::amdgcn_readlane:
7209 case Intrinsic::amdgcn_set_inactive:
7210 case Intrinsic::amdgcn_set_inactive_chain_arg:
7211 case Intrinsic::amdgcn_mov_dpp8:
7214 case Intrinsic::amdgcn_readfirstlane:
7215 case Intrinsic::amdgcn_permlane64:
7223 std::reverse(Operands.begin(), Operands.end());
7225 if (SDNode *GL = N->getGluedNode()) {
7226 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7227 GL = GL->getOperand(0).getNode();
7237 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7238 IID == Intrinsic::amdgcn_mov_dpp8 ||
7239 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7240 Src1 = N->getOperand(2);
7241 if (IID == Intrinsic::amdgcn_writelane ||
7242 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7243 Src2 = N->getOperand(3);
7246 if (ValSize == SplitSize) {
7256 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7261 if (IID == Intrinsic::amdgcn_writelane) {
7266 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7268 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7271 if (ValSize % SplitSize != 0)
7275 EVT VT = N->getValueType(0);
7279 unsigned NumOperands = N->getNumOperands();
7281 SDNode *GL = N->getGluedNode();
7286 for (unsigned i = 0; i != NE; ++i) {
7287 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7289 SDValue Operand = N->getOperand(j);
7298 Operands[j] = Operand;
7303 Operands[NumOperands - 1] =
7304 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7319 if (SplitSize == 32) {
7321 return unrollLaneOp(LaneOp.getNode());
7327 unsigned SubVecNumElt =
7331 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7332 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7336 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7341 if (IID == Intrinsic::amdgcn_writelane)
7346 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7347 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7348 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7349 EltIdx += SubVecNumElt;
7363 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7366 if (IID == Intrinsic::amdgcn_writelane)
7369 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
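// Sketch, hypothetical callback: the lane intrinsics (readlane, writelane,
// permlane, update_dpp, ...) operate on 32-bit registers, so wider values are
// bitcast to dword pieces, each piece is run through the lane op, and the
// pieces are reassembled. The per-piece round trip for a 64-bit value:
#include <cstdint>

uint64_t applyLaneOp64(uint64_t Value, uint32_t (*LaneOp32)(uint32_t)) {
  uint32_t Pieces[2] = {uint32_t(Value), uint32_t(Value >> 32)};
  for (uint32_t &Piece : Pieces)
    Piece = LaneOp32(Piece);                        // one 32-bit lane op per piece
  return (uint64_t(Pieces[1]) << 32) | Pieces[0];   // reassemble the result
}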
7377 switch (N->getOpcode()) {
7389 unsigned IID =
N->getConstantOperandVal(0);
7391 case Intrinsic::amdgcn_make_buffer_rsrc:
7392 Results.push_back(lowerPointerAsRsrcIntrin(
N, DAG));
7394 case Intrinsic::amdgcn_cvt_pkrtz: {
7399 DAG.
getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7400 Results.push_back(DAG.
getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7403 case Intrinsic::amdgcn_cvt_pknorm_i16:
7404 case Intrinsic::amdgcn_cvt_pknorm_u16:
7405 case Intrinsic::amdgcn_cvt_pk_i16:
7406 case Intrinsic::amdgcn_cvt_pk_u16: {
7412 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7413 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7414 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7415 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7416 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7417 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7419 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7421 EVT VT =
N->getValueType(0);
7426 Results.push_back(DAG.
getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7430 case Intrinsic::amdgcn_s_buffer_load: {
7436 if (!Subtarget->hasScalarSubwordLoads())
7442 EVT VT =
Op.getValueType();
7443 assert(VT == MVT::i8 &&
"Expected 8-bit s_buffer_load intrinsics.\n");
7455 if (!
Offset->isDivergent()) {
7474 LoadVal = handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
7479 case Intrinsic::amdgcn_dead: {
7480 for (
unsigned I = 0, E =
N->getNumValues();
I < E; ++
I)
7491 for (
unsigned I = 0;
I < Res.getNumOperands();
I++) {
7492 Results.push_back(Res.getOperand(
I));
7496 Results.push_back(Res.getValue(1));
7505 EVT VT =
N->getValueType(0);
7510 EVT SelectVT = NewVT;
7511 if (NewVT.
bitsLT(MVT::i32)) {
7514 SelectVT = MVT::i32;
7520 if (NewVT != SelectVT)
7526 if (
N->getValueType(0) != MVT::v2f16)
7530 SDValue BC = DAG.
getNode(ISD::BITCAST, SL, MVT::i32,
N->getOperand(0));
7538 if (
N->getValueType(0) != MVT::v2f16)
7542 SDValue BC = DAG.
getNode(ISD::BITCAST, SL, MVT::i32,
N->getOperand(0));
7550 if (
N->getValueType(0) != MVT::f16)
7565 if (U.get() !=
Value)
7568 if (U.getUser()->getOpcode() == Opcode)
7574unsigned SITargetLowering::isCFIntrinsic(
const SDNode *Intr)
const {
7577 case Intrinsic::amdgcn_if:
7578 return AMDGPUISD::IF;
7579 case Intrinsic::amdgcn_else:
7580 return AMDGPUISD::ELSE;
7581 case Intrinsic::amdgcn_loop:
7582 return AMDGPUISD::LOOP;
7583 case Intrinsic::amdgcn_end_cf:
7603 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7630 SDNode *Intr = BRCOND.getOperand(1).getNode();
7647 Intr =
LHS.getNode();
7655 assert(BR &&
"brcond missing unconditional branch user");
7660 unsigned CFNode = isCFIntrinsic(Intr);
7680 Ops.push_back(Target);
7703 for (
unsigned i = 1, e = Intr->
getNumValues() - 1; i != e; ++i) {
7722 MVT VT =
Op.getSimpleValueType();
7725 if (
Op.getConstantOperandVal(0) != 0)
7729 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7731 if (
Info->isEntryFunction())
7748 return Op.getValueType().bitsLE(VT)
7756 EVT DstVT =
Op.getValueType();
7763 unsigned Opc =
Op.getOpcode();
7775 EVT SrcVT = Src.getValueType();
7776 EVT DstVT =
Op.getValueType();
7779 assert(Subtarget->hasCvtPkF16F32Inst() &&
"support v_cvt_pk_f16_f32");
7782 return SrcVT == MVT::v2f32 ?
Op : splitFP_ROUNDVectorOp(
Op, DAG);
7789 if (DstVT == MVT::f16) {
7794 if (!Subtarget->has16BitInsts()) {
7797 return DAG.
getNode(ISD::BITCAST,
DL, MVT::f16, Trunc);
7799 if (
Op->getFlags().hasApproximateFuncs()) {
7806 return DAG.
getNode(ISD::BITCAST,
DL, MVT::f16, Trunc);
7810 "custom lower FP_ROUND for f16 or bf16");
7811 assert(Subtarget->hasBF16ConversionInsts() &&
"f32 -> bf16 is legal");
7824 EVT VT =
Op.getValueType();
7826 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7827 bool IsIEEEMode =
Info->getMode().IEEE;
7836 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7843SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(
SDValue Op,
7845 EVT VT =
Op.getValueType();
7847 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7848 bool IsIEEEMode =
Info->getMode().IEEE;
7853 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7861 EVT VT =
Op.getValueType();
7865 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7866 !Subtarget->hasMinimum3Maximum3F16() &&
7867 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7868 "should not need to widen f16 minimum/maximum to v2f16");
7882 DAG.
getNode(
Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7890 EVT VT =
Op.getValueType();
7894 EVT ExpVT =
Exp.getValueType();
7895 if (ExpVT == MVT::i16)
7916 {
Op.getOperand(0),
Op.getOperand(1), TruncExp});
7919 return DAG.
getNode(ISD::FLDEXP,
DL, VT,
Op.getOperand(0), TruncExp);
7923 switch (
Op->getOpcode()) {
7953 DAGCombinerInfo &DCI)
const {
7954 const unsigned Opc =
Op.getOpcode();
7962 :
Op->getOperand(0).getValueType();
7965 if (DCI.isBeforeLegalizeOps() ||
7969 auto &DAG = DCI.DAG;
7975 LHS =
Op->getOperand(1);
7976 RHS =
Op->getOperand(2);
7978 LHS =
Op->getOperand(0);
7979 RHS =
Op->getOperand(1);
8018 if (MagVT == SignVT)
8025 SDValue SignAsInt32 = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
8028 SDValue SignAsHalf16 = DAG.
getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
8035 EVT VT = Op.getValueType();
8041 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
8068 if (Op->isDivergent())
8081 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8083 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
8086 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8088 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
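// Sketch: the known-bits checks above let a uniform 64-bit multiply pick a
// cheaper scalar form. If both operands have at least 32 leading zeros the
// product fits the unsigned 32x32->64 multiply; with at least 33 sign bits the
// signed variant suffices. In the real lowering these counts come from DAG
// known-bits analysis; here they are computed from constants for illustration.
#include <bit>
#include <cstdint>

enum class MulKind { Full64, Unsigned32x32, Signed32x32 };

static unsigned signBits64(int64_t V) {
  uint64_t U = uint64_t(V);
  return V < 0 ? unsigned(std::countl_one(U)) : unsigned(std::countl_zero(U));
}

MulKind classifyMul64(uint64_t A, uint64_t B) {
  if (std::countl_zero(A) >= 32 && std::countl_zero(B) >= 32)
    return MulKind::Unsigned32x32;  // both fit in 32 bits: unsigned narrow form
  if (signBits64(int64_t(A)) >= 33 && signBits64(int64_t(B)) >= 33)
    return MulKind::Signed32x32;    // both are sign-extended 32-bit values
  return MulKind::Full64;           // keep the full 64-bit multiply
}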
8094 EVT VT = Op.getValueType();
8101 const APInt &C = RHSC->getAPIntValue();
8103 if (C.isPowerOf2()) {
8105 bool UseArithShift = isSigned && !C.isMinSignedValue();
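// Sketch: a multiply-with-overflow against a power-of-two constant is lowered
// as a shift, with overflow detected by shifting the result back and comparing
// to the original operand (an arithmetic shift in the signed case, except for
// the minimum signed value). The unsigned form, for a constant 2^K with K < 64:
#include <cstdint>
#include <utility>

std::pair<uint64_t, bool> umuloByPow2(uint64_t X, unsigned K) {
  uint64_t Product = X << K;            // multiply by 2^K
  bool Overflow = (Product >> K) != X;  // high bits were shifted out
  return {Product, Overflow};
}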
8132 if (Op->isDivergent()) {
8136 if (Subtarget->hasSMulHi()) {
8147 if (!Subtarget->isTrapHandlerEnabled() ||
8149 return lowerTrapEndpgm(
Op, DAG);
8151 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(
Op, DAG)
8152 : lowerTrapHsaQueuePtr(
Op, DAG);
8158 return DAG.
getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8162SITargetLowering::loadImplicitKernelArgument(
SelectionDAG &DAG,
MVT VT,
8164 ImplicitParameter Param)
const {
8168 MachinePointerInfo PtrInfo =
8185 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
8188 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8191 if (UserSGPR == AMDGPU::NoRegister) {
8208 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8217 if (Subtarget->hasPrivEnabledTrap2NopBug())
8218 return DAG.
getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8222 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8230 if (!Subtarget->isTrapHandlerEnabled() ||
8234 "debugtrap handler not supported",
8242 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8245SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
8247 if (Subtarget->hasApertureRegs()) {
8249 ? AMDGPU::SRC_SHARED_BASE
8250 : AMDGPU::SRC_PRIVATE_BASE;
8251 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8252 !Subtarget->hasGloballyAddressableScratch()) &&
8253 "Cannot use src_private_base with globally addressable scratch!");
8274 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
8278 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8280 if (UserSGPR == AMDGPU::NoRegister) {
8325 const AMDGPUTargetMachine &TM =
8328 unsigned DestAS, SrcAS;
8330 bool IsNonNull = false;
8332 SrcAS = ASC->getSrcAddressSpace();
8333 Src = ASC->getOperand(0);
8334 DestAS = ASC->getDestAddressSpace();
8337 Op.getConstantOperandVal(0) ==
8338 Intrinsic::amdgcn_addrspacecast_nonnull);
8339 Src = Op->getOperand(1);
8340 SrcAS = Op->getConstantOperandVal(2);
8341 DestAS = Op->getConstantOperandVal(3);
8354 Subtarget->hasGloballyAddressableScratch()) {
8359 AMDGPU::S_MOV_B32, SL, MVT::i32,
8360 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8368 unsigned NullVal = TM.getNullPointerValue(DestAS);
8383 Subtarget->hasGloballyAddressableScratch()) {
8392 if (Subtarget->isWave64())
8398 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8401 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8406 AMDGPU::S_MOV_B64, SL, MVT::i64,
8407 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8409 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8411 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8413 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8419 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8431 Op.getValueType() == MVT::i64) {
8432 const SIMachineFunctionInfo *Info =
8434 if (Info->get32BitAddressHighBits() == 0)
8439 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
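// Sketch with hypothetical aperture/null values: casting a 32-bit LDS or
// private pointer to a 64-bit flat pointer pairs the low 32 bits with the
// segment aperture in the high half, while a null pointer in the source
// address space must stay null in flat space.
#include <cstdint>

uint64_t segmentToFlat(uint32_t SegPtr, uint32_t ApertureHi,
                       uint32_t SegmentNull, uint64_t FlatNull) {
  if (SegPtr == SegmentNull)                     // null maps to flat null
    return FlatNull;
  return (uint64_t(ApertureHi) << 32) | SegPtr;  // {aperture, offset} pair
}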
8443 Src.getValueType() == MVT::i64)
8471 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8476 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8478 MVT::i32, InsNumElts / 2);
8480 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8481 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8483 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8485 if (InsNumElts == 2) {
8495 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8498 for (unsigned I = 0; I != InsNumElts; ++I) {
8521 if (NumElts == 4 && EltSize == 16 && KIdx) {
8529 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8530 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8532 unsigned Idx = KIdx->getZExtValue();
8533 bool InsertLo = Idx < 2;
8536 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8537 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8539 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8543 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8556 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8584 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
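// Sketch: for small vectors a dynamic INSERT_VECTOR_ELT is done in integer
// arithmetic. The vector is bitcast to a scalar of at most 64 bits, the index
// is scaled to a bit offset, and the new element is merged in with a shifted
// mask (the BFI pattern). Modelled for 16-bit elements in a 64-bit vector
// (assumes Idx < 4):
#include <cstdint>

uint64_t insertElt16(uint64_t Vec, uint16_t Elt, unsigned Idx) {
  unsigned BitOffset = Idx * 16;                  // scale index to bits
  uint64_t Mask = uint64_t(0xFFFF) << BitOffset;  // bits being replaced
  uint64_t Ins = uint64_t(Elt) << BitOffset;      // element moved into position
  return (Vec & ~Mask) | Ins;                     // bitfield insert
}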
8591 EVT ResultVT =
Op.getValueType();
8604 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
8607 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8611 if (VecSize == 128) {
8619 }
else if (VecSize == 256) {
8622 for (
unsigned P = 0;
P < 4; ++
P) {
8628 Parts[0], Parts[1]));
8630 Parts[2], Parts[3]));
8636 for (
unsigned P = 0;
P < 8; ++
P) {
8643 Parts[0], Parts[1], Parts[2], Parts[3]));
8646 Parts[4], Parts[5], Parts[6], Parts[7]));
8666 Src = DAG.
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8681 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8683 return DAG.
getNode(ISD::BITCAST, SL, ResultVT, Result);
8691 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8696 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8697 !(Mask[Elt + 1] & 1);
8703 EVT ResultVT =
Op.getValueType();
8706 const int NewSrcNumElts = 2;
8708 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
8724 const bool ShouldUseConsecutiveExtract = EltVT.
getSizeInBits() == 16;
8746 if (ShouldUseConsecutiveExtract &&
8749 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8750 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8762 if (Idx0 >= SrcNumElts) {
8767 if (Idx1 >= SrcNumElts) {
8772 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8773 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8781 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8782 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8787 if (SubVec0 != SubVec1) {
8788 NewMaskIdx1 += NewSrcNumElts;
8795 {NewMaskIdx0, NewMaskIdx1});
8800 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8801 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8802 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8803 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8822 EVT ResultVT =
Op.getValueType();
8838 EVT VT =
Op.getValueType();
8840 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8841 assert(!Subtarget->hasVOP3PInsts() &&
"this should be legal");
8850 return DAG.
getNode(ISD::BITCAST, SL, VT, ExtLo);
8859 return DAG.
getNode(ISD::BITCAST, SL, VT, ShlHi);
8866 return DAG.
getNode(ISD::BITCAST, SL, VT,
Or);
8875 for (
unsigned P = 0;
P < NumParts; ++
P) {
8877 PartVT, SL, {
Op.getOperand(
P * 2),
Op.getOperand(
P * 2 + 1)});
8883 return DAG.
getNode(ISD::BITCAST, SL, VT, Blend);
8896 if (!Subtarget->isAmdHsaOS())
8939 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET64,
DL, PtrVT, Ptr);
8948 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET,
DL, PtrVT, PtrLo, PtrHi);
8956 EVT PtrVT =
Op.getValueType();
8958 const GlobalValue *GV = GSD->
getGlobal();
8972 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
8987 return DAG.
getNode(AMDGPUISD::LDS,
DL, MVT::i32, GA);
8990 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8991 if (Subtarget->has64BitLiterals()) {
9022 MachinePointerInfo PtrInfo =
9050 SDValue Param = lowerKernargMemParameter(
9061 "non-hsa intrinsic with hsa target",
DL.getDebugLoc()));
9069 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
9077 unsigned NumElts = Elts.
size();
9079 if (NumElts <= 12) {
9088 for (
unsigned i = 0; i < Elts.
size(); ++i) {
9094 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
9104 EVT SrcVT = Src.getValueType();
9125 bool Unpacked,
bool IsD16,
int DMaskPop,
9126 int NumVDataDwords,
bool IsAtomicPacked16Bit,
9130 EVT ReqRetVT = ResultTypes[0];
9132 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9133 ? (ReqRetNumElts + 1) / 2
9136 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9147 if (DMaskPop > 0 &&
Data.getValueType() != MaskPopVT) {
9158 if (DataDwordVT.
isVector() && !IsAtomicPacked16Bit)
9160 NumDataDwords - MaskPopDwords);
9165 EVT LegalReqRetVT = ReqRetVT;
9167 if (!
Data.getValueType().isInteger())
9169 Data.getValueType().changeTypeToInteger(),
Data);
9190 if (Result->getNumValues() == 1)
9197 SDValue *LWE,
bool &IsTexFail) {
9217 unsigned DimIdx,
unsigned EndIdx,
9218 unsigned NumGradients) {
9220 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
9228 if (((
I + 1) >= EndIdx) ||
9229 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
9230 I == DimIdx + NumGradients - 1))) {
9252 !
Op.getNode()->hasAnyUseOfValue(0))
9254 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9264 ResultTypes.erase(&ResultTypes[0]);
9270 int NumVDataDwords = 0;
9271 bool AdjustRetType =
false;
9272 bool IsAtomicPacked16Bit =
false;
9275 const unsigned ArgOffset = WithChain ? 2 : 1;
9278 unsigned DMaskLanes = 0;
9280 if (BaseOpcode->
Atomic) {
9281 VData =
Op.getOperand(2);
9283 IsAtomicPacked16Bit =
9284 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9285 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
9286 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
9287 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9298 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9300 DMask = Is64Bit ? 0xf : 0x3;
9301 NumVDataDwords = Is64Bit ? 4 : 2;
9303 DMask = Is64Bit ? 0x3 : 0x1;
9304 NumVDataDwords = Is64Bit ? 2 : 1;
9307 DMask =
Op->getConstantOperandVal(ArgOffset + Intr->
DMaskIndex);
9310 if (BaseOpcode->
Store) {
9311 VData =
Op.getOperand(2);
9315 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
9319 VData = handleD16VData(VData, DAG,
true);
9322 NumVDataDwords = (VData.
getValueType().getSizeInBits() + 31) / 32;
9323 }
else if (!BaseOpcode->
NoReturn) {
9328 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
9336 (!LoadVT.
isVector() && DMaskLanes > 1))
9342 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9343 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9344 NumVDataDwords = (DMaskLanes + 1) / 2;
9346 NumVDataDwords = DMaskLanes;
9348 AdjustRetType = true;
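// Sketch: when the target packs D16 image results, two 16-bit components share
// one dword, so the result needs (DMaskLanes + 1) / 2 registers; unpacked D16
// keeps one component per dword, as in the branch above.
#include <cassert>

unsigned imageD16ResultDwords(unsigned DMaskLanes, bool UnpackedD16) {
  assert(DMaskLanes >= 1 && DMaskLanes <= 4 && "dmask selects 1-4 components");
  return UnpackedD16 ? DMaskLanes             // one component per dword
                     : (DMaskLanes + 1) / 2;  // two components packed per dword
}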
9352 unsigned VAddrEnd = ArgOffset + Intr->
VAddrEnd;
9359 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9360 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9362 VAddrVT =
Op.getOperand(ArgOffset + Intr->
CoordStart).getSimpleValueType();
9364 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9365 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9369 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
9375 {
Op.getOperand(ArgOffset +
I), DAG.
getPOISON(MVT::f16)});
9379 "Bias needs to be converted to 16 bit in A16 mode");
9384 if (BaseOpcode->
Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
9388 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
9389 "require 16 bit args for both gradients and addresses");
9394 if (!
ST->hasA16()) {
9395 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
9396 "support 16 bit addresses\n");
9406 if (BaseOpcode->
Gradients && IsG16 &&
ST->hasG16()) {
9408 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9410 IntrOpcode = G16MappingInfo->
G16;
9433 for (
unsigned I = ArgOffset + Intr->
CoordStart;
I < VAddrEnd;
I++)
9451 const unsigned NSAMaxSize =
ST->getNSAMaxSize(BaseOpcode->
Sampler);
9452 const bool HasPartialNSAEncoding =
ST->hasPartialNSAEncoding();
9453 const bool UseNSA =
ST->hasNSAEncoding() &&
9454 VAddrs.
size() >=
ST->getNSAThreshold(MF) &&
9455 (VAddrs.
size() <= NSAMaxSize || HasPartialNSAEncoding);
9456 const bool UsePartialNSA =
9457 UseNSA && HasPartialNSAEncoding && VAddrs.
size() > NSAMaxSize;
9460 if (UsePartialNSA) {
9462 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9463 }
else if (!UseNSA) {
9473 uint64_t UnormConst =
9474 Op.getConstantOperandVal(ArgOffset + Intr->
UnormIndex);
9476 Unorm = UnormConst ? True : False;
9482 bool IsTexFail =
false;
9483 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9494 NumVDataDwords += 1;
9495 AdjustRetType =
true;
9500 if (AdjustRetType) {
9503 if (DMaskLanes == 0 && !BaseOpcode->
Store) {
9512 MVT::i32, NumVDataDwords)
9515 ResultTypes[0] = NewVT;
9516 if (ResultTypes.size() == 3) {
9520 ResultTypes.erase(&ResultTypes[1]);
9534 Ops.push_back(VData);
9535 if (UsePartialNSA) {
9537 Ops.push_back(VAddr);
9541 Ops.push_back(VAddr);
9544 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9546 Ops.push_back(Rsrc);
9551 Ops.push_back(Samp);
9556 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
9557 Ops.push_back(Unorm);
9559 Ops.push_back(IsA16 &&
9560 ST->hasFeature(AMDGPU::FeatureR128A16)
9564 Ops.push_back(IsA16 ? True : False);
9566 if (!Subtarget->hasGFX90AInsts())
9571 "TFE is not supported on this GPU",
DL.getDebugLoc()));
9574 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
9577 Ops.push_back(DimInfo->
DA ? True : False);
9579 Ops.push_back(IsD16 ? True : False);
9581 Ops.push_back(
Op.getOperand(0));
9583 int NumVAddrDwords =
9589 NumVDataDwords, NumVAddrDwords);
9590 }
else if (IsGFX11Plus) {
9592 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9593 : AMDGPU::MIMGEncGfx11Default,
9594 NumVDataDwords, NumVAddrDwords);
9595 }
else if (IsGFX10Plus) {
9597 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9598 : AMDGPU::MIMGEncGfx10Default,
9599 NumVDataDwords, NumVAddrDwords);
9601 if (Subtarget->hasGFX90AInsts()) {
9603 NumVDataDwords, NumVAddrDwords);
9607 "requested image instruction is not supported on this GPU",
9612 for (EVT VT : OrigResultTypes) {
9613 if (VT == MVT::Other)
9614 RetValues[Idx++] =
Op.getOperand(0);
9625 NumVDataDwords, NumVAddrDwords);
9628 NumVDataDwords, NumVAddrDwords);
9635 MachineMemOperand *MemRef = MemOp->getMemOperand();
9654 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9655 NumVDataDwords, IsAtomicPacked16Bit,
DL);
9668 MachinePointerInfo(),
9673 if (!
Offset->isDivergent()) {
9680 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9689 !Subtarget->hasScalarDwordx3Loads()) {
9693 AMDGPUISD::SBUFFER_LOAD,
DL, DAG.
getVTList(WidenedVT),
Ops, WidenedVT,
9716 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9718 return handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
9722 unsigned NumLoads = 1;
9728 if (NumElts == 8 || NumElts == 16) {
9729 NumLoads = NumElts / 4;
9733 SDVTList VTList = DAG.
getVTList({LoadVT, MVT::Other});
9738 NumLoads > 1 ?
Align(16 * NumLoads) :
Align(4));
9740 uint64_t InstOffset =
Ops[5]->getAsZExtVal();
9741 for (
unsigned i = 0; i < NumLoads; ++i) {
9743 Loads.
push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD,
DL, VTList,
Ops,
9747 if (NumElts == 8 || NumElts == 16)
9755 if (!Subtarget->hasArchitectedSGPRs())
9760 return DAG.
getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9767 unsigned Width)
const {
9769 using namespace AMDGPU::Hwreg;
9771 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9810 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9812 EVT VT = Op.getValueType();
9814 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9818 switch (IntrinsicID) {
9819 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9822 return getPreloadedValue(DAG, *MFI, VT,
9825 case Intrinsic::amdgcn_dispatch_ptr:
9826 case Intrinsic::amdgcn_queue_ptr: {
9827 if (!Subtarget->isAmdHsaOrMesa(MF.
getFunction())) {
9829 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
9834 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9837 return getPreloadedValue(DAG, *MFI, VT, RegID);
9839 case Intrinsic::amdgcn_implicitarg_ptr: {
9841 return getImplicitArgPtr(DAG,
DL);
9842 return getPreloadedValue(DAG, *MFI, VT,
9845 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9851 return getPreloadedValue(DAG, *MFI, VT,
9854 case Intrinsic::amdgcn_dispatch_id: {
9857 case Intrinsic::amdgcn_rcp:
9858 return DAG.
getNode(AMDGPUISD::RCP,
DL, VT,
Op.getOperand(1));
9859 case Intrinsic::amdgcn_rsq:
9860 return DAG.
getNode(AMDGPUISD::RSQ,
DL, VT,
Op.getOperand(1));
9861 case Intrinsic::amdgcn_rsq_legacy:
9865 case Intrinsic::amdgcn_rcp_legacy:
9868 return DAG.
getNode(AMDGPUISD::RCP_LEGACY,
DL, VT,
Op.getOperand(1));
9869 case Intrinsic::amdgcn_rsq_clamp: {
9871 return DAG.
getNode(AMDGPUISD::RSQ_CLAMP,
DL, VT,
Op.getOperand(1));
9880 return DAG.
getNode(ISD::FMAXNUM,
DL, VT, Tmp,
9883 case Intrinsic::r600_read_ngroups_x:
9884 if (Subtarget->isAmdHsaOS())
9887 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9890 case Intrinsic::r600_read_ngroups_y:
9891 if (Subtarget->isAmdHsaOS())
9894 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9897 case Intrinsic::r600_read_ngroups_z:
9898 if (Subtarget->isAmdHsaOS())
9901 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9904 case Intrinsic::r600_read_local_size_x:
9905 if (Subtarget->isAmdHsaOS())
9908 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9910 case Intrinsic::r600_read_local_size_y:
9911 if (Subtarget->isAmdHsaOS())
9914 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9916 case Intrinsic::r600_read_local_size_z:
9917 if (Subtarget->isAmdHsaOS())
9920 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9922 case Intrinsic::amdgcn_workgroup_id_x:
9923 return lowerWorkGroupId(DAG, *MFI, VT,
9927 case Intrinsic::amdgcn_workgroup_id_y:
9928 return lowerWorkGroupId(DAG, *MFI, VT,
9932 case Intrinsic::amdgcn_workgroup_id_z:
9933 return lowerWorkGroupId(DAG, *MFI, VT,
9937 case Intrinsic::amdgcn_cluster_id_x:
9938 return Subtarget->hasClusters()
9939 ? getPreloadedValue(DAG, *MFI, VT,
9941 : DAG.getPOISON(VT);
9942 case Intrinsic::amdgcn_cluster_id_y:
9943 return Subtarget->hasClusters()
9944 ? getPreloadedValue(DAG, *MFI, VT,
9947 case Intrinsic::amdgcn_cluster_id_z:
9948 return Subtarget->hasClusters()
9949 ? getPreloadedValue(DAG, *MFI, VT,
9952 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9953 return Subtarget->hasClusters()
9954 ? getPreloadedValue(
9958 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9959 return Subtarget->hasClusters()
9960 ? getPreloadedValue(
9964 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9965 return Subtarget->hasClusters()
9966 ? getPreloadedValue(
9970 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9971 return Subtarget->hasClusters()
9974 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9975 return Subtarget->hasClusters()
9976 ? getPreloadedValue(
9980 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9981 return Subtarget->hasClusters()
9982 ? getPreloadedValue(
9986 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9987 return Subtarget->hasClusters()
9988 ? getPreloadedValue(
9992 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
9993 return Subtarget->hasClusters()
9994 ? getPreloadedValue(
9998 case Intrinsic::amdgcn_wave_id:
9999 return lowerWaveID(DAG,
Op);
10000 case Intrinsic::amdgcn_lds_kernel_id: {
10002 return getLDSKernelId(DAG,
DL);
10003 return getPreloadedValue(DAG, *MFI, VT,
10006 case Intrinsic::amdgcn_workitem_id_x:
10007 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
10008 case Intrinsic::amdgcn_workitem_id_y:
10009 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
10010 case Intrinsic::amdgcn_workitem_id_z:
10011 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
10012 case Intrinsic::amdgcn_wavefrontsize:
10014 SDLoc(
Op), MVT::i32);
10015 case Intrinsic::amdgcn_s_buffer_load: {
10016 unsigned CPol =
Op.getConstantOperandVal(3);
10023 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
10024 Op.getOperand(3), DAG);
10026 case Intrinsic::amdgcn_fdiv_fast:
10027 return lowerFDIV_FAST(
Op, DAG);
10028 case Intrinsic::amdgcn_sin:
10029 return DAG.
getNode(AMDGPUISD::SIN_HW,
DL, VT,
Op.getOperand(1));
10031 case Intrinsic::amdgcn_cos:
10032 return DAG.
getNode(AMDGPUISD::COS_HW,
DL, VT,
Op.getOperand(1));
10034 case Intrinsic::amdgcn_mul_u24:
10035 return DAG.
getNode(AMDGPUISD::MUL_U24,
DL, VT,
Op.getOperand(1),
10037 case Intrinsic::amdgcn_mul_i24:
10038 return DAG.
getNode(AMDGPUISD::MUL_I24,
DL, VT,
Op.getOperand(1),
10041 case Intrinsic::amdgcn_log_clamp: {
10047 case Intrinsic::amdgcn_fract:
10048 return DAG.
getNode(AMDGPUISD::FRACT,
DL, VT,
Op.getOperand(1));
10050 case Intrinsic::amdgcn_class:
10051 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, VT,
Op.getOperand(1),
10053 case Intrinsic::amdgcn_div_fmas:
10054 return DAG.
getNode(AMDGPUISD::DIV_FMAS,
DL, VT,
Op.getOperand(1),
10055 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
10057 case Intrinsic::amdgcn_div_fixup:
10058 return DAG.
getNode(AMDGPUISD::DIV_FIXUP,
DL, VT,
Op.getOperand(1),
10059 Op.getOperand(2),
Op.getOperand(3));
10061 case Intrinsic::amdgcn_div_scale: {
10067 SDValue Denominator =
Op.getOperand(2);
10074 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
10076 return DAG.
getNode(AMDGPUISD::DIV_SCALE,
DL,
Op->getVTList(), Src0,
10077 Denominator, Numerator);
10079 case Intrinsic::amdgcn_icmp: {
10081 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
10082 Op.getConstantOperandVal(2) == 0 &&
10087 case Intrinsic::amdgcn_fcmp: {
10090 case Intrinsic::amdgcn_ballot:
10092 case Intrinsic::amdgcn_fmed3:
10093 return DAG.
getNode(AMDGPUISD::FMED3,
DL, VT,
Op.getOperand(1),
10094 Op.getOperand(2),
Op.getOperand(3));
10095 case Intrinsic::amdgcn_fdot2:
10096 return DAG.
getNode(AMDGPUISD::FDOT2,
DL, VT,
Op.getOperand(1),
10097 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
10098 case Intrinsic::amdgcn_fmul_legacy:
10099 return DAG.
getNode(AMDGPUISD::FMUL_LEGACY,
DL, VT,
Op.getOperand(1),
10101 case Intrinsic::amdgcn_sffbh:
10102 return DAG.
getNode(AMDGPUISD::FFBH_I32,
DL, VT,
Op.getOperand(1));
10103 case Intrinsic::amdgcn_sbfe:
10104 return DAG.
getNode(AMDGPUISD::BFE_I32,
DL, VT,
Op.getOperand(1),
10105 Op.getOperand(2),
Op.getOperand(3));
10106 case Intrinsic::amdgcn_ubfe:
10107 return DAG.
getNode(AMDGPUISD::BFE_U32,
DL, VT,
Op.getOperand(1),
10108 Op.getOperand(2),
Op.getOperand(3));
10109 case Intrinsic::amdgcn_cvt_pkrtz:
10110 case Intrinsic::amdgcn_cvt_pknorm_i16:
10111 case Intrinsic::amdgcn_cvt_pknorm_u16:
10112 case Intrinsic::amdgcn_cvt_pk_i16:
10113 case Intrinsic::amdgcn_cvt_pk_u16: {
10115 EVT VT =
Op.getValueType();
10118 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10119 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10120 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10121 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10122 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10123 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10124 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10125 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10127 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10130 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
10133 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
10134 return DAG.
getNode(ISD::BITCAST,
DL, VT, Node);
10136 case Intrinsic::amdgcn_fmad_ftz:
10137 return DAG.
getNode(AMDGPUISD::FMAD_FTZ,
DL, VT,
Op.getOperand(1),
10138 Op.getOperand(2),
Op.getOperand(3));
10140 case Intrinsic::amdgcn_if_break:
10142 Op->getOperand(1),
Op->getOperand(2)),
10145 case Intrinsic::amdgcn_groupstaticsize: {
10151 const GlobalValue *GV =
10157 case Intrinsic::amdgcn_is_shared:
10158 case Intrinsic::amdgcn_is_private: {
10161 DAG.
getNode(ISD::BITCAST,
DL, MVT::v2i32,
Op.getOperand(1));
10165 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10169 Subtarget->hasGloballyAddressableScratch()) {
10172 AMDGPU::S_MOV_B32,
DL, MVT::i32,
10173 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10182 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10185 case Intrinsic::amdgcn_perm:
10186 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
Op.getOperand(1),
10187 Op.getOperand(2),
Op.getOperand(3));
10188 case Intrinsic::amdgcn_reloc_constant: {
10198 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10199 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10200 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10201 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10202 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10203 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10204 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10205 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10206 if (
Op.getOperand(4).getValueType() == MVT::i32)
10212 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10213 Op.getOperand(3), IndexKeyi32);
10215 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10216 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10217 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10218 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10219 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10220 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10221 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10222 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10223 if (
Op.getOperand(4).getValueType() == MVT::i64)
10229 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10230 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10231 Op.getOperand(6)});
10233 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10234 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10235 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10236 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10237 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10238 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10239 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10242 if (
Op.getOperand(6).getValueType() == IndexKeyTy)
10248 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10249 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10250 IndexKey, Op.getOperand(7),
10251 Op.getOperand(8)});
10253 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10254 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10255 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10256 if (
Op.getOperand(6).getValueType() == MVT::i32)
10262 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10263 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10264 IndexKeyi32, Op.getOperand(7)});
10266 case Intrinsic::amdgcn_addrspacecast_nonnull:
10267 return lowerADDRSPACECAST(
Op, DAG);
10268 case Intrinsic::amdgcn_readlane:
10269 case Intrinsic::amdgcn_readfirstlane:
10270 case Intrinsic::amdgcn_writelane:
10271 case Intrinsic::amdgcn_permlane16:
10272 case Intrinsic::amdgcn_permlanex16:
10273 case Intrinsic::amdgcn_permlane64:
10274 case Intrinsic::amdgcn_set_inactive:
10275 case Intrinsic::amdgcn_set_inactive_chain_arg:
10276 case Intrinsic::amdgcn_mov_dpp8:
10277 case Intrinsic::amdgcn_update_dpp:
10279 case Intrinsic::amdgcn_dead: {
10281 for (
const EVT ValTy :
Op.getNode()->values())
10286 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10288 return lowerImage(
Op, ImageDimIntr, DAG,
false);
10299 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10305 unsigned NewOpcode)
const {
10309 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10310 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10328 M->getMemOperand());
10333 unsigned NewOpcode)
const {
10337 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10338 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10356 M->getMemOperand());
10361 unsigned IntrID = Op.getConstantOperandVal(1);
10365 case Intrinsic::amdgcn_ds_ordered_add:
10366 case Intrinsic::amdgcn_ds_ordered_swap: {
10371 unsigned IndexOperand = M->getConstantOperandVal(7);
10372 unsigned WaveRelease = M->getConstantOperandVal(8);
10373 unsigned WaveDone = M->getConstantOperandVal(9);
10375 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10376 IndexOperand &= ~0x3f;
10377 unsigned CountDw = 0;
10380 CountDw = (IndexOperand >> 24) & 0xf;
10381 IndexOperand &= ~(0xf << 24);
10383 if (CountDw < 1 || CountDw > 4) {
10386 Fn,
"ds_ordered_count: dword count must be between 1 and 4",
10387 DL.getDebugLoc()));
10392 if (IndexOperand) {
10395 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10398 if (WaveDone && !WaveRelease) {
10402 Fn,
"ds_ordered_count: wave_done requires wave_release",
10403 DL.getDebugLoc()));
10406 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10407 unsigned ShaderType =
10409 unsigned Offset0 = OrderedCountIndex << 2;
10410 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10413 Offset1 |= (CountDw - 1) << 6;
10416 Offset1 |= ShaderType << 2;
10418 unsigned Offset = Offset0 | (Offset1 << 8);
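// Sketch: ds_ordered_count packs its configuration into the instruction's
// 16-bit offset. Offset0 holds the ordered-count index scaled by 4; offset1
// carries wave_release, wave_done, the shader type, the add/swap selector and
// the dword count, following the shifts used above (the dword-count and
// shader-type fields are only set on some generations in the real lowering).
#include <cstdint>

uint32_t encodeDSOrderedOffset(unsigned OrderedCountIndex, bool WaveRelease,
                               bool WaveDone, bool IsSwap, unsigned CountDw,
                               unsigned ShaderType) {
  uint32_t Offset0 = OrderedCountIndex << 2;  // index, in dword units
  uint32_t Offset1 = (WaveRelease ? 1 : 0) | (WaveDone ? 2 : 0) |
                     (ShaderType << 2) | ((IsSwap ? 1 : 0) << 4) |
                     ((CountDw - 1) << 6);    // dword count, gfx10+ only
  return Offset0 | (Offset1 << 8);            // {offset1, offset0}
}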
10425 M->getVTList(),
Ops,
M->getMemoryVT(),
10426 M->getMemOperand());
10428 case Intrinsic::amdgcn_raw_buffer_load:
10429 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10430 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10431 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10432 case Intrinsic::amdgcn_raw_buffer_load_format:
10433 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10434 const bool IsFormat =
10435 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10436 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10438 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10439 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10453 return lowerIntrinsicLoad(M, IsFormat, DAG,
Ops);
10455 case Intrinsic::amdgcn_struct_buffer_load:
10456 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10457 case Intrinsic::amdgcn_struct_buffer_load_format:
10458 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10459 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10460 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10461 const bool IsFormat =
10462 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10463 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10465 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10466 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10481 case Intrinsic::amdgcn_raw_tbuffer_load:
10482 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10484 EVT LoadVT =
Op.getValueType();
10485 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10486 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10502 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10504 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT,
DL,
10505 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10508 case Intrinsic::amdgcn_struct_tbuffer_load:
10509 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10511 EVT LoadVT =
Op.getValueType();
10512 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10513 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10529 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10531 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT,
DL,
10532 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10535 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10536 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10537 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10538 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10539 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10540 return lowerStructBufferAtomicIntrin(Op, DAG,
10541 AMDGPUISD::BUFFER_ATOMIC_FADD);
10542 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10543 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10544 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10545 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10546 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10547 return lowerStructBufferAtomicIntrin(Op, DAG,
10548 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10549 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10550 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10551 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10552 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10553 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10554 return lowerStructBufferAtomicIntrin(Op, DAG,
10555 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10556 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10557 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10558 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10559 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10560 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10561 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10562 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10563 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10564 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10565 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10566 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10567 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10568 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10569 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10570 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10571 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10572 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10573 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10574 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10575 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10576 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10577 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10578 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10579 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10580 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10581 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10582 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10583 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10584 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10585 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10586 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10587 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10588 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10589 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10590 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10591 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10592 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10593 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10594 return lowerStructBufferAtomicIntrin(Op, DAG,
10595 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10596 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10597 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10598 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10599 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10600 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10601 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10602 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10603 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10604 return lowerStructBufferAtomicIntrin(Op, DAG,
10605 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10606 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10607 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10608 return lowerStructBufferAtomicIntrin(Op, DAG,
10609 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10610 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10611 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10612 return lowerStructBufferAtomicIntrin(Op, DAG,
10613 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10614 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10615 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10616 return lowerStructBufferAtomicIntrin(Op, DAG,
10617 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10618 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10619 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10620 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10621 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10622 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10623 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10624 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10625 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10626 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10627 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10628 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10629 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10630 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10631 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10632 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10633 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
10634 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
10635 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
10636 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
10637 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
10638 return lowerStructBufferAtomicIntrin(Op, DAG,
10639 AMDGPUISD::BUFFER_ATOMIC_CSUB);
10640 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10641 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
10642 return lowerRawBufferAtomicIntrin(Op, DAG,
10643 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10644 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10645 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
10646 return lowerStructBufferAtomicIntrin(Op, DAG,
10647 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10648 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10649 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10650 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10651 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10665 EVT VT = Op.getValueType();
10669 Op->getVTList(), Ops, VT,
10670 M->getMemOperand());
10672 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10673 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10674 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10675 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10689 EVT VT = Op.getValueType();
10693 Op->getVTList(), Ops, VT,
10694 M->getMemOperand());
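// For amdgcn_image_bvh_dual_intersect_ray / amdgcn_image_bvh8_intersect_ray the
// ray operands are collected from the memory intrinsic, RayExtent and
// InstanceMask are packed into one dword, and a gfx12 IMAGE_BVH*_INTERSECT_RAY
// MIMG opcode is selected with fixed vdata/vaddr dword counts (10 and 11 or 12).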
10696 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10697 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10699 SDValue NodePtr = M->getOperand(2);
10700 SDValue RayExtent = M->getOperand(3);
10701 SDValue InstanceMask = M->getOperand(4);
10702 SDValue RayOrigin = M->getOperand(5);
10703 SDValue RayDir = M->getOperand(6);
10705 SDValue TDescr = M->getOperand(8);
10710 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10715 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10716 const unsigned NumVDataDwords = 10;
10717 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10719 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10720 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10721 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10725 Ops.push_back(NodePtr);
10728 {DAG.getBitcast(MVT::i32, RayExtent),
10729 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10730 Ops.push_back(RayOrigin);
10731 Ops.push_back(RayDir);
10732 Ops.push_back(Offsets);
10733 Ops.push_back(TDescr);
10734 Ops.push_back(M->getChain());
10737 MachineMemOperand *MemRef = M->getMemOperand();
10741 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10743 SDValue NodePtr = M->getOperand(2);
10744 SDValue RayExtent = M->getOperand(3);
10745 SDValue RayOrigin = M->getOperand(4);
10746 SDValue RayDir = M->getOperand(5);
10747 SDValue RayInvDir = M->getOperand(6);
10748 SDValue TDescr = M->getOperand(7);
10755 if (!Subtarget->hasGFX10_AEncoding()) {
10765 const unsigned NumVDataDwords = 4;
10766 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10767 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10768 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10771 const unsigned BaseOpcodes[2][2] = {
10772 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10773 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10774 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10778 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10779 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10780 : AMDGPU::MIMGEncGfx10NSA,
10781 NumVDataDwords, NumVAddrDwords);
10785 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10786 : AMDGPU::MIMGEncGfx10Default,
10787 NumVDataDwords, NumVAddrDwords);
10793 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10796 if (Lanes[0].getValueSizeInBits() == 32) {
10797 for (unsigned I = 0; I < 3; ++I)
10804 Ops.push_back(Lanes[2]);
10816 if (UseNSA && IsGFX11Plus) {
10817 Ops.push_back(NodePtr);
10819 Ops.push_back(RayOrigin);
10824 for (unsigned I = 0; I < 3; ++I) {
10827 {DirLanes[I], InvDirLanes[I]})));
10831 Ops.push_back(RayDir);
10832 Ops.push_back(RayInvDir);
10839 Ops.push_back(NodePtr);
10842 packLanes(RayOrigin, true);
10843 packLanes(RayDir, true);
10844 packLanes(RayInvDir, false);
10849 if (NumVAddrDwords > 12) {
10851 Ops.append(16 - Ops.size(), Undef);
10857 Ops.push_back(MergedOps);
10860 Ops.push_back(TDescr);
10862 Ops.push_back(M->getChain());
10865 MachineMemOperand *MemRef = M->getMemOperand();
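// The *_num flavours of the global/flat float min/max atomics map directly onto
// the generic ISD::ATOMIC_LOAD_FMIN / ISD::ATOMIC_LOAD_FMAX nodes built with
// DAG.getAtomic() below.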
10869 case Intrinsic::amdgcn_global_atomic_fmin_num:
10870 case Intrinsic::amdgcn_global_atomic_fmax_num:
10871 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10872 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10879 unsigned Opcode = 0;
10881 case Intrinsic::amdgcn_global_atomic_fmin_num:
10882 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10883 Opcode = ISD::ATOMIC_LOAD_FMIN;
10886 case Intrinsic::amdgcn_global_atomic_fmax_num:
10887 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10888 Opcode = ISD::ATOMIC_LOAD_FMAX;
10894 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10895 Ops, M->getMemOperand());
10897 case Intrinsic::amdgcn_s_get_barrier_state:
10898 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10905 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10906 BarID = (BarID >> 4) & 0x3F;
10907 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10910 Ops.push_back(Chain);
10912 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10913 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10921 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10929 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10930 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10931 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10935 EVT VT = Op->getValueType(0);
10941 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10943 return lowerImage(Op, ImageDimIntr, DAG, true);
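// getMemIntrinsicNode() widens awkward result types before emitting the memory
// node: a TFE result (three VTs) is loaded as value dwords plus one status
// dword, and v3i32/v3f32 results are rounded up to four dwords when the
// subtarget lacks dwordx3 loads.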
10951 SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10958 EVT VT = VTList.VTs[0];
10961 bool IsTFE = VTList.NumVTs == 3;
10964 unsigned NumOpDWords = NumValueDWords + 1;
10966 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10967 MachineMemOperand *OpDWordsMMO =
10969 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10970 OpDWordsVT, OpDWordsMMO, DAG);
10975 NumValueDWords == 1
10984 if (!Subtarget->hasDwordx3LoadStores() &&
10985 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10989 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10991 WidenedMemVT, WidenedMMO);
11001 bool ImageStore) const {
11011 if (Subtarget->hasUnpackedD16VMem()) {
11025 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11036 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
11042 if ((NumElements % 2) == 1) {
11044 unsigned I = Elts.size() / 2;
11060 if (NumElements == 3) {
11070 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
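// The INTRINSIC_VOID path below handles the store-like intrinsics: compressed
// exports, raw/struct tbuffer and buffer stores (with f16 data routed through
// handleD16VData()), and the buffer/global load-to-LDS forms.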
11081 unsigned IntrinsicID = Op.getConstantOperandVal(1);
11084 switch (IntrinsicID) {
11085 case Intrinsic::amdgcn_exp_compr: {
11086 if (!Subtarget->hasCompressedExport()) {
11089 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
11101 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32, Src0),
11102 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32, Src1),
11111 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11115 case Intrinsic::amdgcn_struct_tbuffer_store:
11116 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11118 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11120 VData = handleD16VData(VData, DAG);
11121 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11122 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11136 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11137 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11140 M->getMemoryVT(), M->getMemOperand());
11143 case Intrinsic::amdgcn_raw_tbuffer_store:
11144 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11146 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11148 VData = handleD16VData(VData, DAG);
11149 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11150 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11164 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11165 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11168 M->getMemoryVT(), M->getMemOperand());
11171 case Intrinsic::amdgcn_raw_buffer_store:
11172 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11173 case Intrinsic::amdgcn_raw_buffer_store_format:
11174 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11175 const bool IsFormat =
11176 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11177 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11184 VData = handleD16VData(VData, DAG);
11194 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11195 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11209 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11210 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11215 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11218 M->getMemoryVT(), M->getMemOperand());
11221 case Intrinsic::amdgcn_struct_buffer_store:
11222 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11223 case Intrinsic::amdgcn_struct_buffer_store_format:
11224 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11225 const bool IsFormat =
11226 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11227 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11235 VData = handleD16VData(VData, DAG);
11245 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11246 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11260 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
11261 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11265 EVT VDataType = VData.getValueType().getScalarType();
11267 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11270 M->getMemoryVT(), M->getMemOperand());
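// For the *_load_lds intrinsics the MUBUF LDS-load opcode is chosen from the
// transfer size (1, 2, 4, 12 or 16 bytes) and from whether a vindex and/or
// voffset operand is present (BOTHEN/IDXEN/OFFEN/OFFSET); the dwordx3/dwordx4
// forms additionally require hasLDSLoadB96_B128().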
11272 case Intrinsic::amdgcn_raw_buffer_load_lds:
11273 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11274 case Intrinsic::amdgcn_struct_buffer_load_lds:
11275 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11276 if (!Subtarget->hasVMemToLDSLoad())
11280 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11281 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11282 unsigned OpOffset = HasVIndex ? 1 : 0;
11283 SDValue VOffset = Op.getOperand(5 + OpOffset);
11285 unsigned Size = Op->getConstantOperandVal(4);
11291 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11292 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11293 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11294 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11297 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11298 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11299 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11300 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11303 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11304 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11305 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11306 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11309 if (!Subtarget->hasLDSLoadB96_B128())
11311 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11312 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11313 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11314 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11317 if (!Subtarget->hasLDSLoadB96_B128())
11319 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11320 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11321 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11322 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11330 if (HasVIndex && HasVOffset)
11334 else if (HasVIndex)
11335 Ops.push_back(Op.getOperand(5));
11336 else if (HasVOffset)
11337 Ops.push_back(VOffset);
11339 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11340 Ops.push_back(Rsrc);
11341 Ops.push_back(Op.getOperand(6 + OpOffset));
11342 Ops.push_back(Op.getOperand(7 + OpOffset));
11344 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11357 MachineMemOperand *LoadMMO = M->getMemOperand();
11362 MachinePointerInfo StorePtrI = LoadPtrI;
11386 case Intrinsic::amdgcn_load_to_lds:
11387 case Intrinsic::amdgcn_global_load_lds: {
11388 if (!Subtarget->hasVMemToLDSLoad())
11392 unsigned Size = Op->getConstantOperandVal(4);
11397 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11400 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11403 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11406 if (!Subtarget->hasLDSLoadB96_B128())
11408 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11411 if (!Subtarget->hasLDSLoadB96_B128())
11413 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11429 if (LHS->isDivergent())
11433 RHS.getOperand(0).getValueType() == MVT::i32) {
11436 VOffset = RHS.getOperand(0);
11440 Ops.push_back(Addr);
11448 Ops.push_back(VOffset);
11451 Ops.push_back(Op.getOperand(5));
11453 unsigned Aux = Op.getConstantOperandVal(6);
11461 MachineMemOperand *LoadMMO = M->getMemOperand();
11463 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11464 MachinePointerInfo StorePtrI = LoadPtrI;
11483 case Intrinsic::amdgcn_end_cf:
11485 Op->getOperand(2), Chain),
11487 case Intrinsic::amdgcn_s_barrier_init:
11488 case Intrinsic::amdgcn_s_barrier_signal_var: {
11495 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11496 ? AMDGPU::S_BARRIER_INIT_M0
11497 : AMDGPU::S_BARRIER_SIGNAL_M0;
11512 constexpr unsigned ShAmt = 16;
11519 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11524 case Intrinsic::amdgcn_s_wakeup_barrier: {
11525 if (!Subtarget->hasSWakeupBarrier())
11529 case Intrinsic::amdgcn_s_barrier_join: {
11538 switch (IntrinsicID) {
11541 case Intrinsic::amdgcn_s_barrier_join:
11542 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11544 case Intrinsic::amdgcn_s_wakeup_barrier:
11545 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
11549 unsigned BarID = (BarVal >> 4) & 0x3F;
11552 Ops.push_back(Chain);
11554 switch (IntrinsicID) {
11557 case Intrinsic::amdgcn_s_barrier_join:
11558 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11560 case Intrinsic::amdgcn_s_wakeup_barrier:
11561 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
11572 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11578 case Intrinsic::amdgcn_s_prefetch_data: {
11581 return Op.getOperand(0);
11584 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11586 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11593 Op->getVTList(), Ops, M->getMemoryVT(),
11594 M->getMemOperand());
11596 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11597 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11598 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11607 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11609 return lowerImage(Op, ImageDimIntr, DAG, true);
11625 return PtrVT == MVT::i64;
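// splitBufferOffsets()/setBufferOffsets() split a combined byte offset so that
// as much as possible lands in the MUBUF immediate field: bits above MaxImm
// become Overflow and are moved to the offset register operand. Illustrative
// example only (assuming MaxImm were 4095): a combined offset of 5000 gives
// Overflow = 5000 & ~4095 = 4096 and an immediate of 5000 - 4096 = 904.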
11639std::pair<SDValue, SDValue>
11669 unsigned Overflow = ImmOffset & ~MaxImm;
11670 ImmOffset -= Overflow;
11671 if ((int32_t)Overflow < 0) {
11672 Overflow += ImmOffset;
11677 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11696 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11698 Align Alignment) const {
11700 SDLoc DL(CombinedOffset);
11702 uint32_t Imm = C->getZExtValue();
11703 uint32_t SOffset, ImmOffset;
11704 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11714 uint32_t SOffset, ImmOffset;
11717 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11725 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11734 SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11737 return MaybePointer;
11751 SDValue NumRecords = Op->getOperand(3);
11757 if (Subtarget->has45BitNumRecordsBufferResource()) {
11776 SDValue ExtShiftedStrideVec =
11779 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
11786 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
11788 DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
11790 DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
11795 auto [LowHalf, HighHalf] =
11796 DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11806 NumRecords, Flags);
11809 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11818 bool IsTFE) const {
11823 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
11824 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
11827 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11839 ? AMDGPUISD::BUFFER_LOAD_UBYTE
11840 : AMDGPUISD::BUFFER_LOAD_USHORT;
11842 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11846 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11856 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11860 Ops[1] = BufferStoreExt;
11861 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11862 : AMDGPUISD::BUFFER_STORE_SHORT;
11865 M->getMemOperand());
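// The load lowering below decides per address space whether a wide vector load
// may stay whole: global/constant loads can be scalarized when they are simple
// and at least dword-aligned, private loads are limited by
// getMaxPrivateElementSize(), and anything wider than four dwords (or three
// without dwordx3 support) is split.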
11890 DAGCombinerInfo &DCI) const {
11891 SelectionDAG &DAG = DCI.DAG;
11906 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11913 "unexpected vector extload");
11926 "unexpected fp extload");
11944 DCI.AddToWorklist(Cvt.getNode());
11949 DCI.AddToWorklist(Cvt.getNode());
11952 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11960 if (Info.isEntryFunction())
11961 return Info.getUserSGPRInfo().hasFlatScratchInit();
11969 EVT MemVT = Load->getMemoryVT();
11970 MachineMemOperand *MMO = Load->getMemOperand();
11982 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
12010 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
12011 "Custom lowering for non-i32 vectors hasn't been implemented.");
12014 unsigned AS = Load->getAddressSpace();
12021 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12025 !Subtarget->hasMultiDwordFlatScratchAddressing())
12035 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
12038 Alignment >= Align(4) && NumElements < 32) {
12040 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
12052 if (NumElements > 4)
12055 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12065 switch (Subtarget->getMaxPrivateElementSize()) {
12071 if (NumElements > 2)
12076 if (NumElements > 4)
12079 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12088 auto Flags = Load->getMemOperand()->getFlags();
12090 Load->getAlign(), Flags, &Fast) &&
12099 MemVT, *Load->getMemOperand())) {
12108 EVT VT = Op.getValueType();
12135 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
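// lowerFastUnsafeFDIV() only fires when approximate math is allowed (or for
// f16/bf16 with reciprocal relaxation): 1.0/x and -1.0/x collapse to
// AMDGPUISD::RCP of x (negated for the latter), and the remaining cases use a
// reciprocal-based expansion.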
12145 EVT VT = Op.getValueType();
12146 const SDNodeFlags Flags = Op->getFlags();
12148 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
12154 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12157 if (CLHS->isExactlyValue(1.0)) {
12170 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12174 if (CLHS->isExactlyValue(-1.0)) {
12177 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12183 if (!AllowInaccurateRcp &&
12184 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12198 EVT VT = Op.getValueType();
12199 const SDNodeFlags Flags = Op->getFlags();
12201 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12202 if (!AllowInaccurateDiv)
12223 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12233 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12237 return DAG.getNode(Opcode, SL, VTList,
12246 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12256 Opcode = AMDGPUISD::FMA_W_CHAIN;
12260 return DAG.getNode(Opcode, SL, VTList,
12266 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12267 return FastLowered;
12270 EVT VT = Op.getValueType();
12277 if (VT == MVT::bf16) {
12300 unsigned FMADOpCode =
12302 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12304 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
12307 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12309 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
12310 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12316 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12320 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
12326 SDNodeFlags Flags = Op->getFlags();
12333 const APFloat K0Val(0x1p+96f);
12336 const APFloat K1Val(0x1p-32f);
12363 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12364 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12365 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
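// LowerFDIV32 emits the full-precision sequence: div_scale on both operands, a
// reciprocal seed refined with FMA steps, DIV_FMAS and a final DIV_FIXUP. When
// FP32 denormals are flushed, denormal support is temporarily enabled around
// the core sequence, either via AMDGPUISD::DENORM_MODE or by writing the MODE
// register with S_SETREG_B32, and restored afterwards.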
12370 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12371 return FastLowered;
12377 SDNodeFlags Flags = Op->getFlags();
12378 Flags.setNoFPExcept(true);
12386 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
12395 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12397 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12399 using namespace AMDGPU::Hwreg;
12400 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12404 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12405 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12408 const bool HasDynamicDenormals =
12414 if (!PreservesDenormals) {
12419 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12422 if (HasDynamicDenormals) {
12426 SavedDenormMode = SDValue(GetReg, 0);
12432 SDNode *EnableDenorm;
12433 if (Subtarget->hasDenormModeInst()) {
12434 const SDValue EnableDenormValue =
12437 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12441 const SDValue EnableDenormValue =
12443 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12444 {EnableDenormValue, BitField, Glue});
12454 ApproxRcp, One, NegDivScale0, Flags);
12457 ApproxRcp, Fma0, Flags);
12463 NumeratorScaled, Mul, Flags);
12469 NumeratorScaled, Fma3, Flags);
12471 if (!PreservesDenormals) {
12472 SDNode *DisableDenorm;
12473 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12477 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12479 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12483 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12484 const SDValue DisableDenormValue =
12485 HasDynamicDenormals
12490 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12501 {Fma4, Fma1, Fma3, Scale}, Flags);
12503 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
12507 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12508 return FastLowered;
12516 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12520 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12522 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12540 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12549 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12550 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12570 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12572 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12576 EVT VT = Op.getValueType();
12578 if (VT == MVT::f32)
12579 return LowerFDIV32(Op, DAG);
12581 if (VT == MVT::f64)
12582 return LowerFDIV64(Op, DAG);
12584 if (VT == MVT::f16 || VT == MVT::bf16)
12585 return LowerFDIV16(Op, DAG);
12594 EVT ResultExpVT = Op->getValueType(1);
12595 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12605 if (Subtarget->hasFractBug()) {
12623 EVT VT = Store->getMemoryVT();
12625 if (VT == MVT::i1) {
12629 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12633 Store->getValue().getValueType().getScalarType() == MVT::i32);
12635 unsigned AS = Store->getAddressSpace();
12643 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12647 !Subtarget->hasMultiDwordFlatScratchAddressing())
12654 if (NumElements > 4)
12657 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12661 VT, *Store->getMemOperand()))
12667 switch (Subtarget->getMaxPrivateElementSize()) {
12671 if (NumElements > 2)
12675 if (NumElements > 4 ||
12676 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12684 auto Flags = Store->getMemOperand()->getFlags();
12703 assert(!Subtarget->has16BitInsts());
12704 SDNodeFlags Flags = Op->getFlags();
12706 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12718 SDNodeFlags Flags = Op->getFlags();
12719 MVT VT = Op.getValueType().getSimpleVT();
12749 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12752 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12761 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12827 SDNodeFlags Flags = Op->getFlags();
12873 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12890 EVT VT = Op.getValueType();
12900 if (Subtarget->hasTrigReducedRange()) {
12902 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
12907 switch (Op.getOpcode()) {
12909 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
12911 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
12934 EVT VT = Op.getValueType();
12942 Op->getVTList(), Ops, VT,
12951 SITargetLowering::performUCharToFloatCombine(SDNode *N,
12952 DAGCombinerInfo &DCI) const {
12953 EVT VT = N->getValueType(0);
12955 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12958 SelectionDAG &DAG = DCI.DAG;
12962 EVT SrcVT = Src.getValueType();
12968 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12971 DCI.AddToWorklist(Cvt.getNode());
12974 if (ScalarVT != MVT::f32) {
12986 DAGCombinerInfo &DCI) const {
12993 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
12997 SelectionDAG &DAG = DCI.DAG;
13016 for (unsigned I = 0; I != NumElts; ++I) {
13040 if (NewElts.size() == 1)
13062 for (unsigned I = 0; I != NumElts; ++I) {
13097 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
13099 DAGCombinerInfo &DCI) const {
13116 SelectionDAG &DAG = DCI.DAG;
13129 AM.BaseOffs = Offset.getSExtValue();
13134 EVT VT = N->getValueType(0);
13140 Flags.setNoUnsignedWrap(
13141 N->getFlags().hasNoUnsignedWrap() &&
13153 switch (N->getOpcode()) {
13164 DAGCombinerInfo &DCI) const {
13165 SelectionDAG &DAG = DCI.DAG;
13172 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
13173 N->getMemoryVT(), DCI);
13177 NewOps[PtrIdx] = NewPtr;
13186 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13187 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13196 SDValue SITargetLowering::splitBinaryBitConstantOp(
13200 uint32_t ValLo = Lo_32(Val);
13201 uint32_t ValHi = Hi_32(Val);
13208 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13222 if (V.getValueType() != MVT::i1)
13224 switch (V.getOpcode()) {
13229 case AMDGPUISD::FP_CLASS:
13241 return V.getResNo() == 1;
13243 unsigned IntrinsicID = V.getConstantOperandVal(0);
13244 switch (IntrinsicID) {
13245 case Intrinsic::amdgcn_is_shared:
13246 case Intrinsic::amdgcn_is_private:
13263 if (!(C & 0x000000ff))
13264 ZeroByteMask |= 0x000000ff;
13265 if (!(C & 0x0000ff00))
13266 ZeroByteMask |= 0x0000ff00;
13267 if (!(C & 0x00ff0000))
13268 ZeroByteMask |= 0x00ff0000;
13269 if (!(C & 0xff000000))
13270 ZeroByteMask |= 0xff000000;
13271 uint32_t NonZeroByteMask = ~ZeroByteMask;
13272 if ((NonZeroByteMask & C) != NonZeroByteMask)
13285 assert(V.getValueSizeInBits() == 32);
13287 if (V.getNumOperands() != 2)
13296 switch (V.getOpcode()) {
13301 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13306 return (0x03020100 & ~ConstMask) | ConstMask;
13313 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13319 return uint32_t(0x0c0c0c0c03020100ull >> C);
13326 DAGCombinerInfo &DCI) const {
13327 if (DCI.isBeforeLegalize())
13330 SelectionDAG &DAG = DCI.DAG;
13331 EVT VT = N->getValueType(0);
13336 if (VT == MVT::i64 && CRHS) {
13338 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
13342 if (CRHS && VT == MVT::i32) {
13352 unsigned Shift = CShift->getZExtValue();
13354 unsigned Offset = NB + Shift;
13355 if ((Offset & (Bits - 1)) == 0) {
13358 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
13379 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13381 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13394 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
13399 if (X != LHS.getOperand(1))
13403 const ConstantFPSDNode *C1 =
13420 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
13426 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13429 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13437 (RHS.getOperand(0) == LHS.getOperand(0) &&
13438 LHS.getOperand(0) == LHS.getOperand(1))) {
13440 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13441 : Mask->getZExtValue() & OrdMask;
13444 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
13462 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13465 if (LHSMask != ~0u && RHSMask != ~0u) {
13468 if (LHSMask > RHSMask) {
13475 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13476 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13479 if (!(LHSUsedLanes & RHSUsedLanes) &&
13482 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13488 uint32_t Mask = LHSMask & RHSMask;
13489 for (unsigned I = 0; I < 32; I += 8) {
13490 uint32_t ByteSel = 0xff << I;
13491 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13492 Mask &= (0x0c << I) & 0xffffffff;
13497 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13500 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
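// calculateSrcByte()/calculateByteProvider() walk an i32 expression byte by
// byte (through shifts, masks, extends, loads and existing PERM nodes) and
// report which source byte, if any, produces each result byte; the combines
// below use this to rebuild the expression as a single v_perm_b32.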
13550static const std::optional<ByteProvider<SDValue>>
13552 unsigned Depth = 0) {
13555 return std::nullopt;
13557 if (Op.getValueSizeInBits() < 8)
13558 return std::nullopt;
13560 if (Op.getValueType().isVector())
13563 switch (Op->getOpcode()) {
13575 NarrowVT = VTSign->getVT();
13578 return std::nullopt;
13581 if (SrcIndex >= NarrowByteWidth)
13582 return std::nullopt;
13590 return std::nullopt;
13592 uint64_t BitShift = ShiftOp->getZExtValue();
13594 if (BitShift % 8 != 0)
13595 return std::nullopt;
13597 SrcIndex += BitShift / 8;
13615static const std::optional<ByteProvider<SDValue>>
13617 unsigned StartingIndex = 0) {
13621 return std::nullopt;
13623 unsigned BitWidth = Op.getScalarValueSizeInBits();
13625 return std::nullopt;
13627 return std::nullopt;
13629 bool IsVec = Op.getValueType().isVector();
13630 switch (Op.getOpcode()) {
13633 return std::nullopt;
13638 return std::nullopt;
13642 return std::nullopt;
13645 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13646 return std::nullopt;
13647 if (!LHS || LHS->isConstantZero())
13649 if (!RHS || RHS->isConstantZero())
13651 return std::nullopt;
13656 return std::nullopt;
13660 return std::nullopt;
13662 uint32_t BitMask = BitMaskOp->getZExtValue();
13664 uint32_t IndexMask = 0xFF << (Index * 8);
13666 if ((IndexMask & BitMask) != IndexMask) {
13669 if (IndexMask & BitMask)
13670 return std::nullopt;
13679 return std::nullopt;
13683 if (!ShiftOp || Op.getValueType().isVector())
13684 return std::nullopt;
13686 uint64_t BitsProvided = Op.getValueSizeInBits();
13687 if (BitsProvided % 8 != 0)
13688 return std::nullopt;
13690 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13692 return std::nullopt;
13694 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13695 uint64_t ByteShift = BitShift / 8;
13697 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13698 uint64_t BytesProvided = BitsProvided / 8;
13699 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13700 NewIndex %= BytesProvided;
13707 return std::nullopt;
13711 return std::nullopt;
13713 uint64_t BitShift = ShiftOp->getZExtValue();
13715 return std::nullopt;
13717 auto BitsProvided = Op.getScalarValueSizeInBits();
13718 if (BitsProvided % 8 != 0)
13719 return std::nullopt;
13721 uint64_t BytesProvided = BitsProvided / 8;
13722 uint64_t ByteShift = BitShift / 8;
13727 return BytesProvided - ByteShift > Index
13735 return std::nullopt;
13739 return std::nullopt;
13741 uint64_t BitShift = ShiftOp->getZExtValue();
13742 if (BitShift % 8 != 0)
13743 return std::nullopt;
13744 uint64_t ByteShift = BitShift / 8;
13750 return Index < ByteShift
13753 Depth + 1, StartingIndex);
13762 return std::nullopt;
13770 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13772 if (NarrowBitWidth % 8 != 0)
13773 return std::nullopt;
13774 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13776 if (Index >= NarrowByteWidth)
13778 ? std::optional<ByteProvider<SDValue>>(
13786 return std::nullopt;
13790 if (NarrowByteWidth >= Index) {
13795 return std::nullopt;
13802 return std::nullopt;
13808 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13809 if (NarrowBitWidth % 8 != 0)
13810 return std::nullopt;
13811 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13816 if (Index >= NarrowByteWidth) {
13818 ? std::optional<ByteProvider<SDValue>>(
13823 if (NarrowByteWidth > Index) {
13827 return std::nullopt;
13832 return std::nullopt;
13835 Depth + 1, StartingIndex);
13841 return std::nullopt;
13842 auto VecIdx = IdxOp->getZExtValue();
13843 auto ScalarSize = Op.getScalarValueSizeInBits();
13844 if (ScalarSize < 32)
13845 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13847 StartingIndex, Index);
13850 case AMDGPUISD::PERM: {
13852 return std::nullopt;
13856 return std::nullopt;
13859 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13860 if (IdxMask > 0x07 && IdxMask != 0x0c)
13861 return std::nullopt;
13863 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13864 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13866 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13872 return std::nullopt;
13887 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13894 auto MemVT = L->getMemoryVT();
13897 return L->getMemoryVT().getSizeInBits() == 16;
13907 int Low8 = Mask & 0xff;
13908 int Hi8 = (Mask & 0xff00) >> 8;
13910 assert(Low8 < 8 && Hi8 < 8);
13912 bool IsConsecutive = (Hi8 - Low8 == 1);
13917 bool Is16Aligned = !(Low8 % 2);
13919 return IsConsecutive && Is16Aligned;
13927 int Low16 = PermMask & 0xffff;
13928 int Hi16 = (PermMask & 0xffff0000) >> 16;
13938 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13940 if (!OtherOpIs16Bit)
13948 unsigned DWordOffset) {
13953 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13958 if (Src.getValueType().isVector()) {
13959 auto ScalarTySize = Src.getScalarValueSizeInBits();
13960 auto ScalarTy = Src.getValueType().getScalarType();
13961 if (ScalarTySize == 32) {
13965 if (ScalarTySize > 32) {
13968 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13969 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13976 assert(ScalarTySize < 32);
13977 auto NumElements = TypeSize / ScalarTySize;
13978 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13979 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13980 auto NumElementsIn32 = 32 / ScalarTySize;
13981 auto NumAvailElements = DWordOffset < Trunc32Elements
13983 : NumElements - NormalizedTrunc;
13996 auto ShiftVal = 32 * DWordOffset;
14004 [[maybe_unused]] EVT VT = N->getValueType(0);
14009 for (int i = 0; i < 4; i++) {
14011 std::optional<ByteProvider<SDValue>> P =
14014 if (!P || P->isConstantZero())
14019 if (PermNodes.size() != 4)
14022 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
14023 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14025 for (size_t i = 0; i < PermNodes.size(); i++) {
14026 auto PermOp = PermNodes[i];
14029 int SrcByteAdjust = 4;
14033 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
14034 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
14036 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
14037 ((PermOp.SrcOffset / 4) != SecondSrc->second))
14041 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
14042 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
14045 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
14047 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
14050 SDValue Op = *PermNodes[FirstSrc.first].Src;
14052 assert(Op.getValueSizeInBits() == 32);
14056 int Low16 = PermMask & 0xffff;
14057 int Hi16 = (PermMask & 0xffff0000) >> 16;
14059 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14060 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14063 if (WellFormedLow && WellFormedHi)
14067 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
14076 (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
14077 (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
14082 assert(Op.getValueType().isByteSized() &&
14093 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
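// performOrCombine: for i1 results two AMDGPUISD::FP_CLASS tests of the same
// operand are merged by OR-ing their class masks; for divergent i32 ORs the
// operands' byte sources are folded into one AMDGPUISD::PERM where possible,
// and an i64 OR with a known high half is rebuilt from 32-bit pieces.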
14100 DAGCombinerInfo &DCI) const {
14101 SelectionDAG &DAG = DCI.DAG;
14105 EVT VT = N->getValueType(0);
14106 if (VT == MVT::i1) {
14108 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14109 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14111 if (Src != RHS.getOperand(0))
14116 if (!CLHS || !CRHS)
14120 static const uint32_t MaxMask = 0x3ff;
14125 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
14134 LHS.getOpcode() == AMDGPUISD::PERM &&
14140 Sel |= LHS.getConstantOperandVal(2);
14142 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14149 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14153 auto usesCombinedOperand = [](SDNode *OrUse) {
14155 if (OrUse->getOpcode() != ISD::BITCAST ||
14156 !OrUse->getValueType(0).isVector())
14160 for (auto *VUser : OrUse->users()) {
14161 if (!VUser->getValueType(0).isVector())
14168 if (VUser->getOpcode() == VectorwiseOp)
14174 if (!any_of(N->users(), usesCombinedOperand))
14183 if (LHSMask > RHSMask) {
14190 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14191 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14194 if (!(LHSUsedLanes & RHSUsedLanes) &&
14197 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14199 LHSMask &= ~RHSUsedLanes;
14200 RHSMask &= ~LHSUsedLanes;
14202 LHSMask |= LHSUsedLanes & 0x04040404;
14204 uint32_t Sel = LHSMask | RHSMask;
14207 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14212 if (LHSMask == ~0u || RHSMask == ~0u) {
14253 return IdentitySrc;
14259 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14274 if (SrcVT == MVT::i32) {
14279 DCI.AddToWorklist(LowOr.getNode());
14280 DCI.AddToWorklist(HiBits.getNode());
14284 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14291 N->getOperand(0), CRHS))
14299 DAGCombinerInfo &DCI) const {
14300 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14307 SelectionDAG &DAG = DCI.DAG;
14309 EVT VT = N->getValueType(0);
14310 if (CRHS && VT == MVT::i64) {
14312 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14319 unsigned Opc = LHS.getOpcode();
14343 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14345 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14349 LHS->getOperand(0), FNegLHS, FNegRHS);
14350 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14358 DAGCombinerInfo &DCI) const {
14359 if (!Subtarget->has16BitInsts() ||
14363 EVT VT = N->getValueType(0);
14364 if (VT != MVT::i32)
14368 if (Src.getValueType() != MVT::i16)
14375 SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14376 DAGCombinerInfo &DCI) const {
14382 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14383 VTSign->getVT() == MVT::i8) ||
14384 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14385 VTSign->getVT() == MVT::i16))) {
14386 assert(Subtarget->hasScalarSubwordLoads() &&
14387 "s_buffer_load_{u8, i8} are supported "
14388 "in GFX12 (or newer) architectures.");
14389 EVT VT = Src.getValueType();
14390 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14391 ? AMDGPUISD::SBUFFER_LOAD_BYTE
14392 : AMDGPUISD::SBUFFER_LOAD_SHORT;
14394 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14401 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14402 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14406 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14407 VTSign->getVT() == MVT::i8) ||
14408 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14409 VTSign->getVT() == MVT::i16)) &&
14418 Src.getOperand(6), Src.getOperand(7)};
14421 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14422 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14423 ? AMDGPUISD::BUFFER_LOAD_BYTE
14424 : AMDGPUISD::BUFFER_LOAD_SHORT;
14425 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14426 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14427 return DCI.DAG.getMergeValues(
14428 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
14434 DAGCombinerInfo &DCI) const {
14435 SelectionDAG &DAG = DCI.DAG;
14442 if (N->getOperand(0).isUndef())
14449 DAGCombinerInfo &DCI) const {
14450 EVT VT = N->getValueType(0);
14460 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
14465 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14467 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
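// isCanonicalized() conservatively answers whether an operation already yields
// canonical (quieted, non-denormal) results so a following fcanonicalize can be
// dropped: constants are checked directly, most FP arithmetic and the
// AMDGPU-specific math nodes qualify, and min/max style nodes only qualify when
// the subtarget keeps denormals in those instructions.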
14475 unsigned MaxDepth) const {
14476 unsigned Opcode = Op.getOpcode();
14481 const auto &F = CFP->getValueAPF();
14482 if (F.isNaN() && F.isSignaling())
14484 if (!F.isDenormal())
14510 case ISD::FP_EXTEND:
14511 case ISD::FP16_TO_FP:
14512 case ISD::FP_TO_FP16:
14513 case ISD::BF16_TO_FP:
14514 case ISD::FP_TO_BF16:
14516 case AMDGPUISD::FMUL_LEGACY:
14517 case AMDGPUISD::FMAD_FTZ:
14518 case AMDGPUISD::RCP:
14519 case AMDGPUISD::RSQ:
14520 case AMDGPUISD::RSQ_CLAMP:
14521 case AMDGPUISD::RCP_LEGACY:
14522 case AMDGPUISD::RCP_IFLAG:
14523 case AMDGPUISD::LOG:
14524 case AMDGPUISD::EXP:
14525 case AMDGPUISD::DIV_SCALE:
14526 case AMDGPUISD::DIV_FMAS:
14527 case AMDGPUISD::DIV_FIXUP:
14528 case AMDGPUISD::FRACT:
14529 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14530 case AMDGPUISD::CVT_F32_UBYTE0:
14531 case AMDGPUISD::CVT_F32_UBYTE1:
14532 case AMDGPUISD::CVT_F32_UBYTE2:
14533 case AMDGPUISD::CVT_F32_UBYTE3:
14534 case AMDGPUISD::FP_TO_FP16:
14535 case AMDGPUISD::SIN_HW:
14536 case AMDGPUISD::COS_HW:
14547 if (Op.getValueType() == MVT::i32) {
14553 if (RHS->getZExtValue() == 0xffff0000) {
14563 return Op.getValueType().getScalarType() != MVT::f16;
14567 case ISD::FMINNUM_IEEE:
14568 case ISD::FMAXNUM_IEEE:
14569 case ISD::FMINIMUM:
14570 case ISD::FMAXIMUM:
14571 case ISD::FMINIMUMNUM:
14572 case ISD::FMAXIMUMNUM:
14573 case AMDGPUISD::CLAMP:
14574 case AMDGPUISD::FMED3:
14575 case AMDGPUISD::FMAX3:
14576 case AMDGPUISD::FMIN3:
14577 case AMDGPUISD::FMAXIMUM3:
14578 case AMDGPUISD::FMINIMUM3: {
14584 if (Subtarget->supportsMinMaxDenormModes() ||
14594 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14606 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14633 if (Op.getValueType() == MVT::i16) {
14636 TruncSrc.getOpcode() == ISD::BITCAST &&
14644 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14646 switch (IntrinsicID) {
14647 case Intrinsic::amdgcn_cvt_pkrtz:
14648 case Intrinsic::amdgcn_cubeid:
14649 case Intrinsic::amdgcn_frexp_mant:
14650 case Intrinsic::amdgcn_fdot2:
14651 case Intrinsic::amdgcn_rcp:
14652 case Intrinsic::amdgcn_rsq:
14653 case Intrinsic::amdgcn_rsq_clamp:
14654 case Intrinsic::amdgcn_rcp_legacy:
14655 case Intrinsic::amdgcn_rsq_legacy:
14656 case Intrinsic::amdgcn_trig_preop:
14657 case Intrinsic::amdgcn_tanh:
14658 case Intrinsic::amdgcn_log:
14659 case Intrinsic::amdgcn_exp2:
14660 case Intrinsic::amdgcn_sqrt:
14678 unsigned MaxDepth) const {
14681 unsigned Opcode = MI->getOpcode();
14683 if (Opcode == AMDGPU::G_FCANONICALIZE)
14686 std::optional<FPValueAndVReg> FCR;
14689 if (FCR->Value.isSignaling())
14691 if (!FCR->Value.isDenormal())
14702 case AMDGPU::G_FADD:
14703 case AMDGPU::G_FSUB:
14704 case AMDGPU::G_FMUL:
14705 case AMDGPU::G_FCEIL:
14706 case AMDGPU::G_FFLOOR:
14707 case AMDGPU::G_FRINT:
14708 case AMDGPU::G_FNEARBYINT:
14709 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14710 case AMDGPU::G_INTRINSIC_TRUNC:
14711 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14712 case AMDGPU::G_FMA:
14713 case AMDGPU::G_FMAD:
14714 case AMDGPU::G_FSQRT:
14715 case AMDGPU::G_FDIV:
14716 case AMDGPU::G_FREM:
14717 case AMDGPU::G_FPOW:
14718 case AMDGPU::G_FPEXT:
14719 case AMDGPU::G_FLOG:
14720 case AMDGPU::G_FLOG2:
14721 case AMDGPU::G_FLOG10:
14722 case AMDGPU::G_FPTRUNC:
14723 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14724 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14725 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14726 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14727 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14729 case AMDGPU::G_FNEG:
14730 case AMDGPU::G_FABS:
14731 case AMDGPU::G_FCOPYSIGN:
14733 case AMDGPU::G_FMINNUM:
14734 case AMDGPU::G_FMAXNUM:
14735 case AMDGPU::G_FMINNUM_IEEE:
14736 case AMDGPU::G_FMAXNUM_IEEE:
14737 case AMDGPU::G_FMINIMUM:
14738 case AMDGPU::G_FMAXIMUM:
14739 case AMDGPU::G_FMINIMUMNUM:
14740 case AMDGPU::G_FMAXIMUMNUM: {
14741 if (Subtarget->supportsMinMaxDenormModes() ||
14748 case AMDGPU::G_BUILD_VECTOR:
14753 case AMDGPU::G_INTRINSIC:
14754 case AMDGPU::G_INTRINSIC_CONVERGENT:
14756 case Intrinsic::amdgcn_fmul_legacy:
14757 case Intrinsic::amdgcn_fmad_ftz:
14758 case Intrinsic::amdgcn_sqrt:
14759 case Intrinsic::amdgcn_fmed3:
14760 case Intrinsic::amdgcn_sin:
14761 case Intrinsic::amdgcn_cos:
14762 case Intrinsic::amdgcn_log:
14763 case Intrinsic::amdgcn_exp2:
14764 case Intrinsic::amdgcn_log_clamp:
14765 case Intrinsic::amdgcn_rcp:
14766 case Intrinsic::amdgcn_rcp_legacy:
14767 case Intrinsic::amdgcn_rsq:
14768 case Intrinsic::amdgcn_rsq_clamp:
14769 case Intrinsic::amdgcn_rsq_legacy:
14770 case Intrinsic::amdgcn_div_scale:
14771 case Intrinsic::amdgcn_div_fmas:
14772 case Intrinsic::amdgcn_div_fixup:
14773 case Intrinsic::amdgcn_fract:
14774 case Intrinsic::amdgcn_cvt_pkrtz:
14775 case Intrinsic::amdgcn_cubeid:
14776 case Intrinsic::amdgcn_cubema:
14777 case Intrinsic::amdgcn_cubesc:
14778 case Intrinsic::amdgcn_cubetc:
14779 case Intrinsic::amdgcn_frexp_mant:
14780 case Intrinsic::amdgcn_fdot2:
14781 case Intrinsic::amdgcn_trig_preop:
14782 case Intrinsic::amdgcn_tanh:
14801 if (C.isDenormal()) {
14815 if (C.isSignaling()) {
14838 SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14839 DAGCombinerInfo &DCI) const {
14840 SelectionDAG &DAG = DCI.DAG;
14842 EVT VT = N->getValueType(0);
14851 EVT VT = N->getValueType(0);
14852 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14868 EVT EltVT = Lo.getValueType();
14871 for (unsigned I = 0; I != 2; ++I) {
14875 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14876 } else if (Op.isUndef()) {
14910 case ISD::FMAXNUM_IEEE:
14911 case ISD::FMAXIMUMNUM:
14912 return AMDGPUISD::FMAX3;
14913 case ISD::FMAXIMUM:
14914 return AMDGPUISD::FMAXIMUM3;
14916 return AMDGPUISD::SMAX3;
14918 return AMDGPUISD::UMAX3;
14920 case ISD::FMINNUM_IEEE:
14921 case ISD::FMINIMUMNUM:
14922 return AMDGPUISD::FMIN3;
14923 case ISD::FMINIMUM:
14924 return AMDGPUISD::FMINIMUM3;
14926 return AMDGPUISD::SMIN3;
14928 return AMDGPUISD::UMIN3;
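// performIntMed3ImmCombine()/performFPMed3ImmCombine() turn min(max(x, K0), K1)
// with constant bounds into a single SMED3/UMED3 or FMED3 node when the type is
// supported (i32/f32, or 16-bit with hasMed3_16()); the FP form also consults
// the DX10Clamp mode.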
14949 if (!MinK || !MaxK)
14961 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
14962 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14963 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
15022 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15028 if (Info->getMode().DX10Clamp) {
15037 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
15065 case ISD::FMINNUM_IEEE:
15066 case ISD::FMAXNUM_IEEE:
15067 case ISD::FMINIMUMNUM:
15068 case ISD::FMAXIMUMNUM:
15069 case AMDGPUISD::FMIN_LEGACY:
15070 case AMDGPUISD::FMAX_LEGACY:
15071 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
15073 case ISD::FMINIMUM:
15074 case ISD::FMAXIMUM:
15082 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
15091 DAGCombinerInfo &DCI) const {
15092 SelectionDAG &DAG = DCI.DAG;
15124 if (SDValue Med3 = performIntMed3ImmCombine(
15129 if (SDValue Med3 = performIntMed3ImmCombine(
15135 if (SDValue Med3 = performIntMed3ImmCombine(
15140 if (SDValue Med3 = performIntMed3ImmCombine(
15150 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
15151 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
15152 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
15153 (Opc == AMDGPUISD::FMIN_LEGACY &&
15154 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
15155 (VT == MVT::f32 || VT == MVT::f64 ||
15156 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
15157 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
15158 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
15159 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
15161 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
15168 const SDNodeFlags Flags = N->getFlags();
15169 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
15170 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
15172 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
15173 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
15183 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15184 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15193 DAGCombinerInfo &DCI) const {
15194 EVT VT = N->getValueType(0);
15198 SelectionDAG &DAG = DCI.DAG;
15209 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15213 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15217 if (Info->getMode().DX10Clamp) {
15230 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15237 DAGCombinerInfo &DCI) const {
15241 return DCI.DAG.getUNDEF(N->getValueType(0));
15249 bool IsDivergentIdx,
15254 unsigned VecSize = EltSize * NumElem;
15257 if (VecSize <= 64 && EltSize < 32)
15266 if (IsDivergentIdx)
15270 unsigned NumInsts = NumElem +
15271 ((EltSize + 31) / 32) * NumElem ;
15275 if (Subtarget->useVGPRIndexMode())
15276 return NumInsts <= 16;
15280 if (Subtarget->hasMovrel())
15281 return NumInsts <= 15;
15287 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15302 SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15303 DAGCombinerInfo &DCI) const {
15309 EVT ResVT = N->getValueType(0);
15333 if (!C || C->getZExtValue() != 0x1f)
15349 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15368 case ISD::FMAXNUM_IEEE:
15369 case ISD::FMINNUM_IEEE:
15370 case ISD::FMAXIMUM:
15371 case ISD::FMINIMUM: {
15377 DCI.AddToWorklist(Elt0.getNode());
15378 DCI.AddToWorklist(Elt1.getNode());
15400 if (!DCI.isBeforeLegalize())
15408 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15411 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15412 unsigned EltIdx = BitIndex / 32;
15413 unsigned LeftoverBitIdx = BitIndex % 32;
15417 DCI.AddToWorklist(Cast.getNode());
15421 DCI.AddToWorklist(Elt.getNode());
15424 DCI.AddToWorklist(Srl.getNode());
15428 DCI.AddToWorklist(Trunc.getNode());
15430 if (VecEltVT == ResVT) {
15431 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15442 SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15443 DAGCombinerInfo &DCI) const {
15454 SelectionDAG &DAG = DCI.DAG;
15473 if (Src.getOpcode() == ISD::FP_EXTEND &&
15474 Src.getOperand(0).getValueType() == MVT::f16) {
15475 return Src.getOperand(0);
15479 APFloat Val = CFP->getValueAPF();
15480 bool LosesInfo = true;
15490 DAGCombinerInfo &DCI) const {
15491 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15492 "combine only useful on gfx8");
15494 SDValue TruncSrc = N->getOperand(0);
15495 EVT VT = N->getValueType(0);
15496 if (VT != MVT::f16)
15499 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15503 SelectionDAG &DAG = DCI.DAG;
15531 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15534 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15536 const SDNode *N1) const {
15541 if (((VT == MVT::f32 &&
15543 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15563 EVT VT = N->getValueType(0);
15564 if (VT != MVT::i32 && VT != MVT::i64)
15570 unsigned Opc = N->getOpcode();
15625 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15644                                              DAGCombinerInfo &DCI) const {
15647   SelectionDAG &DAG = DCI.DAG;
15648   EVT VT = N->getValueType(0);
15658   if (!N->isDivergent() && Subtarget->hasSMulHi())
15662   if (NumBits <= 32 || NumBits > 64)
15673   if (!Subtarget->hasFullRate64Ops()) {
15674     unsigned NumUsers = 0;
15675     for (SDNode *User : LHS->users()) {
15678       if (!User->isAnyAdd())
15702   bool MulSignedLo = false;
15703   if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15712   if (VT != MVT::i64) {
15735       getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15737   if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15738     auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15740     if (!MulLHSUnsigned32) {
15747     if (!MulRHSUnsigned32) {
15758   if (VT != MVT::i64)
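// [Illustrative sketch, not part of this file] getMad64_32 builds the MAD_U64_U32 /
// MAD_I64_I32 pattern: a 32 x 32 -> 64-bit multiply accumulated into a 64-bit
// addend. Scalar reference semantics of the unsigned form, under that assumption:
static inline unsigned long long mad64_32_u(unsigned a, unsigned b,
                                            unsigned long long c) {
  return (unsigned long long)a * b + c;   // full 64-bit product plus accumulator
}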
15764 SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15765                                                    DAGCombinerInfo &DCI) const {
15775   SelectionDAG &DAG = DCI.DAG;
15790     unsigned Opcode = N->getOpcode();
15791     if (Opcode == ISD::PTRADD)
15794         DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15805 static std::optional<ByteProvider<SDValue>>
15808   if (!Byte0 || Byte0->isConstantZero()) {
15809     return std::nullopt;
15812   if (Byte1 && !Byte1->isConstantZero()) {
15813     return std::nullopt;
15819   unsigned FirstCs = First & 0x0c0c0c0c;
15820   unsigned SecondCs = Second & 0x0c0c0c0c;
15821   unsigned FirstNoCs = First & ~0x0c0c0c0c;
15822   unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15824   assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15825   assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15826   assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15827   assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15829   return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
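// [Illustrative sketch, not part of this file] addPermMasks merges two V_PERM-style
// byte-select masks in which the value 0x0c marks a byte the combine treats as
// constant zero: real selects from either side are OR'ed together, and a lane stays
// 0x0c only when both inputs had it. Standalone version of the same merge:
static inline unsigned mergePermMasks(unsigned First, unsigned Second) {
  unsigned FirstCs = First & 0x0c0c0c0c, SecondCs = Second & 0x0c0c0c0c;
  unsigned FirstNoCs = First & ~0x0c0c0c0c, SecondNoCs = Second & ~0x0c0c0c0c;
  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
}
// e.g. mergePermMasks(0x0c0c0100, 0x07060c0c) == 0x07060100.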
15853   for (int BPI = 0; BPI < 2; BPI++) {
15856       BPP = {Src1, Src0};
15858     unsigned ZeroMask = 0x0c0c0c0c;
15859     unsigned FMask = 0xFF << (8 * (3 - Step));
15861     unsigned FirstMask =
15862         (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15863     unsigned SecondMask =
15864         (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15868     int FirstGroup = -1;
15869     for (int I = 0; I < 2; I++) {
15871       auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15872         return IterElt.SrcOp == *BPP.first.Src &&
15873                (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15877       if (Match != Srcs.end()) {
15878         Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15883     if (FirstGroup != -1) {
15885       auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15886         return IterElt.SrcOp == *BPP.second.Src &&
15887                (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15890       if (Match != Srcs.end()) {
15891         Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15893         Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15901   unsigned ZeroMask = 0x0c0c0c0c;
15902   unsigned FMask = 0xFF << (8 * (3 - Step));
15906       ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15910       ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15919   if (Srcs.size() == 1) {
15920     auto *Elt = Srcs.begin();
15924     if (Elt->PermMask == 0x3020100)
15927     return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15931   auto *FirstElt = Srcs.begin();
15932   auto *SecondElt = std::next(FirstElt);
15939     auto FirstMask = FirstElt->PermMask;
15940     auto SecondMask = SecondElt->PermMask;
15942     unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15943     unsigned FirstPlusFour = FirstMask | 0x04040404;
15946     FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15958     FirstElt = std::next(SecondElt);
15959     if (FirstElt == Srcs.end())
15962     SecondElt = std::next(FirstElt);
15965     if (SecondElt == Srcs.end()) {
15970           DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15971                       DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15977   return Perms.size() == 2
15983   for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15984     EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15985     auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15986     EntryMask += ZeroMask;
15991   auto Opcode = Op.getOpcode();
15993   return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
15994           Opcode == AMDGPUISD::MUL_I24);
15997 static std::optional<bool>
16008   bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
16011   bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
16013   assert(!(S0IsUnsigned && S0IsSigned));
16014   assert(!(S1IsUnsigned && S1IsSigned));
16022   if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
16028   if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
16029     return std::nullopt;
16041   if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
16042       ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
16047   if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
16053   if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
16054       ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
16055     return std::nullopt;
16061                                            DAGCombinerInfo &DCI) const {
16062   SelectionDAG &DAG = DCI.DAG;
16063   EVT VT = N->getValueType(0);
16069   if (Subtarget->hasMad64_32()) {
16070     if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16075   if (SDValue V = reassociateScalarOps(N, DAG)) {
16079   if (VT == MVT::i64) {
16080     if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16085       (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
16087     std::optional<bool> IsSigned;
16093     int ChainLength = 0;
16094     for (int I = 0; I < 4; I++) {
16098       auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
16101       auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
16106           TempNode->getOperand(MulIdx), *Src0, *Src1,
16107           TempNode->getOperand(MulIdx)->getOperand(0),
16108           TempNode->getOperand(MulIdx)->getOperand(1), DAG);
16112         IsSigned = *IterIsSigned;
16113       if (*IterIsSigned != *IsSigned)
16116       auto AddIdx = 1 - MulIdx;
16119       if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
16120         Src2s.push_back(TempNode->getOperand(AddIdx));
16130             TempNode->getOperand(AddIdx), *Src0, *Src1,
16131             TempNode->getOperand(AddIdx)->getOperand(0),
16132             TempNode->getOperand(AddIdx)->getOperand(1), DAG);
16136         if (*IterIsSigned != *IsSigned)
16140         ChainLength = I + 2;
16144       TempNode = TempNode->getOperand(AddIdx);
16146       ChainLength = I + 1;
16147       if (TempNode->getNumOperands() < 2)
16149       LHS = TempNode->getOperand(0);
16150       RHS = TempNode->getOperand(1);
16153     if (ChainLength < 2)
16159     if (ChainLength < 4) {
16169     bool UseOriginalSrc = false;
16170     if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
16171         Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
16172         Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
16173         Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
16174       SmallVector<unsigned, 4> SrcBytes;
16175       auto Src0Mask = Src0s.begin()->PermMask;
16176       SrcBytes.push_back(Src0Mask & 0xFF000000);
16177       bool UniqueEntries = true;
16178       for (auto I = 1; I < 4; I++) {
16179         auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
16182           UniqueEntries = false;
16188       if (UniqueEntries) {
16189         UseOriginalSrc = true;
16191         auto *FirstElt = Src0s.begin();
16195         auto *SecondElt = Src1s.begin();
16197                                         SecondElt->DWordOffset);
16206     if (!UseOriginalSrc) {
16213         DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16216                                    : Intrinsic::amdgcn_udot4,
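// [Illustrative sketch, not part of this file] The add chain matched above is
// lowered to amdgcn_sdot4/udot4, i.e. a packed dot product of four (un)signed
// bytes plus an accumulator. Scalar reference for the unsigned flavor, under that
// assumption:
static inline unsigned udot4_ref(unsigned a, unsigned b, unsigned acc) {
  unsigned sum = acc;
  for (int i = 0; i < 4; ++i)
    sum += ((a >> (8 * i)) & 0xFF) * ((b >> (8 * i)) & 0xFF);
  return sum;
}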
16226   if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16231   unsigned Opc = LHS.getOpcode();
16243     auto Cond = RHS.getOperand(0);
16248     SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16265                                               DAGCombinerInfo &DCI) const {
16266   SelectionDAG &DAG = DCI.DAG;
16268   EVT VT = N->getValueType(0);
16281       SDNodeFlags ShlFlags = N1->getFlags();
16285       SDNodeFlags NewShlFlags =
16290       DCI.AddToWorklist(Inner.getNode());
16297   if (Subtarget->hasMad64_32()) {
16298     if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16307   if (VT == MVT::i64) {
16308     if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16321     if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
16322         Y->isDivergent() != Z->isDivergent()) {
16331       if (Y->isDivergent())
16334       SDNodeFlags ReassocFlags =
16337       DCI.AddToWorklist(UniformInner.getNode());
16345                                            DAGCombinerInfo &DCI) const {
16346   SelectionDAG &DAG = DCI.DAG;
16347   EVT VT = N->getValueType(0);
16349   if (VT == MVT::i64) {
16350     if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16354   if (VT != MVT::i32)
16363   unsigned Opc = RHS.getOpcode();
16370     auto Cond = RHS.getOperand(0);
16375     SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16393 SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16394                                                   DAGCombinerInfo &DCI) const {
16396   if (N->getValueType(0) != MVT::i32)
16402   SelectionDAG &DAG = DCI.DAG;
16407   unsigned LHSOpc = LHS.getOpcode();
16408   unsigned Opc = N->getOpcode();
16412     return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16418                                             DAGCombinerInfo &DCI) const {
16422   SelectionDAG &DAG = DCI.DAG;
16423   EVT VT = N->getValueType(0);
16435     if (A == LHS.getOperand(1)) {
16436       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16437       if (FusedOp != 0) {
16439         return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16447     if (A == RHS.getOperand(1)) {
16448       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16449       if (FusedOp != 0) {
16451         return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16460                                             DAGCombinerInfo &DCI) const {
16464   SelectionDAG &DAG = DCI.DAG;
16466   EVT VT = N->getValueType(0);
16479     if (A == LHS.getOperand(1)) {
16480       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16481       if (FusedOp != 0) {
16485         return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16494     if (A == RHS.getOperand(1)) {
16495       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16496       if (FusedOp != 0) {
16498         return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
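// [Illustrative sketch, not part of this file] The fadd/fsub combines above rewrite
// an operand of the form (x + x) into a fused multiply by 2.0, e.g.
// (x + x) + y -> fma(x, 2.0, y), whenever getFusedOpcode reports that an FMA/FMAD
// form is available and profitable. The underlying identity, for reference:
#include <cmath>
static inline double twiceXPlusY_ref(double x, double y) {
  return std::fma(x, 2.0, y);   // equals (x + x) + y under ordinary contraction rules
}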
16507                                             DAGCombinerInfo &DCI) const {
16508   SelectionDAG &DAG = DCI.DAG;
16510   EVT VT = N->getValueType(0);
16513   if ((VT != MVT::f16 && VT != MVT::bf16) || !isOperationLegal(ISD::FSQRT, VT))
16519   SDNodeFlags Flags = N->getFlags();
16520   SDNodeFlags RHSFlags = RHS->getFlags();
16526     bool IsNegative = false;
16527     if (CLHS->isExactlyValue(1.0) ||
16528         (IsNegative = CLHS->isExactlyValue(-1.0))) {
16531       if (RHS.getOpcode() == ISD::FSQRT) {
16534             DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16535         return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
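// Note (not part of this file): the branch above folds a reciprocal of a square
// root into the dedicated reciprocal-square-root node, i.e.
// fdiv 1.0, sqrt(x) -> rsq(x) and fdiv -1.0, sqrt(x) -> fneg(rsq(x)),
// carrying the original fast-math flags through to the new node.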
16544                                             DAGCombinerInfo &DCI) const {
16545   SelectionDAG &DAG = DCI.DAG;
16546   EVT VT = N->getValueType(0);
16550   if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16551       (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16566   if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16571     const ConstantFPSDNode *FalseNode =
16581     if (ScalarVT == MVT::f32 &&
16587     if (TrueNodeExpVal == INT_MIN)
16590     if (FalseNodeExpVal == INT_MIN)
16603     return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
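// [Illustrative sketch, not part of this file] This combine appears to rewrite
// fmul x, (select c, K0, K1) with power-of-two constants into
// ldexp(x, select c, log2|K0|, log2|K1|); getExactLog2Abs supplies the exponents
// and INT_MIN above signals "not an exact power of two", so the combine bails.
// The underlying identity, for reference:
#include <cmath>
static inline float mulByPow2_ref(float x, int exp) {
  return std::ldexp(x, exp);   // == x * 2^exp
}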
16610                                            DAGCombinerInfo &DCI) const {
16611   SelectionDAG &DAG = DCI.DAG;
16612   EVT VT = N->getValueType(0);
16615   if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16633       (N->getFlags().hasAllowContract() &&
16634        FMA->getFlags().hasAllowContract())) {
16649     if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16668     if (Vec1 == Vec2 || Vec3 == Vec4)
16674     if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16675       return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16683                                              DAGCombinerInfo &DCI) const {
16684   SelectionDAG &DAG = DCI.DAG;
16689   EVT VT = LHS.getValueType();
16718       return LHS.getOperand(0);
16726       LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16733     const APInt &CT = LHS.getConstantOperandAPInt(1);
16734     const APInt &CF = LHS.getConstantOperandAPInt(2);
16742       return LHS.getOperand(0);
16774         DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
16779                                 {Op0Hi, Op1Hi, CarryInHi});
16789     DCI.CombineTo(LHS.getNode(), Result);
16793   if (VT != MVT::f32 && VT != MVT::f64 &&
16794       (!Subtarget->has16BitInsts() || VT != MVT::f16))
16802       LHS.getOpcode() == ISD::FABS) {
16809     const unsigned IsInfMask =
16811     const unsigned IsFiniteMask =
16816     return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16825 SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16826                                               DAGCombinerInfo &DCI) const {
16827   SelectionDAG &DAG = DCI.DAG;
16829   unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16848     unsigned ShiftOffset = 8 * Offset;
16850       ShiftOffset -= C->getZExtValue();
16852       ShiftOffset += C->getZExtValue();
16854     if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16855       return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16856                          MVT::f32, Shifted);
16867     DCI.AddToWorklist(N);
16874     return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
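// [Illustrative sketch, not part of this file] cvt_f32_ubyteN converts byte N of a
// 32-bit source, so a constant logical shift of the source can be folded into the
// byte index: the adjusted bit position must still land on a whole byte inside the
// dword, exactly as checked above. The index arithmetic in isolation:
static inline int adjustCvtByteIndex(int ByteIdx, int ShiftBitsSigned) {
  int ShiftOffset = 8 * ByteIdx + ShiftBitsSigned;
  if (ShiftOffset >= 0 && ShiftOffset < 32 && ShiftOffset % 8 == 0)
    return ShiftOffset / 8;   // new byte index after folding the shift away
  return -1;                  // fold does not apply
}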
16880                                            DAGCombinerInfo &DCI) const {
16885   const MachineFunction &MF = DCI.DAG.getMachineFunction();
16889       (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16890     return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16893     APFloat One(F.getSemantics(), "1.0");
16895       return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16901                                               DAGCombinerInfo &DCI) const {
16922   bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16923   bool isInteger = LHS.getValueType().isInteger();
16926   if (!isFloatingPoint && !isInteger)
16931   if (!isEquality && !isNonEquality)
16948   if (isFloatingPoint) {
16950     if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16961   if (!(isEquality && TrueVal == ConstVal) &&
16962       !(isNonEquality && FalseVal == ConstVal))
16969                      SelectLHS, SelectRHS);
16974   switch (N->getOpcode()) {
16990     if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
17000   switch (N->getOpcode()) {
17002     return performAddCombine(N, DCI);
17004     return performPtrAddCombine(N, DCI);
17006     return performSubCombine(N, DCI);
17009     return performAddCarrySubCarryCombine(N, DCI);
17011     return performFAddCombine(N, DCI);
17013     return performFSubCombine(N, DCI);
17015     return performFDivCombine(N, DCI);
17017     return performFMulCombine(N, DCI);
17019     return performSetCCCombine(N, DCI);
17021     if (auto Res = performSelectCombine(N, DCI))
17026   case ISD::FMAXNUM_IEEE:
17027   case ISD::FMINNUM_IEEE:
17028   case ISD::FMAXIMUM:
17029   case ISD::FMINIMUM:
17030   case ISD::FMAXIMUMNUM:
17031   case ISD::FMINIMUMNUM:
17036   case AMDGPUISD::FMIN_LEGACY:
17037   case AMDGPUISD::FMAX_LEGACY:
17038     return performMinMaxCombine(N, DCI);
17040     return performFMACombine(N, DCI);
17042     return performAndCombine(N, DCI);
17044     return performOrCombine(N, DCI);
17047     if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
17048         TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
17054     return performXorCombine(N, DCI);
17056     return performZeroExtendCombine(N, DCI);
17058     return performSignExtendInRegCombine(N, DCI);
17059   case AMDGPUISD::FP_CLASS:
17060     return performClassCombine(N, DCI);
17062     return performFCanonicalizeCombine(N, DCI);
17063   case AMDGPUISD::RCP:
17064     return performRcpCombine(N, DCI);
17066   case AMDGPUISD::FRACT:
17067   case AMDGPUISD::RSQ:
17068   case AMDGPUISD::RCP_LEGACY:
17069   case AMDGPUISD::RCP_IFLAG:
17070   case AMDGPUISD::RSQ_CLAMP: {
17079     return performUCharToFloatCombine(N, DCI);
17081     return performFCopySignCombine(N, DCI);
17082   case AMDGPUISD::CVT_F32_UBYTE0:
17083   case AMDGPUISD::CVT_F32_UBYTE1:
17084   case AMDGPUISD::CVT_F32_UBYTE2:
17085   case AMDGPUISD::CVT_F32_UBYTE3:
17086     return performCvtF32UByteNCombine(N, DCI);
17087   case AMDGPUISD::FMED3:
17088     return performFMed3Combine(N, DCI);
17089   case AMDGPUISD::CVT_PKRTZ_F16_F32:
17090     return performCvtPkRTZCombine(N, DCI);
17091   case AMDGPUISD::CLAMP:
17092     return performClampCombine(N, DCI);
17095     EVT VT = N->getValueType(0);
17098     if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
17101       EVT EltVT = Src.getValueType();
17102       if (EltVT != MVT::i16)
17103         Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
17106       return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
17112     return performExtractVectorEltCombine(N, DCI);
17114     return performInsertVectorEltCombine(N, DCI);
17116     return performFPRoundCombine(N, DCI);
17125     return performMemSDNodeCombine(MemNode, DCI);
17156   unsigned Opcode = Node->getMachineOpcode();
17159   int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
17160   if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
17163   SDNode *Users[5] = {nullptr};
17165   unsigned DmaskIdx =
17166       AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
17167   unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
17168   unsigned NewDmask = 0;
17169   unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
17170   unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
17171   bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
17172                  (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
17173   unsigned TFCLane = 0;
17174   bool HasChain = Node->getNumValues() > 1;
17176   if (OldDmask == 0) {
17184     TFCLane = OldBitsSet;
17188   for (SDUse &Use : Node->uses()) {
17191     if (Use.getResNo() != 0)
17194     SDNode *User = Use.getUser();
17197     if (!User->isMachineOpcode() ||
17198         User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17210     if (UsesTFC && Lane == TFCLane) {
17215       for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17217         Dmask &= ~(1 << Comp);
17225       NewDmask |= 1 << Comp;
17230   bool NoChannels = !NewDmask;
17237     if (OldBitsSet == 1)
17243   if (NewDmask == OldDmask)
17252   unsigned NewChannels = BitsSet + UsesTFC;
17256   assert(NewOpcode != -1 &&
17257          NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17258          "failed to find equivalent MIMG op");
17266   MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17268   MVT ResultVT = NewChannels == 1
17271                      : NewChannels == 5 ? 8
17273   SDVTList NewVTList =
17276   MachineSDNode *NewNode =
17285   if (NewChannels == 1) {
17295   for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17300     if (i || !NoChannels)
17305     if (NewUser != User) {
17315       Idx = AMDGPU::sub1;
17318       Idx = AMDGPU::sub2;
17321       Idx = AMDGPU::sub3;
17324       Idx = AMDGPU::sub4;
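// [Illustrative sketch, not part of this file] adjustWritemask shrinks an image
// load's dmask to the channels that are actually extracted: the lane number of each
// EXTRACT_SUBREG user is mapped back to a dmask bit (assumed here to be the Lane-th
// set bit, counting from the LSB, matching the scan loop above), and only those bits
// survive in NewDmask. Standalone form of that mapping:
static unsigned laneToDmaskBit(unsigned OldDmask, unsigned Lane) {
  unsigned Dmask = OldDmask, Comp = 0;
  for (unsigned i = 0; i <= Lane && Dmask != 0; ++i) {
    Comp = __builtin_ctz(Dmask);   // component backing result lane i
    Dmask &= ~(1u << Comp);
  }
  return 1u << Comp;               // bit to OR into the new dmask
}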
17335     Op = Op.getOperand(0);
17356       MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17360                                      Node->getOperand(0), SL, VReg, SrcVal,
17366   return ToResultReg.getNode();
17371   for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17373       Ops.push_back(Node->getOperand(i));
17379                                Node->getOperand(i).getValueType(),
17380                                Node->getOperand(i)),
17392   unsigned Opcode = Node->getMachineOpcode();
17394   if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17395       !TII->isGather4(Opcode) &&
17397     return adjustWritemask(Node, DAG);
17400   if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17406   case AMDGPU::V_DIV_SCALE_F32_e64:
17407   case AMDGPU::V_DIV_SCALE_F64_e64: {
17417       (Src0 == Src1 || Src0 == Src2))
17473       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17474   unsigned InitIdx = 0;
17476   if (TII->isImage(MI)) {
17484     unsigned TFEVal = TFE ? TFE->getImm() : 0;
17485     unsigned LWEVal = LWE ? LWE->getImm() : 0;
17486     unsigned D16Val = D16 ? D16->getImm() : 0;
17488     if (!TFEVal && !LWEVal)
17499     assert(MO_Dmask && "Expected dmask operand in instruction");
17501     unsigned dmask = MO_Dmask->getImm();
17506     bool Packed = !Subtarget->hasUnpackedD16VMem();
17508     InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17515     uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
17516     if (DstSize < InitIdx)
17520     InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
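// [Illustrative sketch, not part of this file] With TFE/LWE set, the image load
// returns an extra status dword, so the number of result dwords that may need
// pre-initialization is the count of active dmask lanes plus one, with packed D16
// halving the data lanes first, exactly as computed at line 17508 above:
static inline unsigned tfeInitDwords(unsigned ActiveLanes, bool D16, bool Packed) {
  return (D16 && Packed) ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
}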
17528   Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17529   unsigned NewDst = 0;
17534   unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17535   unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17538   for (; SizeLeft; SizeLeft--, CurrIdx++) {
17539     NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17559   MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17571   if (TII->isVOP3(MI.getOpcode())) {
17573     TII->legalizeOperandsVOP3(MRI, MI);
17575   if (TII->isMAI(MI)) {
17580     int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17581                                              AMDGPU::OpName::scale_src0);
17582     if (Src0Idx != -1) {
17583       int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17584                                                AMDGPU::OpName::scale_src1);
17585       if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17586           TII->usesConstantBus(MRI, MI, Src1Idx))
17587         TII->legalizeOpWithMove(MI, Src1Idx);
17594   if (TII->isImage(MI))
17595     TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17669 std::pair<unsigned, const TargetRegisterClass *>
17676   if (Constraint.size() == 1) {
17680     if (VT == MVT::Other)
17683     switch (Constraint[0]) {
17690         RC = &AMDGPU::SReg_32RegClass;
17693         RC = &AMDGPU::SGPR_64RegClass;
17698           return std::pair(0U, nullptr);
17705         return std::pair(0U, nullptr);
17707         RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17708                                              : &AMDGPU::VGPR_32_Lo256RegClass;
17711         RC = Subtarget->has1024AddressableVGPRs()
17712                  ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17715           return std::pair(0U, nullptr);
17720       if (!Subtarget->hasMAIInsts())
17724         return std::pair(0U, nullptr);
17726         RC = &AMDGPU::AGPR_32RegClass;
17731           return std::pair(0U, nullptr);
17736   } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17740       RC = &AMDGPU::AV_32RegClass;
17743       RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17745         return std::pair(0U, nullptr);
17754     return std::pair(0U, RC);
17757     if (Kind != '\0') {
17759       RC = &AMDGPU::VGPR_32_Lo256RegClass;
17760     } else if (Kind == 's') {
17761       RC = &AMDGPU::SGPR_32RegClass;
17762     } else if (Kind == 'a') {
17763       RC = &AMDGPU::AGPR_32RegClass;
17769       return std::pair(0U, nullptr);
17775         return std::pair(0U, nullptr);
17779         RC = TRI->getVGPRClassForBitWidth(Width);
17781         RC = TRI->getSGPRClassForBitWidth(Width);
17783         RC = TRI->getAGPRClassForBitWidth(Width);
17785         Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17790           return std::pair(0U, nullptr);
17792       return std::pair(Reg, RC);
17798       return std::pair(0U, nullptr);
17799     if (Idx < RC->getNumRegs())
17801   return std::pair(0U, nullptr);
17807   Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17813   if (Constraint.size() == 1) {
17814     switch (Constraint[0]) {
17824   } else if (Constraint == "DA" || Constraint == "DB") {
17832   if (Constraint.size() == 1) {
17833     switch (Constraint[0]) {
17841   } else if (Constraint.size() == 2) {
17842     if (Constraint == "VA")
17860                                                      std::vector<SDValue> &Ops,
17875   unsigned Size = Op.getScalarValueSizeInBits();
17879   if (Size == 16 && !Subtarget->has16BitInsts())
17883     Val = C->getSExtValue();
17887     Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17891     if (Size != 16 || Op.getNumOperands() != 2)
17893     if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17896       Val = C->getSExtValue();
17900       Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17910   if (Constraint.size() == 1) {
17911     switch (Constraint[0]) {
17926   } else if (Constraint.size() == 2) {
17927     if (Constraint == "DA") {
17928       int64_t HiBits = static_cast<int32_t>(Val >> 32);
17929       int64_t LoBits = static_cast<int32_t>(Val);
17933     if (Constraint == "DB") {
17941                                           unsigned MaxSize) const {
17942   unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17943   bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17945   MVT VT = Op.getSimpleValueType();
17970 switch (UnalignedClassID) {
17971 case AMDGPU::VReg_64RegClassID:
17972 return AMDGPU::VReg_64_Align2RegClassID;
17973 case AMDGPU::VReg_96RegClassID:
17974 return AMDGPU::VReg_96_Align2RegClassID;
17975 case AMDGPU::VReg_128RegClassID:
17976 return AMDGPU::VReg_128_Align2RegClassID;
17977 case AMDGPU::VReg_160RegClassID:
17978 return AMDGPU::VReg_160_Align2RegClassID;
17979 case AMDGPU::VReg_192RegClassID:
17980 return AMDGPU::VReg_192_Align2RegClassID;
17981 case AMDGPU::VReg_224RegClassID:
17982 return AMDGPU::VReg_224_Align2RegClassID;
17983 case AMDGPU::VReg_256RegClassID:
17984 return AMDGPU::VReg_256_Align2RegClassID;
17985 case AMDGPU::VReg_288RegClassID:
17986 return AMDGPU::VReg_288_Align2RegClassID;
17987 case AMDGPU::VReg_320RegClassID:
17988 return AMDGPU::VReg_320_Align2RegClassID;
17989 case AMDGPU::VReg_352RegClassID:
17990 return AMDGPU::VReg_352_Align2RegClassID;
17991 case AMDGPU::VReg_384RegClassID:
17992 return AMDGPU::VReg_384_Align2RegClassID;
17993 case AMDGPU::VReg_512RegClassID:
17994 return AMDGPU::VReg_512_Align2RegClassID;
17995 case AMDGPU::VReg_1024RegClassID:
17996 return AMDGPU::VReg_1024_Align2RegClassID;
17997 case AMDGPU::AReg_64RegClassID:
17998 return AMDGPU::AReg_64_Align2RegClassID;
17999 case AMDGPU::AReg_96RegClassID:
18000 return AMDGPU::AReg_96_Align2RegClassID;
18001 case AMDGPU::AReg_128RegClassID:
18002 return AMDGPU::AReg_128_Align2RegClassID;
18003 case AMDGPU::AReg_160RegClassID:
18004 return AMDGPU::AReg_160_Align2RegClassID;
18005 case AMDGPU::AReg_192RegClassID:
18006 return AMDGPU::AReg_192_Align2RegClassID;
18007 case AMDGPU::AReg_256RegClassID:
18008 return AMDGPU::AReg_256_Align2RegClassID;
18009 case AMDGPU::AReg_512RegClassID:
18010 return AMDGPU::AReg_512_Align2RegClassID;
18011 case AMDGPU::AReg_1024RegClassID:
18012 return AMDGPU::AReg_1024_Align2RegClassID;
18028 if (Info->isEntryFunction()) {
18035 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
18037 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
18038                   : TRI->getAlignedHighSGPRForRC(MF, 2,
18039                                                  &AMDGPU::SGPR_64RegClass);
18040 Info->setSGPRForEXECCopy(SReg);
18042   assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
18043                              Info->getStackPtrOffsetReg()));
18044 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
18045 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
18049 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
18050 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
18052 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
18053 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
18055 Info->limitOccupancy(MF);
18057   if (ST.isWave32() && !MF.empty()) {
18058     for (auto &MBB : MF) {
18059       for (auto &MI : MBB) {
18060         TII->fixImplicitOperands(MI);
18070 if (ST.needsAlignedVGPRs()) {
18071     for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
18077       if (NewClassID != -1)
18078         MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
18087 const APInt &DemandedElts,
18089                                                        unsigned Depth) const {
18091   unsigned Opc = Op.getOpcode();
18094     unsigned IID = Op.getConstantOperandVal(0);
18096 case Intrinsic::amdgcn_mbcnt_lo:
18097 case Intrinsic::amdgcn_mbcnt_hi: {
18103 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
18113                                                   Op, Known, DemandedElts, DAG, Depth);
18129 unsigned MaxValue =
18136                               unsigned BFEWidth, bool SExt, unsigned Depth) {
18140   unsigned Src1Cst = 0;
18141   if (Src1.isImm()) {
18142     Src1Cst = Src1.getImm();
18143   } else if (Src1.isReg()) {
18147     Src1Cst = Cst->Value.getZExtValue();
18158   if (Width >= BFEWidth)
18167     Known = Known.sext(BFEWidth);
18169     Known = Known.zext(BFEWidth);
18175                                                         unsigned Depth) const {
18178   switch (MI->getOpcode()) {
18179 case AMDGPU::S_BFE_I32:
18182 case AMDGPU::S_BFE_U32:
18185 case AMDGPU::S_BFE_I64:
18188 case AMDGPU::S_BFE_U64:
18191 case AMDGPU::G_INTRINSIC:
18192 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18195 case Intrinsic::amdgcn_workitem_id_x:
18198 case Intrinsic::amdgcn_workitem_id_y:
18201 case Intrinsic::amdgcn_workitem_id_z:
18204 case Intrinsic::amdgcn_mbcnt_lo:
18205 case Intrinsic::amdgcn_mbcnt_hi: {
18217 case Intrinsic::amdgcn_groupstaticsize: {
18228 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18231 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18234 case AMDGPU::G_AMDGPU_SMED3:
18235 case AMDGPU::G_AMDGPU_UMED3: {
18236     auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18263                                                     unsigned Depth) const {
18270   AttributeList Attrs =
18272   if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18299 if (Header->getAlignment() != PrefAlign)
18300 return Header->getAlignment();
18302 unsigned LoopSize = 0;
18307     LoopSize += MBB->getAlignment().value() / 2;
18310       LoopSize += TII->getInstSizeInBytes(MI);
18311 if (LoopSize > 192)
18316 if (LoopSize <= 64)
18319 if (LoopSize <= 128)
18320 return CacheLineAlign;
18326 auto I = Exit->getFirstNonDebugInstr();
18327   if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18328     return CacheLineAlign;
18337     if (PreTerm == Pre->begin() ||
18338         std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18342 auto ExitHead = Exit->getFirstNonDebugInstr();
18343 if (ExitHead == Exit->end() ||
18344 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18349 return CacheLineAlign;
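// Note (not part of this file): the size accounting visible above limits this
// heuristic to small loops. Bodies larger than 192 bytes are left alone, bodies of
// at most 64 bytes appear to keep the default preference, and bodies up to 128
// bytes (or loops already bracketed by S_INST_PREFETCH) are aligned to a cache line
// so the instruction prefetch can cover the whole loop body.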
18357     N = N->getOperand(0).getNode();
18358   if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18367   switch (N->getOpcode()) {
18375     if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18376       return !TRI->isSGPRReg(MRI, Reg);
18382     return !TRI->isSGPRReg(MRI, Reg);
18386 unsigned AS = L->getAddressSpace();
18390 case ISD::CALLSEQ_END:
18396 case AMDGPUISD::ATOMIC_CMP_SWAP:
18397 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
18398 case AMDGPUISD::BUFFER_ATOMIC_ADD:
18399 case AMDGPUISD::BUFFER_ATOMIC_SUB:
18400 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
18401 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
18402 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
18403 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
18404 case AMDGPUISD::BUFFER_ATOMIC_AND:
18405 case AMDGPUISD::BUFFER_ATOMIC_OR:
18406 case AMDGPUISD::BUFFER_ATOMIC_XOR:
18407 case AMDGPUISD::BUFFER_ATOMIC_INC:
18408 case AMDGPUISD::BUFFER_ATOMIC_DEC:
18409 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
18410 case AMDGPUISD::BUFFER_ATOMIC_FADD:
18411 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
18412 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
18418     return A->readMem() && A->writeMem();
18439 switch (Ty.getScalarSizeInBits()) {
18451 const APInt &DemandedElts,
18454                                                        unsigned Depth) const {
18455   if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18459     if (Info->getMode().DX10Clamp)
18471   if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18491          << "Hardware instruction generated for atomic "
18493          << " operation at memory scope " << MemScope;
18498 Type *EltTy = VT->getElementType();
18499 return VT->getNumElements() == 2 &&
18519     unsigned BW = IT->getBitWidth();
18520 return BW == 32 || BW == 64;
18534     unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18535 return BW == 32 || BW == 64;
18538 if (Ty->isFloatTy() || Ty->isDoubleTy())
18542 return VT->getNumElements() == 2 &&
18543 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18553 bool HasSystemScope) {
18560 if (HasSystemScope) {
18569   return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18582   const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18608       DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18621 bool HasSystemScope =
18653     if (!IT || IT->getBitWidth() != 32)
18659 if (Subtarget->hasEmulatedSystemScopeAtomics())
18675 if (!HasSystemScope &&
18676 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18688     if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18696 ConstVal && ConstVal->isNullValue())
18734 if (Ty->isFloatTy()) {
18739 if (Ty->isDoubleTy()) {
18760 if (Ty->isFloatTy() &&
18761 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18774     if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18778     if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18782     if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18787     if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18792 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18796 if (Ty->isFloatTy()) {
18799       if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18802       if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18807 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18815 if (Subtarget->hasFlatAtomicFaddF32Inst())
18824 if (Subtarget->hasLDSFPAtomicAddF32()) {
18825       if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18827       if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18855 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18857 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18861 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18863 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18916 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18917 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18918 : &AMDGPU::SReg_32RegClass;
18919   if (!TRI->isSGPRClass(RC) && !isDivergent)
18920     return TRI->getEquivalentSGPRClass(RC);
18921   if (TRI->isSGPRClass(RC) && isDivergent) {
18922 if (Subtarget->hasGFX90AInsts())
18923 return TRI->getEquivalentAVClass(RC);
18924 return TRI->getEquivalentVGPRClass(RC);
18937 unsigned WaveSize) {
18942   if (!IT || IT->getBitWidth() != WaveSize)
18947   if (!Visited.insert(V).second)
18949   bool Result = false;
18950   for (const auto *U : V->users()) {
18952 if (V == U->getOperand(1)) {
18957 case Intrinsic::amdgcn_if_break:
18958 case Intrinsic::amdgcn_if:
18959 case Intrinsic::amdgcn_else:
18964 if (V == U->getOperand(0)) {
18969 case Intrinsic::amdgcn_end_cf:
18970 case Intrinsic::amdgcn_loop:
18976       Result = hasCFUser(U, Visited, WaveSize);
18985                                                  const Value *V) const {
18987     if (CI->isInlineAsm()) {
18996       for (auto &TC : TargetConstraints) {
19010 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
19038 return MRI.hasOneNonDBGUse(N0);
19045   if (I.getMetadata("amdgpu.noclobber"))
19047   if (I.getMetadata("amdgpu.last.use"))
19111 Alignment = RMW->getAlign();
19124 bool FullFlatEmulation =
19126 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
19127 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19128 RMW->getType()->isDoubleTy()));
19131   bool ReturnValueIsUsed = !AI->use_empty();
19140   if (FullFlatEmulation) {
19151   std::prev(BB->end())->eraseFromParent();
19152   Builder.SetInsertPoint(BB);
19154   Value *LoadedShared = nullptr;
19155   if (FullFlatEmulation) {
19156     CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19157                                                  {Addr}, nullptr, "is.shared");
19158 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19159 Builder.SetInsertPoint(SharedBB);
19160 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19166 LoadedShared = Clone;
19168 Builder.CreateBr(PhiBB);
19169 Builder.SetInsertPoint(CheckPrivateBB);
19172 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19173                                                 {Addr}, nullptr, "is.private");
19174 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19176 Builder.SetInsertPoint(PrivateBB);
19178 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19181 Value *LoadedPrivate;
19183 LoadedPrivate = Builder.CreateAlignedLoad(
19184         RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19187 LoadedPrivate, RMW->getValOperand());
19189 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19191 auto [ResultLoad, Equal] =
19197 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19200 Builder.CreateBr(PhiBB);
19202 Builder.SetInsertPoint(GlobalBB);
19206 if (FullFlatEmulation) {
19207 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19216 if (!FullFlatEmulation) {
19221 MDNode *RangeNotPrivate =
19224     LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19228 Builder.CreateBr(PhiBB);
19230 Builder.SetInsertPoint(PhiBB);
19232 if (ReturnValueIsUsed) {
19235 if (FullFlatEmulation)
19242 Builder.CreateBr(ExitBB);
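// Note (not part of this file): the IRBuilder sequence above expands a flat FP
// atomic that cannot be proven global-only into an address-space dispatch:
// amdgcn.is.shared branches to an LDS copy of the operation, amdgcn.is.private
// branches to a plain load/modify/store of the scratch slot, the remaining path
// issues the global atomic (tagged with noalias.addrspace metadata when the private
// case is impossible), and a final phi block merges the result when it is used.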
19246                                                unsigned PtrOpIdx) {
19247   Value *PtrOp = I->getOperand(PtrOpIdx);
19254 I->setOperand(PtrOpIdx, ASCast);
19266 ConstVal && ConstVal->isNullValue()) {
19296 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19304 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19319 LoadInst *LI = Builder.CreateAlignedLoad(
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
iv Induction Variable Users
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static void getCoopAtomicOperandsInfo(const CallBase &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static constexpr int Concat[]
AMDGPUArgumentUsageInfo & getArgUsageInfo()
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned AndSaveExecOpc
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf()
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
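For illustration, a small sketch combining the APInt bit-manipulation helpers above (the width and bit counts are arbitrary):
#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

// Set the top 16 bits of a 64-bit mask, check that the equivalent factory
// agrees, and count the trailing zero bits (48 here).
static unsigned highMaskTrailingZeros() {
  APInt Mask(64, 0);
  Mask.setHighBits(16);
  assert(Mask == APInt::getHighBitsSet(64, 16) && "factories should agree");
  return Mask.countr_zero();
}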
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
An instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ USubSat
*p = usub.sat(old, v). usub.sat matches the behavior of llvm.usub.sat.
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v). minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v). maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
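As a hedged illustration of the AtomicRMWInst accessors above, a hypothetical predicate that singles out floating-point min/max operations with a relaxed scope or ordering (the policy itself is invented for the example):
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/AtomicOrdering.h"
using namespace llvm;

// Hypothetical check: fp min/max atomicrmw that is not system-scope seq_cst.
static bool isRelaxedFPMinMaxRMW(const AtomicRMWInst &RMW) {
  AtomicRMWInst::BinOp Op = RMW.getOperation();
  if (Op != AtomicRMWInst::FMin && Op != AtomicRMWInst::FMax)
    return false;
  return RMW.getOrdering() != AtomicOrdering::SequentiallyConsistent ||
         RMW.getSyncScopeID() != SyncScope::System;
}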
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
const Function * getParent() const
Return the enclosing method, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
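To show how CCState and CCValAssign are typically used together, a sketch of a formal-argument walk; AssignFn is assumed to come from CCAssignFnForCall, and ArgLocs is assumed to be the vector the CCState was constructed with.
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
using namespace llvm;

// Let the calling-convention function place every incoming argument, then
// visit each assignment to see whether it landed in a register or on the
// stack.
static void walkFormalArgs(CCState &CCInfo,
                           const SmallVectorImpl<ISD::InputArg> &Ins,
                           CCAssignFn *AssignFn,
                           const SmallVectorImpl<CCValAssign> &ArgLocs) {
  CCInfo.AnalyzeFormalArguments(Ins, AssignFn);
  for (const CCValAssign &VA : ArgLocs) {
    if (VA.isRegLoc())
      (void)VA.getLocReg();        // passed in a physical register
    else
      (void)VA.getLocMemOffset();  // passed at this stack offset
  }
}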
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
A parsed version of the target data layout string in and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
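A short sketch of the MVT helpers above: reinterpret a fixed-width value type as a vector of 32-bit integer lanes (purely illustrative; the header path assumes a recent LLVM tree).
#include "llvm/CodeGenTypes/MachineValueType.h"
#include <cassert>
using namespace llvm;

// v4f32, i64, etc. all become <N x i32> with the same total bit width.
static MVT asI32Lanes(MVT VT) {
  unsigned TotalBits = VT.getSizeInBits().getFixedValue();
  assert(TotalBits % 32 == 0 && "expected a multiple of 32 bits");
  return MVT::getVectorVT(MVT::getIntegerVT(32), TotalBits / 32);
}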
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into two pieces at SplitInst.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
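The MachineInstrBuilder entries above are normally used through BuildMI chaining; a minimal sketch follows (the instruction description, registers, and operand layout are placeholders, not taken from this file).
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

// Emit "Dst = <Desc> Src, Imm" immediately before iterator I.
static void emitBinOpImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                         const DebugLoc &DL, const MCInstrDesc &Desc,
                         Register Dst, Register Src, int64_t Imm) {
  BuildMI(MBB, I, DL, Desc, Dst)
      .addReg(Src)
      .addImm(Imm);
}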
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
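A sketch of how the getMachineMemOperand overload listed earlier is commonly invoked, describing a 4-byte dereferenceable load; the pointer info is assumed to be supplied by the caller.
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;

// 32-bit, 4-byte aligned load that is known not to trap.
static MachineMemOperand *makeLoadMMO(MachineFunction &MF,
                                      MachinePointerInfo PtrInfo) {
  return MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable,
      LLT::scalar(32), Align(4));
}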
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
A Module instance is used to store all the information related to an LLVM module.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
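To ground the SDNode/SDValue accessors above, a small matcher sketch that recognizes (add x, constant); illustrative only.
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Match V == (add X, C) where C is a constant, returning X and C's value.
static bool matchAddWithConstant(SDValue V, SDValue &Base, uint64_t &Imm) {
  if (V.getOpcode() != ISD::ADD)
    return false;
  if (auto *C = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
    Base = V.getOperand(0);
    Imm = C->getZExtValue();
    return true;
  }
  return false;
}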
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined with others to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
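Several of the hooks above return a TargetLowering::AtomicExpansionKind; the following hypothetical policy (not the AMDGPU one) shows the general shape such a decision takes.
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Keep add/sub atomics as-is and ask AtomicExpand to rewrite everything else
// into a compare-and-swap loop. Purely illustrative policy.
static TargetLowering::AtomicExpansionKind
classifyAtomicRMW(const AtomicRMWInst &RMW) {
  switch (RMW.getOperation()) {
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
    return TargetLowering::AtomicExpansionKind::None;
  default:
    return TargetLowering::AtomicExpansionKind::CmpXChg;
  }
}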
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
MachineFunctionAnalysisManager * getMFAM()
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
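A brief sketch of the SelectionDAG factory style documented above: mask the low 16 bits of a value in whatever integer type it already has (illustrative helper, not code from this file).
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Returns (and X, 0xffff) in X's own value type; X is assumed to be integer.
static SDValue maskLow16(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  EVT VT = X.getValueType();
  SDValue Mask = DAG.getConstant(0xffff, DL, VT);
  return DAG.getNode(ISD::AND, DL, VT, X, Mask);
}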
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
constexpr bool empty() const
empty - Check if the string is empty.
constexpr size_t size() const
size - Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
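StringSwitch reads naturally as a chained expression; a tiny sketch with made-up register names:
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

// Map a textual name to a small integer id, or -1 if unknown.
static int lookupSpecialReg(StringRef Name) {
  return StringSwitch<int>(Name)
      .Case("m0", 0)
      .Case("exec", 1)
      .Default(-1);
}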
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparisons with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isVoidTy() const
Return true if this is 'void'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
constexpr bool isZero() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ SSUBO
Same for subtraction.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ SMULO
Same for multiplication.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
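A small sketch of the IR-level matchers listed above (the MIR-level mi_match variants follow the same shape); isShlOnePlus is a hypothetical helper that checks whether V computes (X << 1) + Y and binds the operands.
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
static bool isShlOnePlus(llvm::Value *V, llvm::Value *&X, llvm::Value *&Y) {
  using namespace llvm::PatternMatch;
  // m_Value binds the matched operand; m_One matches the constant 1.
  return match(V, m_Add(m_Shl(m_Value(X), m_One()), m_Value(Y)));
}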
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
@ System
Synchronized with respect to all concurrently executing threads.
@ BRCOND
X86 conditional branches.
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
NodeAddr< NodeBase * > Node
friend class Instruction
Iterator for Instructions in a BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
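A minimal sketch of the range-based wrapper; allEven is a hypothetical helper.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
// True if every element of Vals is even; the predicate sees each element directly.
static bool allEven(llvm::ArrayRef<int> Vals) {
  return llvm::all_of(Vals, [](int V) { return V % 2 == 0; });
}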
OuterAnalysisManagerProxy< ModuleAnalysisManager, MachineFunction > ModuleAnalysisManagerMachineFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
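A quick sketch of bit_width; bitsForCount is a hypothetical helper and the example value is illustrative.
#include "llvm/ADT/bit.h"
// Bits needed to index Count distinct values, e.g. Count == 17 -> 5 bits.
static int bitsForCount(unsigned Count) {
  return Count <= 1 ? 0 : llvm::bit_width(Count - 1);
}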
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant bit, stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
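A small combined sketch of PowerOf2Ceil and countr_zero; log2RoundedUp is a hypothetical helper and the example value is illustrative.
#include <cstdint>
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
// Round N up to a power of two, then recover its exponent from the trailing zeros,
// e.g. N == 17 -> 32 -> 5. (N is assumed nonzero in this sketch.)
static unsigned log2RoundedUp(uint64_t N) {
  return llvm::countr_zero(llvm::PowerOf2Ceil(N));
}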
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count the number of 0s from the most significant bit to the least significant bit, stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
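A brief sketch tying together isPowerOf2_32, Log2_32, and Hi_32; floorLog2IfPow2 is a hypothetical helper and the example values are illustrative.
#include <cstdint>
#include "llvm/Support/MathExtras.h"
// Log2_32(64) == 6 and isPowerOf2_32(64) is true; Hi_32(0x1111222233334444ULL) == 0x11112222.
static unsigned floorLog2IfPow2(uint32_t V) {
  return llvm::isPowerOf2_32(V) ? llvm::Log2_32(V) : 0;
}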
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
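A short sketch of alignTo, together with divideCeil from above; paddedSize is a hypothetical helper and the values in the comment are illustrative.
#include <cstdint>
#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
// alignTo(10, Align(8)) == 16, and divideCeil(10, 8) == 2: ten bytes occupy two
// eight-byte slots once padded.
static uint64_t paddedSize(uint64_t Size, llvm::Align A) {
  return llvm::alignTo(Size, A);
}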
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
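A tiny sketch of maskTrailingOnes and commonAlignment; lowNibbleMask is a hypothetical helper and the values in the comment are illustrative.
#include <cstdint>
#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
// maskTrailingOnes<uint32_t>(4) == 0xF; commonAlignment(Align(16), 8) == Align(8),
// since an offset of 8 from a 16-byte-aligned base is only 8-byte aligned.
static uint32_t lowNibbleMask() {
  return llvm::maskTrailingOnes<uint32_t>(4);
}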
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
unsigned AtomicNoRetBaseOpcode
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
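A minimal sketch building a vector EVT from its element type, using getIntegerVT together with the getVectorVT entry above; makeV4I32 is a hypothetical helper.
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
// Builds a 4 x i32 EVT; for an element/count pair that maps onto a simple type
// this is equivalent to MVT::v4i32.
static llvm::EVT makeV4I32(llvm::LLVMContext &Ctx) {
  llvm::EVT I32 = llvm::EVT::getIntegerVT(Ctx, 32);
  return llvm::EVT::getVectorVT(Ctx, I32, 4);
}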
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
void resetAll()
Resets the known state of all bits.
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
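A small KnownBits sketch (minLeadingZerosAfterZext is a hypothetical helper): marking the top bits known-zero and then zero-extending keeps them known.
#include "llvm/Support/KnownBits.h"
static unsigned minLeadingZerosAfterZext() {
  llvm::KnownBits K(8);
  K.Zero.setHighBits(4);           // bits [4,8) are known to be 0
  llvm::KnownBits W = K.zext(16);  // zero extension adds 8 more known-zero bits
  return W.countMinLeadingZeros(); // 12
}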
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const