#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"

    cl::desc("Do not align and prefetch loops"),

    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),

    cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "

  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;

  {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
   MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
   MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
   MVT::i1, MVT::v32i32},
  {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
   MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
   MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
   MVT::i1, MVT::v32i32},

  {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

  {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
   MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
   MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
  {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
   MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
   MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},

  {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
   MVT::v3i16, MVT::v4i16, MVT::Other},

  {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);

  {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
   MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
   MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
   MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
   MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
   MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
   MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
   MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {

  {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
   MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},

  {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
   MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},

  {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

  {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
   MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
   MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
   MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

  {MVT::f32, MVT::f64}, Custom);
  {MVT::f32, MVT::f64}, Legal);

  {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
   MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
   MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {

  {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
   MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
   MVT::v32f16, MVT::v32bf16},

  {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
  {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

  {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
   MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

  {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

  {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
   MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
   MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},

  for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})

  for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})

  {MVT::v2f16, MVT::v4f16}, Custom);

  {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},

  {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
   MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
   MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
   MVT::v32f16, MVT::v32bf16},

  {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

  {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

  {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
   MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,

  {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
   MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
   MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
   MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

  {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
   MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
   MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
   MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

  {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},

  static const MCPhysReg RCRegs[] = {AMDGPU::MODE};

    EVT DestVT, EVT SrcVT) const {

    LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&

    return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
  return VT.isInteger() ? MVT::i32 : MVT::f32;

      return (NumElts + 1) / 2;
    return NumElts * ((Size + 31) / 32);

    unsigned &NumIntermediates, MVT &RegisterVT) const {
    if (ScalarVT == MVT::bf16) {
      RegisterVT = MVT::i32;
      IntermediateVT = MVT::v2bf16;
      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;
    NumIntermediates = (NumElts + 1) / 2;
    return NumIntermediates;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    if (Size < 16 && Subtarget->has16BitInsts()) {
      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    RegisterVT = MVT::i32;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
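// --- Illustrative sketch (not part of the original file) ---------------------
// A standalone, hypothetical helper modelling the register-count arithmetic
// used by the breakdown hooks above: 16-bit elements are packed two per 32-bit
// register, everything else needs ceil(EltSize/32) registers per element.
static unsigned numRegistersForVector(unsigned NumElts, unsigned EltSizeInBits,
                                      bool Has16BitInsts) {
  if (EltSizeInBits == 16 && Has16BitInsts)
    return (NumElts + 1) / 2;                     // two 16-bit elts per 32-bit reg
  return NumElts * ((EltSizeInBits + 31) / 32);   // ceil(bits/32) regs per elt
}
// e.g. numRegistersForVector(5, 16, true) == 3 and
//      numRegistersForVector(3, 64, true) == 6.
// -----------------------------------------------------------------------------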
    unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);
  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

    unsigned MaxNumLanes) {
  auto *ST = dyn_cast<StructType>(Ty);
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

    return MVT::amdgpuBufferFatPointer;
      DL.getPointerSizeInBits(AS) == 192)
    return MVT::amdgpuBufferStridedPointer;
       DL.getPointerSizeInBits(AS) == 160) ||
       DL.getPointerSizeInBits(AS) == 192))

  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:

    unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))
    if (RsrcIntr->IsImage) {
    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
      Info.ptrVal = RsrcArg;
    bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
      if (RsrcIntr->IsImage) {
        unsigned MaxNumLanes = 4;
            std::numeric_limits<unsigned>::max());
      if (RsrcIntr->IsImage) {
        unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
      if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
        Info.memVT = MVT::i32;

  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
        std::numeric_limits<unsigned>::max());
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
    Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
    Info.memVT = MVT::i64;
  case Intrinsic::amdgcn_global_atomic_csub: {
  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
        MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
                       : cast<StructType>(CI.getType())
                             ->getElementType(0));
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_atomic_cond_sub_u32: {
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64: {
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.memVT = MVT::i32;
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128: {
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
    Info.memVT = MVT::i32;
  case Intrinsic::amdgcn_s_prefetch_data:
  case Intrinsic::amdgcn_flat_prefetch:
  case Intrinsic::amdgcn_global_prefetch: {
  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();

    Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_global_atomic_csub:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
    Ptr = II->getArgOperand(0);
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds:
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
    Ptr = II->getArgOperand(1);
  AccessTy = II->getType();

    unsigned AddrSpace) const {
  return AM.Scale == 0 &&
             AM.BaseOffs, AddrSpace, FlatVariant));
    return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
  if (AM.HasBaseReg) {
    return isLegalMUBUFAddressingMode(AM);
  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
        : isLegalMUBUFAddressingMode(AM);

    unsigned Size, unsigned AddrSpace, Align Alignment,
    Align RequiredAlignment(
        Alignment < RequiredAlignment)
      RequiredAlignment = Align(4);
      *IsFast = (Alignment >= RequiredAlignment) ? 64
                : (Alignment < Align(4))         ? 32
      *IsFast = (Alignment >= RequiredAlignment) ? 96
                : (Alignment < Align(4))         ? 32
      RequiredAlignment = Align(8);
      *IsFast = (Alignment >= RequiredAlignment) ? 128
                : (Alignment < Align(4))         ? 32
      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
    return Alignment >= RequiredAlignment ||
    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;
    return Alignment >= Align(4) ||
  return Size >= 32 && Alignment >= Align(4);

    unsigned *IsFast) const {
      Alignment, Flags, IsFast);
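// --- Illustrative sketch (not part of the original file) ---------------------
// Hypothetical helper modelling the IsFast encoding visible above: the value
// is a relative "speed", full access width when the required alignment is met,
// otherwise no better than a 32-bit access. Only the branches visible in the
// fragments above are modelled; other cases are folded into the default.
static unsigned approximateAccessSpeed(unsigned SizeInBits,
                                       unsigned AlignInBytes,
                                       unsigned RequiredAlignInBytes) {
  if (AlignInBytes >= RequiredAlignInBytes)
    return SizeInBits;   // e.g. a naturally aligned ds_read_b64/b96/b128
  return 32;             // conservative: treated as 32-bit pieces
}
// -----------------------------------------------------------------------------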
  if (Op.size() >= 16 &&
  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

  const MemSDNode *MemNode = cast<MemSDNode>(N);

    unsigned DestAS) const {
  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);

    unsigned Index) const {

  auto [InputPtrReg, RC, ArgTy] =
      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

    const SDLoc &SL) const {

    const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(
    int64_t OffsetDiff = Offset - AlignDownOffset;
    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);

      ExtType, SL, VA.getLocVT(), Chain, FIN,

SDValue SITargetLowering::getPreloadedValue(
    Reg = &WorkGroupIDX;
    RC = &AMDGPU::SReg_32RegClass;
    Reg = &WorkGroupIDY;
    RC = &AMDGPU::SReg_32RegClass;
    Reg = &WorkGroupIDZ;
    RC = &AMDGPU::SReg_32RegClass;

  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
           "vector type argument should have been split");
      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
               "unexpected vector split in ps argument type");
      Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);

  if (Info.hasWorkItemIDX()) {
  if (Info.hasWorkItemIDY()) {
      Info.setWorkItemIDY(
      unsigned Reg = AMDGPU::VGPR1;
  if (Info.hasWorkItemIDZ()) {
      Info.setWorkItemIDZ(
      unsigned Reg = AMDGPU::VGPR2;

  if (RegIdx == ArgVGPRs.size()) {
  unsigned Reg = ArgVGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

    unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())
  unsigned Reg = ArgSGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

  assert(Reg != AMDGPU::NoRegister);

  const unsigned Mask = 0x3ff;
  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);
  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);
  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;

  if (Info.hasImplicitArgPtr())
  if (Info.hasWorkGroupIDX())
  if (Info.hasWorkGroupIDY())
  if (Info.hasWorkGroupIDZ())
  if (Info.hasLDSKernelId())

    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);

  bool InPreloadSequence = true;
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    unsigned ArgIdx = Arg.getArgNo();
    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        ArgOffset += ImplicitArgOffset;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
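// --- Illustrative sketch (not part of the original file) ---------------------
// Hypothetical helper modelling the SGPR accounting in the preload loop above:
// each argument occupies ceil(bits/32) SGPRs, and any byte gap between the end
// of the previous explicit argument and this argument's offset costs
// ceil(gap/4) padding SGPRs.
struct PreloadCost {
  unsigned ArgSGPRs;
  unsigned PaddingSGPRs;
};
static PreloadCost preloadCost(unsigned ArgOffset,
                               unsigned LastExplicitArgOffset,
                               unsigned ArgSizeInBits) {
  unsigned ArgSGPRs = (ArgSizeInBits + 31) / 32;        // alignTo(bits, 32) / 32
  unsigned Padding = ArgOffset - LastExplicitArgOffset; // bytes skipped
  unsigned PaddingSGPRs = (Padding + 3) / 4;            // alignTo(padding, 4) / 4
  return {ArgSGPRs, PaddingSGPRs};
}
// -----------------------------------------------------------------------------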
  if (Info.hasLDSKernelId()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    bool IsShader) const {
  assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
  unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    if (Info.hasWorkGroupIDY()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    if (Info.hasWorkGroupIDZ()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
  if (Info.hasWorkGroupInfo()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

         Info.getNumPreloadedSGPRs() >= 16);

  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);
    HasStackObjects = true;
  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);
      if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);

  return !Info->isEntryFunction();

  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;
    Entry->addLiveIn(*I);
    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)

  bool IsError = false;
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));

          !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
          !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
          !Info->hasWorkGroupIDZ());

  bool IsWholeWaveFunc = Info->isWholeWaveFunction();

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
    if ((PsInputBits & 0x7F) == 0 ||
        ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
  } else if (IsKernel) {
    Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
  } else if (!IsGraphics) {
    Info->setNumWaveDispatchSGPRs(
    Info->setNumWaveDispatchVGPRs(
  } else if (Info->getNumKernargPreloadedSGPRs()) {
    Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());

  if (IsWholeWaveFunc) {
        {MVT::i1, MVT::Other}, Chain);

  for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
    if (IsEntryFunc && VA.isMemLoc()) {
      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
          int64_t OffsetDiff = Offset - AlignDownOffset;
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
                                   TRI->getRegSizeInBits(*RC)));
            for (auto Reg : PreloadRegs) {
                                         PreloadRegs.size()),
          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);
              "hidden argument in kernel signature was not preloaded",
            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
            dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));

    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

    if (AMDGPU::VGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::VGPR_32RegClass;
    else if (AMDGPU::SGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::SGPR_32RegClass;

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
    if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    SDValue Arg = OutVals[RealRVLocIdx];
                        ReadFirstLane, Arg);

  if (!Info->isEntryFunction()) {
      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))

    auto &ArgUsageInfo =
    CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

    const auto [OutgoingArg, ArgRC, ArgTy] =
    const auto [IncomingArg, IncomingArgRC, Ty] =
    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
      InputReg = getImplicitArgPtr(DAG, DL);
      std::optional<uint32_t> Id =
      if (Id.has_value()) {

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      unsigned SpecialArgOffset =

  auto [OutgoingArg, ArgRC, Ty] =
    std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
                     : IncomingArgY ? *IncomingArgY

  if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);

  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CallerPreserved)
  bool CCMatch = CallerCC == CalleeCC;
    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
    if (!CCVA.isRegLoc())
    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
          dbgs() << "Cannot tail call due to divergent outgoing argument in "

enum ChainCallArgIdx {

  bool UsesDynamicVGPRs = false;
  if (IsChainCallConv) {
    auto RequestedExecIt =
          return Arg.OrigArgIndex == 2;
    assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");

    size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
    CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
           "Haven't popped all the special args");

        CLI.Args[ChainCallArgIdx::Exec];
      if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
            ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
        ChainCallSpecialArgs.push_back(Arg.Node);

    PushNodeOrTargetConstant(RequestedExecArg);

    const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
    if (FlagsValue.isZero()) {
      if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
                               "no additional args allowed if flags == 0");
      if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
            CLI, InVals, "dynamic VGPR mode is only supported for wave32");
      UsesDynamicVGPRs = true;
      std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
                    CLI.Args.end(), PushNodeOrTargetConstant);

  bool IsSibCall = false;
        "unsupported call to variadic function ");
        "unsupported required tail call to function ");
                                           Outs, OutVals, Ins, DAG);
          "site marked musttail or on llvm.amdgcn.cs.chain");
    if (!TailCallOpt && IsTailCall)

  if (!IsSibCall || IsChainCallConv) {
    RegsToPass.emplace_back(IsChainCallConv
                                ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  const unsigned NumSpecialInputs = RegsToPass.size();

  MVT PtrVT = MVT::i32;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
      int32_t Offset = LocMemOffset;
      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
                              ? Flags.getNonZeroByValAlign()
      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
            Outs[i].Flags.getNonZeroByValAlign(),
            nullptr, std::nullopt, DstInfo,
            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops({Chain});
    Ops.push_back(Callee);
    Ops.push_back(Callee);
  if (IsChainCallConv)

  for (auto &[Reg, Val] : RegsToPass)

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
                                  MVT::Glue, GlueOps),
    Ops.push_back(InGlue);

    return DAG.getNode(OPC, DL, MVT::Other, Ops);

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);
  uint64_t CalleePopBytes = NumBytes;

  EVT VT = Op.getValueType();
  Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
         "Stack grows upwards for AMDGPU");
  Chain = BaseAddr.getValue(1);
  if (Alignment > StackAlign) {
    uint64_t StackAlignMask = ScaledAlignment - 1;

  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
  if (isa<ConstantSDNode>(Size)) {

  if (Op.getValueType() != MVT::i32)
  assert(Op.getValueType() == MVT::i32);
                               Op.getOperand(0), IntrinID, GetRoundBothImm);
  SDValue RoundModeTimesNumBits =
                                   TableEntry, EnumOffset);

  if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
        static_cast<uint32_t>(ConstMode->getZExtValue()),
    if (UseReducedTable) {
      SDValue RoundModeTimesNumBits =
      SDValue RoundModeTimesNumBits =
      NewMode = TruncTable;
                          ReadFirstLaneID, NewMode);
                              IntrinID, RoundBothImm, NewMode);
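// --- Illustrative sketch (not part of the original file) ---------------------
// Hypothetical helper showing the shape of the packed-table lookup used by the
// rounding-mode lowering above: the index is scaled by the per-entry field
// width (RoundModeTimesNumBits), then one field is shifted down and masked.
// The actual table contents are target-defined and not reproduced here.
static unsigned packedTableLookup(unsigned long long Table, unsigned Index,
                                  unsigned FieldWidthInBits) {
  unsigned Shift = Index * FieldWidthInBits; // corresponds to mode * width
  unsigned long long Mask = (1ull << FieldWidthInBits) - 1;
  return static_cast<unsigned>((Table >> Shift) & Mask);
}
// -----------------------------------------------------------------------------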
  if (Op->isDivergent() &&

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();

  if (Op.getValueType() != MVT::i64)
                                Op.getOperand(0), IntrinID, ModeHwRegImm);
                                Op.getOperand(0), IntrinID, TrapHwRegImm);

  if (Op.getOperand(1).getValueType() != MVT::i64)
                                   ReadFirstLaneID, NewModeReg);
                                   ReadFirstLaneID, NewTrapReg);
  unsigned ModeHwReg =
  unsigned TrapHwReg =
                                 IntrinID, ModeHwRegImm, NewModeReg);
                                 IntrinID, TrapHwRegImm, NewTrapReg);

          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
                                  "\" for subtarget."));

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
  case AMDGPU::FLAT_SCR:

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));

static std::pair<MachineBasicBlock *, MachineBasicBlock *>
  auto Next = std::next(I);
  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);
  Src->setIsKill(false);

  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)

    unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
    unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
      MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                 : AMDGPU::S_AND_SAVEEXEC_B64),

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;
      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
          TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                 : AMDGPU::S_XOR_B64_term),

    unsigned InitResultReg, unsigned PhiReg, int Offset,
    bool UseGPRIdxMode, Register &SGPRIdxReg) {
  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);
  LoopBB->removeSuccessor(RemainderBB);
  LoopBB->addSuccessor(LandingPad);

static std::pair<unsigned, int>
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
    return std::pair(AMDGPU::sub0, Offset);
  assert(Idx->getReg() != AMDGPU::NoRegister);
    return Idx->getReg();

  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();

  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {
    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);
    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(VecRC);
                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)

  MI.eraseFromParent();

  case AMDGPU::S_MIN_U32:
    return std::numeric_limits<uint32_t>::max();
  case AMDGPU::S_MIN_I32:
    return std::numeric_limits<int32_t>::max();
  case AMDGPU::S_MAX_U32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_MAX_I32:
    return std::numeric_limits<int32_t>::min();
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_AND_B32:
    return std::numeric_limits<uint32_t>::max();
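// --- Illustrative sketch (not part of the original file) ---------------------
// Hypothetical standalone version of the switch above: each wave-reduction
// operator gets its identity element, i.e. the value that leaves any lane value
// unchanged when combined with it. Assumes <cstdint> and <limits>, which the
// surrounding file already relies on.
enum class WaveReduceOp { UMin, SMin, UMax, SMax, Add, Sub, Or, Xor, And };
static uint32_t waveReduceIdentity(WaveReduceOp Op) {
  switch (Op) {
  case WaveReduceOp::UMin:
    return std::numeric_limits<uint32_t>::max();
  case WaveReduceOp::SMin:
    return static_cast<uint32_t>(std::numeric_limits<int32_t>::max());
  case WaveReduceOp::SMax:
    return static_cast<uint32_t>(std::numeric_limits<int32_t>::min());
  case WaveReduceOp::UMax:
  case WaveReduceOp::Add:
  case WaveReduceOp::Sub:
  case WaveReduceOp::Or:
  case WaveReduceOp::Xor:
    return 0; // identity for unsigned max, add/sub, or, xor
  case WaveReduceOp::And:
    return std::numeric_limits<uint32_t>::max();
  }
  return 0;
}
// -----------------------------------------------------------------------------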
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));

    case AMDGPU::S_MIN_U32:
    case AMDGPU::S_MIN_I32:
    case AMDGPU::S_MAX_U32:
    case AMDGPU::S_MAX_I32:
    case AMDGPU::S_AND_B32:
    case AMDGPU::S_OR_B32: {
    case AMDGPU::S_XOR_B32:
    case AMDGPU::S_ADD_I32:
    case AMDGPU::S_SUB_I32: {
      Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
      Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);

      bool IsWave32 = ST.isWave32();
      unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
          IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;

      auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
                                .addReg(Exec->getOperand(0).getReg());

      case AMDGPU::S_XOR_B32: {
        Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
                .addReg(NewAccumulator->getOperand(0).getReg())
            .addReg(ParityReg->getOperand(0).getReg());
      case AMDGPU::S_SUB_I32: {
        Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
        auto InvertedValReg =
            .addReg(InvertedValReg->getOperand(0).getReg())
            .addReg(NewAccumulator->getOperand(0).getReg());
      case AMDGPU::S_ADD_I32: {
            .addReg(NewAccumulator->getOperand(0).getReg());

  Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
  Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
  Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
  Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
  Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
  Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
      MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  bool IsWave32 = ST.isWave32();
  unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

  BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)

  I = ComputeLoop->end();
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
      .addReg(TmpSReg->getOperand(0).getReg())

  unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
  auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
                 .addReg(ActiveBits->getOperand(0).getReg());
  auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                           TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
                       .addReg(FF1->getOperand(0).getReg());
          .addReg(LaneValue->getOperand(0).getReg());

  unsigned BITSETOpc =
      IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
  auto NewActiveBits =
      BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
          .addReg(FF1->getOperand(0).getReg())
          .addReg(ActiveBits->getOperand(0).getReg());

  Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
      .addMBB(ComputeLoop);
  ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
      .addMBB(ComputeLoop);

  unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
      .addReg(NewActiveBits->getOperand(0).getReg())
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))

  MI.eraseFromParent();

  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       : AMDGPU::S_SUB_I32;
    MI.eraseFromParent();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {
    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
      unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
    MI.eraseFromParent();
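// --- Illustrative sketch (not part of the original file) ---------------------
// Hypothetical helper modelling what the 64-bit add/sub expansion above does
// with S_ADD_U32/S_ADDC_U32 (or S_SUB_U32/S_SUBB_U32): operate on the two
// 32-bit halves, propagating the carry/borrow produced by the low half.
// Assumes <cstdint>, which the surrounding file already relies on.
static uint64_t addOrSub64ViaHalves(uint64_t A, uint64_t B, bool IsAdd) {
  uint32_t ALo = static_cast<uint32_t>(A), AHi = static_cast<uint32_t>(A >> 32);
  uint32_t BLo = static_cast<uint32_t>(B), BHi = static_cast<uint32_t>(B >> 32);
  uint32_t Lo = IsAdd ? ALo + BLo : ALo - BLo;       // S_ADD_U32 / S_SUB_U32
  uint32_t Carry = IsAdd ? (Lo < ALo) : (ALo < BLo); // SCC from the low half
  uint32_t Hi = IsAdd ? AHi + BHi + Carry            // S_ADDC_U32
                      : AHi - BHi - Carry;           // S_SUBB_U32
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}
// -----------------------------------------------------------------------------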
  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {
    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);

    if (ST.hasAddSubU64Insts()) {
                  TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
                                 : AMDGPU::V_SUB_U64_e64),
      TII->legalizeOperands(*I);
      MI.eraseFromParent();

    if (IsAdd && ST.hasLshlAddU64Inst()) {
      TII->legalizeOperands(*Add);
      MI.eraseFromParent();

    const auto *CarryRC = TRI->getWaveMaskRegClass();

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);

                         : &AMDGPU::VReg_64RegClass;
                         : &AMDGPU::VReg_64RegClass;
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;

    TII->legalizeOperands(*LoHalf);
    TII->legalizeOperands(*HiHalf);
    MI.eraseFromParent();

  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;
      Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
      Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
    Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)

    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
    assert(WaveSize == 64 || WaveSize == 32);

    if (WaveSize == 64) {
      if (ST.hasScalarCompareEq64()) {
            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)

        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    MI.eraseFromParent();

  case AMDGPU::SI_INIT_M0: {
            TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
    MI.eraseFromParent();

  case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
            TII->get(AMDGPU::S_CMP_EQ_U32))

  case AMDGPU::GET_GROUPSTATICSIZE: {
        .add(MI.getOperand(0))
    MI.eraseFromParent();

  case AMDGPU::GET_SHADERCYCLESHILO: {
    using namespace AMDGPU::Hwreg;
    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .add(MI.getOperand(0))
    MI.eraseFromParent();

  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:

  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    Register SrcCond = MI.getOperand(3).getReg();

    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getWaveMaskRegClass();
    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);

                         : &AMDGPU::VReg_64RegClass;
                         : &AMDGPU::VReg_64RegClass;
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

    MI.eraseFromParent();

  case AMDGPU::SI_BR_UNDEF: {
        .add(MI.getOperand(0));
    MI.eraseFromParent();

  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {

  case AMDGPU::SI_CALL_ISEL: {
    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
    MI.eraseFromParent();

  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {
    unsigned Opc = MI.getOpcode();

    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      NeedClampOperand = true;

    if (TII->isVOP3(*I)) {
    I.add(MI.getOperand(1)).add(MI.getOperand(2));
    if (NeedClampOperand)

    TII->legalizeOperands(*I);
    MI.eraseFromParent();

  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBBREV_U32_e32:
    TII->legalizeOperands(MI);
  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_BARRIER:
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:

  case AMDGPU::S_SETREG_B32: {
    const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
    const unsigned SetMask = WidthMask << Offset;

    unsigned SetDenormOp = 0;
    unsigned SetRoundOp = 0;
        SetRoundOp = AMDGPU::S_ROUND_MODE;
        SetDenormOp = AMDGPU::S_DENORM_MODE;
        SetRoundOp = AMDGPU::S_ROUND_MODE;
        SetDenormOp = AMDGPU::S_DENORM_MODE;

    if (SetRoundOp || SetDenormOp) {
      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
        unsigned ImmVal = Def->getOperand(1).getImm();
          MI.eraseFromParent();

    MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));

  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64:
    MI.setDesc(TII->get(AMDGPU::COPY));
  case AMDGPU::ENDPGM_TRAP: {
      MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
    MI.eraseFromParent();
  case AMDGPU::SIMULATED_TRAP: {
    TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
    MI.eraseFromParent();
  case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
    Register OriginalExec = Setup->getOperand(0).getReg();
    assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
    MI.getOperand(0).setReg(OriginalExec);

  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6166 EVT VT =
N->getValueType(0);
6170 if (VT == MVT::f16) {
6186 unsigned Opc =
Op.getOpcode();
6187 EVT VT =
Op.getValueType();
6188 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
6189 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
6190 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6191 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
6206 unsigned Opc =
Op.getOpcode();
6207 EVT VT =
Op.getValueType();
6208 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6209 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6210 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6211 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6212 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6213 VT == MVT::v32bf16);
6221 DAG.
getNode(
Opc, SL, Lo0.getValueType(), Lo0, Lo1,
Op->getFlags());
6223 DAG.
getNode(
Opc, SL, Hi0.getValueType(), Hi0, Hi1,
Op->getFlags());
6230 unsigned Opc =
Op.getOpcode();
6231 EVT VT =
Op.getValueType();
6232 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6233 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6234 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6235 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6236 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6237 VT == MVT::v32bf16);
6242 : std::pair(Op0, Op0);
6251 DAG.
getNode(
Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
Op->getFlags());
6253 DAG.
getNode(
Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
Op->getFlags());
6259 switch (
Op.getOpcode()) {
6263 return LowerBRCOND(
Op, DAG);
6265 return LowerRETURNADDR(
Op, DAG);
6268 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6269 "Load should return a value and a chain");
6273 EVT VT =
Op.getValueType();
6275 return lowerFSQRTF32(
Op, DAG);
6277 return lowerFSQRTF64(
Op, DAG);
6282 return LowerTrig(
Op, DAG);
6284 return LowerSELECT(
Op, DAG);
6286 return LowerFDIV(
Op, DAG);
6288 return LowerFFREXP(
Op, DAG);
6290 return LowerATOMIC_CMP_SWAP(
Op, DAG);
6292 return LowerSTORE(
Op, DAG);
6296 return LowerGlobalAddress(MFI,
Op, DAG);
6299 return LowerINTRINSIC_WO_CHAIN(
Op, DAG);
6301 return LowerINTRINSIC_W_CHAIN(
Op, DAG);
6303 return LowerINTRINSIC_VOID(
Op, DAG);
6305 return lowerADDRSPACECAST(
Op, DAG);
6307 return lowerINSERT_SUBVECTOR(
Op, DAG);
6309 return lowerINSERT_VECTOR_ELT(
Op, DAG);
6311 return lowerEXTRACT_VECTOR_ELT(
Op, DAG);
6313 return lowerVECTOR_SHUFFLE(
Op, DAG);
6315 return lowerSCALAR_TO_VECTOR(
Op, DAG);
6317 return lowerBUILD_VECTOR(
Op, DAG);
6320 return lowerFP_ROUND(
Op, DAG);
6322 return lowerTRAP(
Op, DAG);
6324 return lowerDEBUGTRAP(
Op, DAG);
6333 return lowerFMINNUM_FMAXNUM(
Op, DAG);
6336 return lowerFMINIMUMNUM_FMAXIMUMNUM(
Op, DAG);
6339 return lowerFMINIMUM_FMAXIMUM(
Op, DAG);
6342 return lowerFLDEXP(
Op, DAG);
6367 return lowerFCOPYSIGN(
Op, DAG);
6369 return lowerMUL(
Op, DAG);
6372 return lowerXMULO(
Op, DAG);
6375 return lowerXMUL_LOHI(
Op, DAG);
6408 EVT FittingLoadVT = LoadVT;
6440SDValue SITargetLowering::adjustLoadValueType(
unsigned Opcode,
MemSDNode *M,
6443 bool IsIntrinsic)
const {
6447 EVT LoadVT =
M->getValueType(0);
6449 EVT EquivLoadVT = LoadVT;
6467 M->getMemoryVT(),
M->getMemOperand());
6478 EVT LoadVT =
M->getValueType(0);
6484 assert(
M->getNumValues() == 2 ||
M->getNumValues() == 3);
6485 bool IsTFE =
M->getNumValues() == 3;
6498 return handleByteShortBufferLoads(DAG, LoadVT,
DL, Ops,
M->getMemOperand(),
6502 return getMemIntrinsicNode(
Opc,
DL,
M->getVTList(), Ops, IntVT,
6503 M->getMemOperand(), DAG);
6508 SDValue MemNode = getMemIntrinsicNode(
Opc,
DL, VTList, Ops, CastVT,
6509 M->getMemOperand(), DAG);
6517 EVT VT =
N->getValueType(0);
6518 unsigned CondCode =
N->getConstantOperandVal(3);
6529 EVT CmpVT =
LHS.getValueType();
6530 if (CmpVT == MVT::i16 && !TLI.
isTypeLegal(MVT::i16)) {
6531 unsigned PromoteOp =
6551 EVT VT =
N->getValueType(0);
6553 unsigned CondCode =
N->getConstantOperandVal(3);
6562 if (CmpVT == MVT::f16 && !TLI.
isTypeLegal(CmpVT)) {
6580 EVT VT =
N->getValueType(0);
6587 Src.getOperand(1), Src.getOperand(2));
6598 Exec = AMDGPU::EXEC_LO;
6600 Exec = AMDGPU::EXEC;
6617 EVT VT =
N->getValueType(0);
6619 unsigned IID =
N->getConstantOperandVal(0);
6620 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6621 IID == Intrinsic::amdgcn_permlanex16;
6622 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6623 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6627 unsigned SplitSize = 32;
6628 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6629 ST->hasDPALU_DPP() &&
6637 case Intrinsic::amdgcn_permlane16:
6638 case Intrinsic::amdgcn_permlanex16:
6639 case Intrinsic::amdgcn_update_dpp:
6644 case Intrinsic::amdgcn_writelane:
6647 case Intrinsic::amdgcn_readlane:
6648 case Intrinsic::amdgcn_set_inactive:
6649 case Intrinsic::amdgcn_set_inactive_chain_arg:
6650 case Intrinsic::amdgcn_mov_dpp8:
6653 case Intrinsic::amdgcn_readfirstlane:
6654 case Intrinsic::amdgcn_permlane64:
6664 if (
SDNode *GL =
N->getGluedNode()) {
6666 GL = GL->getOperand(0).getNode();
6676 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6677 IID == Intrinsic::amdgcn_mov_dpp8 ||
6678 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6679 Src1 =
N->getOperand(2);
6680 if (IID == Intrinsic::amdgcn_writelane ||
6681 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
6682 Src2 =
N->getOperand(3);
6685 if (ValSize == SplitSize) {
6695 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6700 if (IID == Intrinsic::amdgcn_writelane) {
6705 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6707 return IsFloat ? DAG.
getBitcast(VT, Trunc) : Trunc;
6710 if (ValSize % SplitSize != 0)
6714   EVT VT = N->getValueType(0);
6718   unsigned NumOperands = N->getNumOperands();
6720   SDNode *GL = N->getGluedNode();
6725   for (unsigned i = 0; i != NE; ++i) {
6726     for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6728       SDValue Operand = N->getOperand(j);
6758   if (SplitSize == 32) {
6760     return unrollLaneOp(LaneOp.getNode());
6766   unsigned SubVecNumElt =
6770   SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6771   for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
6775     if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
6780     if (IID == Intrinsic::amdgcn_writelane)
6785         IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
6786             ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6787             : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6788     EltIdx += SubVecNumElt;
6802   if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6805   if (IID == Intrinsic::amdgcn_writelane)
6808   SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
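// A minimal standalone sketch (not the LLVM lowering itself) of the splitting
// strategy used above: values wider than the 32-bit lane-op unit are broken
// into 32-bit pieces, the lane operation is applied piecewise, and the pieces
// are reassembled. 'laneOp32' stands in for any per-32-bit lane primitive.
#include <cstdint>
#include <functional>
#include <vector>

std::vector<uint32_t> splitInto32(uint64_t V) {
  return {static_cast<uint32_t>(V), static_cast<uint32_t>(V >> 32)};
}

uint64_t combineFrom32(const std::vector<uint32_t> &Pieces) {
  return static_cast<uint64_t>(Pieces[0]) |
         (static_cast<uint64_t>(Pieces[1]) << 32);
}

uint64_t applyLaneOp64(uint64_t Src,
                       const std::function<uint32_t(uint32_t)> &laneOp32) {
  // ValSize (64) is a multiple of SplitSize (32), so operate piece by piece.
  std::vector<uint32_t> Pieces = splitInto32(Src);
  for (uint32_t &P : Pieces)
    P = laneOp32(P);
  return combineFrom32(Pieces);
}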
6816 switch (
N->getOpcode()) {
6828 unsigned IID =
N->getConstantOperandVal(0);
6830 case Intrinsic::amdgcn_make_buffer_rsrc:
6831 Results.push_back(lowerPointerAsRsrcIntrin(
N, DAG));
6833 case Intrinsic::amdgcn_cvt_pkrtz: {
6842 case Intrinsic::amdgcn_cvt_pknorm_i16:
6843 case Intrinsic::amdgcn_cvt_pknorm_u16:
6844 case Intrinsic::amdgcn_cvt_pk_i16:
6845 case Intrinsic::amdgcn_cvt_pk_u16: {
6851 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6853 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6855 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6860 EVT VT =
N->getValueType(0);
6869 case Intrinsic::amdgcn_s_buffer_load: {
6881 EVT VT =
Op.getValueType();
6882 assert(VT == MVT::i8 &&
"Expected 8-bit s_buffer_load intrinsics.\n");
6894 if (!
Offset->isDivergent()) {
6913 LoadVal = handleByteShortBufferLoads(DAG, VT,
DL, Ops, MMO);
6918 case Intrinsic::amdgcn_dead: {
6919 for (
unsigned I = 0, E =
N->getNumValues();
I < E; ++
I)
6930 for (
unsigned I = 0;
I < Res.getNumOperands();
I++) {
6931 Results.push_back(Res.getOperand(
I));
6935 Results.push_back(Res.getValue(1));
6944 EVT VT =
N->getValueType(0);
6949 EVT SelectVT = NewVT;
6950 if (NewVT.
bitsLT(MVT::i32)) {
6953 SelectVT = MVT::i32;
6959 if (NewVT != SelectVT)
6965 if (
N->getValueType(0) != MVT::v2f16)
6977 if (
N->getValueType(0) != MVT::v2f16)
6989 if (
N->getValueType(0) != MVT::f16)
7004 if (U.get() !=
Value)
7007 if (U.getUser()->getOpcode() == Opcode)
7013unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7015   switch (Intr->getConstantOperandVal(1)) {
7016   case Intrinsic::amdgcn_if:
7018   case Intrinsic::amdgcn_else:
7020   case Intrinsic::amdgcn_loop:
7022   case Intrinsic::amdgcn_end_cf:
7069   SDNode *Intr = BRCOND.getOperand(1).getNode();
7082   assert(BR && "brcond missing unconditional branch user");
7083   Target = BR->getOperand(1);
7086   unsigned CFNode = isCFIntrinsic(Intr);
7105   Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7129   for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7142                   Intr->getOperand(0));
7148 MVT VT =
Op.getSimpleValueType();
7151 if (
Op.getConstantOperandVal(0) != 0)
7157 if (
Info->isEntryFunction())
7174 return Op.getValueType().bitsLE(VT)
7182 EVT DstVT =
Op.getValueType();
7189 unsigned Opc =
Op.getOpcode();
7201 EVT SrcVT = Src.getValueType();
7202 EVT DstVT =
Op.getValueType();
7208 return SrcVT == MVT::v2f32 ?
Op : splitFP_ROUNDVectorOp(
Op, DAG);
7215 if (DstVT == MVT::f16) {
7225 if (
Op->getFlags().hasApproximateFuncs()) {
7236 "custom lower FP_ROUND for f16 or bf16");
7250 EVT VT =
Op.getValueType();
7253 bool IsIEEEMode =
Info->getMode().IEEE;
7262 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7269SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(
SDValue Op,
7271 EVT VT =
Op.getValueType();
7274 bool IsIEEEMode =
Info->getMode().IEEE;
7279 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7287 EVT VT =
Op.getValueType();
7294 "should not need to widen f16 minimum/maximum to v2f16");
7308 DAG.
getNode(
Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7316 EVT VT =
Op.getValueType();
7320 EVT ExpVT =
Exp.getValueType();
7321 if (ExpVT == MVT::i16)
7342 {
Op.getOperand(0),
Op.getOperand(1), TruncExp});
7349 switch (
Op->getOpcode()) {
7379 DAGCombinerInfo &DCI)
const {
7380 const unsigned Opc =
Op.getOpcode();
7388 :
Op->getOperand(0).getValueType();
7391 if (DCI.isBeforeLegalizeOps() ||
7395 auto &DAG = DCI.DAG;
7401 LHS =
Op->getOperand(1);
7402 RHS =
Op->getOperand(2);
7404 LHS =
Op->getOperand(0);
7405 RHS =
Op->getOperand(1);
7420 return DAG.
getSetCC(
DL,
Op.getValueType(), LHS, RHS, CC);
7444 if (MagVT == SignVT)
7461   EVT VT = Op.getValueType();
7467   assert(VT == MVT::i64 && "The following code is special for s_mul_u64");
7494   if (Op->isDivergent())
7507   if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7509         DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7512   if (Op0SignBits >= 33 && Op1SignBits >= 33)
7514         DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
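// A minimal standalone sketch (not LLVM code) of the narrowing decision made
// above for uniform 64-bit multiplies: when both operands are known to fit in
// 32 bits (>= 32 leading zero bits, or >= 33 sign bits), a 32x32->64 multiply
// is sufficient. The helper names here are hypothetical.
#include <bit>
#include <cstdint>

enum class Mul64Kind { Full64, Unsigned32x32, Signed32x32 };

static int signBits(int64_t V) {
  // Bits equal to the sign bit, counted from the MSB (sign bit included),
  // mirroring what ComputeNumSignBits reports for a known constant.
  uint64_t U = static_cast<uint64_t>(V);
  return V < 0 ? std::countl_one(U) : std::countl_zero(U);
}

Mul64Kind classifyMul64(uint64_t A, uint64_t B) {
  if (std::countl_zero(A) >= 32 && std::countl_zero(B) >= 32)
    return Mul64Kind::Unsigned32x32; // maps to S_MUL_U64_U32 above
  if (signBits(static_cast<int64_t>(A)) >= 33 &&
      signBits(static_cast<int64_t>(B)) >= 33)
    return Mul64Kind::Signed32x32;   // maps to S_MUL_I64_I32 above
  return Mul64Kind::Full64;          // keep the full 64-bit multiply
}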
7520 EVT VT =
Op.getValueType();
7527 const APInt &
C = RHSC->getAPIntValue();
7529 if (
C.isPowerOf2()) {
7531 bool UseArithShift =
isSigned && !
C.isMinSignedValue();
7558 if (
Op->isDivergent()) {
7575 return lowerTrapEndpgm(
Op, DAG);
7578 : lowerTrapHsaQueuePtr(
Op, DAG);
7588SITargetLowering::loadImplicitKernelArgument(
SelectionDAG &DAG,
MVT VT,
7590 ImplicitParameter Param)
const {
7610 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
7616 if (UserSGPR == AMDGPU::NoRegister) {
7659 "debugtrap handler not supported",
7670SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
7674                                     ? AMDGPU::SRC_SHARED_BASE
7675                                     : AMDGPU::SRC_PRIVATE_BASE;
7676   assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
7678          "Cannot use src_private_base with globally addressable scratch!");
7701                    {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7710     return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7716   if (UserSGPR == AMDGPU::NoRegister) {
7746   if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
7749   if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7750     return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7764   unsigned DestAS, SrcAS;
7766   bool IsNonNull = false;
7767   if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7768     SrcAS = ASC->getSrcAddressSpace();
7769     Src = ASC->getOperand(0);
7770     DestAS = ASC->getDestAddressSpace();
7773            Op.getConstantOperandVal(0) ==
7774                Intrinsic::amdgcn_addrspacecast_nonnull);
7775     Src = Op->getOperand(1);
7776     SrcAS = Op->getConstantOperandVal(2);
7777     DestAS = Op->getConstantOperandVal(3);
7795                 AMDGPU::S_MOV_B32, SL, MVT::i32,
7796                 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
7804     unsigned NullVal = TM.getNullPointerValue(DestAS);
7842                 AMDGPU::S_MOV_B64, SL, MVT::i64,
7843                 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
7845       CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
7847     SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7855     unsigned NullVal = TM.getNullPointerValue(SrcAS);
7867       Op.getValueType() == MVT::i64) {
7876       Src.getValueType() == MVT::i64)
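// A minimal standalone sketch, under assumed semantics, of the cast lowered
// above: a 32-bit LDS/private ("segment") pointer becomes a 64-bit flat
// pointer by pairing it with the 32-bit aperture base in the high half, and
// the reverse cast truncates back to the low 32 bits. Null values are
// remapped explicitly, as the DAG code does via TM.getNullPointerValue();
// the null encodings below are assumptions for illustration.
#include <cstdint>

constexpr uint32_t SegmentNull = 0xffffffffu; // assumed segment null encoding
constexpr uint64_t FlatNull = 0;              // assumed flat null encoding

uint64_t segmentToFlat(uint32_t SegPtr, uint32_t ApertureHi) {
  if (SegPtr == SegmentNull)
    return FlatNull;
  return (static_cast<uint64_t>(ApertureHi) << 32) | SegPtr;
}

uint32_t flatToSegment(uint64_t FlatPtr) {
  if (FlatPtr == FlatNull)
    return SegmentNull;
  return static_cast<uint32_t>(FlatPtr); // drop the aperture half
}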
7896   EVT InsVT = Ins.getValueType();
7899   unsigned IdxVal = Idx->getAsZExtVal();
7904   assert(InsNumElts % 2 == 0 && "expect legal vector types");
7909   EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7911                                       MVT::i32, InsNumElts / 2);
7916   for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7918     if (InsNumElts == 2) {
7931   for (unsigned I = 0; I != InsNumElts; ++I) {
7953   auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
7954   if (NumElts == 4 && EltSize == 16 && KIdx) {
7965     unsigned Idx = KIdx->getZExtValue();
7966     bool InsertLo = Idx < 2;
7983   if (isa<ConstantSDNode>(Idx))
7989   assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7995   const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
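// A minimal standalone sketch of the select-free insert strategy above: for
// small vectors that fit in 64 bits, an element at a variable index can be
// inserted by viewing the vector as an integer and using a shifted mask of
// EltSize trailing ones (maskTrailingOnes), so no per-element compare/select
// chain is needed.
#include <cstdint>

uint64_t insertElementBits(uint64_t PackedVec, uint64_t Elt, unsigned EltSize,
                           unsigned Index) {
  const uint64_t EltMask =
      EltSize == 64 ? ~0ull : ((1ull << EltSize) - 1); // maskTrailingOnes
  const unsigned Shift = Index * EltSize;              // dynamic bit offset
  PackedVec &= ~(EltMask << Shift);                    // clear the slot
  PackedVec |= (Elt & EltMask) << Shift;               // drop in the element
  return PackedVec;
}

// e.g. inserting 0xBEEF at index 2 of a v4i16 held in a 64-bit scalar:
//   insertElementBits(Vec, 0xBEEF, 16, 2)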
8024 EVT ResultVT =
Op.getValueType();
8037 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
8040 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8044 if (VecSize == 128) {
8052 }
else if (VecSize == 256) {
8055 for (
unsigned P = 0;
P < 4; ++
P) {
8061 Parts[0], Parts[1]));
8063 Parts[2], Parts[3]));
8069 for (
unsigned P = 0;
P < 8; ++
P) {
8076 Parts[0], Parts[1], Parts[2], Parts[3]));
8079 Parts[4], Parts[5], Parts[6], Parts[7]));
8082 EVT IdxVT =
Idx.getValueType();
8099 Src = DAG.
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8114 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8124 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8129 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8130 !(Mask[Elt + 1] & 1);
8136 EVT ResultVT =
Op.getValueType();
8139 const int NewSrcNumElts = 2;
8141 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
8157 const bool ShouldUseConsecutiveExtract = EltVT.
getSizeInBits() == 16;
8179 if (ShouldUseConsecutiveExtract &&
8182 int VecIdx =
Idx < SrcNumElts ? 0 : 1;
8183 int EltIdx =
Idx < SrcNumElts ?
Idx :
Idx - SrcNumElts;
8195 if (Idx0 >= SrcNumElts) {
8200 if (Idx1 >= SrcNumElts) {
8205 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8206 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8214 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8215 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8220 if (SubVec0 != SubVec1) {
8221 NewMaskIdx1 += NewSrcNumElts;
8228 {NewMaskIdx0, NewMaskIdx1});
8233 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8234 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8235 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8236 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8255 EVT ResultVT =
Op.getValueType();
8271 EVT VT =
Op.getValueType();
8273 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8308 for (
unsigned P = 0;
P < NumParts; ++
P) {
8310 PartVT, SL, {
Op.getOperand(
P * 2),
Op.getOperand(
P * 2 + 1)});
8343 assert(isInt<32>(
Offset + 4) &&
"32-bit offset is expected!");
8389 EVT PtrVT =
Op.getValueType();
8405 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
8483 SDValue Param = lowerKernargMemParameter(
8494 "non-hsa intrinsic with hsa target",
DL.getDebugLoc()));
8502 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
8510 unsigned NumElts = Elts.
size();
8512 if (NumElts <= 12) {
8521 for (
unsigned i = 0; i < Elts.
size(); ++i) {
8527 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
8537 EVT SrcVT = Src.getValueType();
8558 bool Unpacked,
bool IsD16,
int DMaskPop,
8559 int NumVDataDwords,
bool IsAtomicPacked16Bit,
8563 EVT ReqRetVT = ResultTypes[0];
8565 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
8566 ? (ReqRetNumElts + 1) / 2
8569 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
8580 if (DMaskPop > 0 &&
Data.getValueType() != MaskPopVT) {
8591 if (DataDwordVT.
isVector() && !IsAtomicPacked16Bit)
8593 NumDataDwords - MaskPopDwords);
8598 EVT LegalReqRetVT = ReqRetVT;
8600 if (!
Data.getValueType().isInteger())
8602 Data.getValueType().changeTypeToInteger(),
Data);
8623 if (Result->getNumValues() == 1)
8630 SDValue *LWE,
bool &IsTexFail) {
8631 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.
getNode());
8650 unsigned DimIdx,
unsigned EndIdx,
8651 unsigned NumGradients) {
8653 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
8661 if (((
I + 1) >= EndIdx) ||
8662 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
8663 I == DimIdx + NumGradients - 1))) {
8664 if (
Addr.getValueType() != MVT::i16)
8685 unsigned IntrOpcode =
Intr->BaseOpcode;
8696   int NumVDataDwords = 0;
8697   bool AdjustRetType = false;
8698   bool IsAtomicPacked16Bit = false;
8701   const unsigned ArgOffset = WithChain ? 2 : 1;
8704   unsigned DMaskLanes = 0;
8706   if (BaseOpcode->Atomic) {
8707     VData = Op.getOperand(2);
8709     IsAtomicPacked16Bit =
8710         (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8711          Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8714     if (BaseOpcode->AtomicX2) {
8721       ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8722       DMask = Is64Bit ? 0xf : 0x3;
8723       NumVDataDwords = Is64Bit ? 4 : 2;
8725       DMask = Is64Bit ? 0x3 : 0x1;
8726       NumVDataDwords = Is64Bit ? 2 : 1;
8729     DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
8732     if (BaseOpcode->Store) {
8733       VData = Op.getOperand(2);
8741         VData = handleD16VData(VData, DAG, true);
8745     } else if (!BaseOpcode->NoReturn) {
8758           (!LoadVT.isVector() && DMaskLanes > 1))
8766         NumVDataDwords = (DMaskLanes + 1) / 2;
8768         NumVDataDwords = DMaskLanes;
8770       AdjustRetType = true;
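// A minimal standalone sketch of the return-size computation above: the dmask
// selects which channels an image load returns, and in packed D16 mode two
// 16-bit channels share one 32-bit VGPR, so the dword count is halved and
// rounded up.
#include <bit>
#include <cstdint>

unsigned numVDataDwords(uint32_t DMask, bool IsD16, bool Unpacked) {
  unsigned DMaskLanes = std::popcount(DMask & 0xfu); // enabled channels
  if (IsD16 && !Unpacked)
    return (DMaskLanes + 1) / 2; // packed 16-bit data: two channels per dword
  return DMaskLanes;             // 32-bit (or unpacked d16) data: one each
}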
8774 unsigned VAddrEnd = ArgOffset +
Intr->VAddrEnd;
8779 Op.getOperand(ArgOffset +
Intr->GradientStart).getSimpleValueType();
8781 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8782 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8784 VAddrVT =
Op.getOperand(ArgOffset +
Intr->CoordStart).getSimpleValueType();
8786 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8787 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8790 for (
unsigned I =
Intr->VAddrStart; I < Intr->GradientStart;
I++) {
8791 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
8792 assert(
I ==
Intr->BiasIndex &&
"Got unexpected 16-bit extra argument");
8797 {
Op.getOperand(ArgOffset +
I), DAG.
getPOISON(MVT::f16)});
8801 "Bias needs to be converted to 16 bit in A16 mode");
8806 if (BaseOpcode->Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
8810 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
8811 "require 16 bit args for both gradients and addresses");
8816 if (!
ST->hasA16()) {
8817 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
8818 "support 16 bit addresses\n");
8828 if (BaseOpcode->Gradients && IsG16 &&
ST->hasG16()) {
8832 IntrOpcode = G16MappingInfo->
G16;
8840 ArgOffset +
Intr->GradientStart,
8841 ArgOffset +
Intr->CoordStart,
Intr->NumGradients);
8843 for (
unsigned I = ArgOffset +
Intr->GradientStart;
8844 I < ArgOffset + Intr->CoordStart;
I++)
8851 ArgOffset +
Intr->CoordStart, VAddrEnd,
8855 for (
unsigned I = ArgOffset +
Intr->CoordStart;
I < VAddrEnd;
I++)
8873   const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8874   const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8875   const bool UseNSA = ST->hasNSAEncoding() &&
8876                       VAddrs.size() >= ST->getNSAThreshold(MF) &&
8877                       (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8878   const bool UsePartialNSA =
8879       UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
8882   if (UsePartialNSA) {
8884                              ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8885   } else if (!UseNSA) {
8892 if (!BaseOpcode->Sampler) {
8896 Op.getConstantOperandVal(ArgOffset +
Intr->UnormIndex);
8898 Unorm = UnormConst ? True : False;
8903 SDValue TexFail =
Op.getOperand(ArgOffset +
Intr->TexFailCtrlIndex);
8904 bool IsTexFail =
false;
8905 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8916 NumVDataDwords += 1;
8917 AdjustRetType =
true;
8922 if (AdjustRetType) {
8925 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8928 if (isa<MemSDNode>(
Op))
8934 MVT::i32, NumVDataDwords)
8937 ResultTypes[0] = NewVT;
8938 if (ResultTypes.size() == 3) {
8942 ResultTypes.erase(&ResultTypes[1]);
8946 unsigned CPol =
Op.getConstantOperandVal(ArgOffset +
Intr->CachePolicyIndex);
8947 if (BaseOpcode->Atomic)
8954 if (BaseOpcode->Store || BaseOpcode->Atomic)
8956 if (UsePartialNSA) {
8965 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
8968 if (BaseOpcode->Sampler) {
8977 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8981 ST->hasFeature(AMDGPU::FeatureR128A16)
8992 "TFE is not supported on this GPU",
DL.getDebugLoc()));
8995 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8999 if (BaseOpcode->HasD16)
9001 if (isa<MemSDNode>(
Op))
9004 int NumVAddrDwords =
9010 NumVDataDwords, NumVAddrDwords);
9011 }
else if (IsGFX11Plus) {
9013 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9014 : AMDGPU::MIMGEncGfx11Default,
9015 NumVDataDwords, NumVAddrDwords);
9016 }
else if (IsGFX10Plus) {
9018 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9019 : AMDGPU::MIMGEncGfx10Default,
9020 NumVDataDwords, NumVAddrDwords);
9024 NumVDataDwords, NumVAddrDwords);
9028 "requested image instruction is not supported on this GPU",
9033 for (
EVT VT : OrigResultTypes) {
9034 if (VT == MVT::Other)
9035 RetValues[
Idx++] =
Op.getOperand(0);
9046 NumVDataDwords, NumVAddrDwords);
9049 NumVDataDwords, NumVAddrDwords);
9055 if (
auto *
MemOp = dyn_cast<MemSDNode>(
Op)) {
9060 if (BaseOpcode->AtomicX2) {
9065 if (BaseOpcode->NoReturn)
9069 NumVDataDwords, IsAtomicPacked16Bit,
DL);
9087 if (!
Offset->isDivergent()) {
9132 return handleByteShortBufferLoads(DAG, VT,
DL, Ops, MMO);
9136 unsigned NumLoads = 1;
9142 if (NumElts == 8 || NumElts == 16) {
9143 NumLoads = NumElts / 4;
9151 setBufferOffsets(
Offset, DAG, &Ops[3],
9152 NumLoads > 1 ?
Align(16 * NumLoads) :
Align(4));
9155 for (
unsigned i = 0; i < NumLoads; ++i) {
9161 if (NumElts == 8 || NumElts == 16)
9213 EVT VT =
Op.getValueType();
9215 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
9219 switch (IntrinsicID) {
9220 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9223 return getPreloadedValue(DAG, *MFI, VT,
9226 case Intrinsic::amdgcn_dispatch_ptr:
9227 case Intrinsic::amdgcn_queue_ptr: {
9230 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
9235 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9238 return getPreloadedValue(DAG, *MFI, VT, RegID);
9240 case Intrinsic::amdgcn_implicitarg_ptr: {
9242 return getImplicitArgPtr(DAG,
DL);
9243 return getPreloadedValue(DAG, *MFI, VT,
9246 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9252 return getPreloadedValue(DAG, *MFI, VT,
9255 case Intrinsic::amdgcn_dispatch_id: {
9258 case Intrinsic::amdgcn_rcp:
9260 case Intrinsic::amdgcn_rsq:
9262 case Intrinsic::amdgcn_rsq_legacy:
9266 case Intrinsic::amdgcn_rcp_legacy:
9270 case Intrinsic::amdgcn_rsq_clamp: {
9284 case Intrinsic::r600_read_ngroups_x:
9288 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9291 case Intrinsic::r600_read_ngroups_y:
9295 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9298 case Intrinsic::r600_read_ngroups_z:
9302 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9305 case Intrinsic::r600_read_local_size_x:
9309 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9311 case Intrinsic::r600_read_local_size_y:
9315 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9317 case Intrinsic::r600_read_local_size_z:
9321 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9323 case Intrinsic::amdgcn_workgroup_id_x:
9324 return getPreloadedValue(DAG, *MFI, VT,
9326 case Intrinsic::amdgcn_workgroup_id_y:
9327 return getPreloadedValue(DAG, *MFI, VT,
9329 case Intrinsic::amdgcn_workgroup_id_z:
9330 return getPreloadedValue(DAG, *MFI, VT,
9332 case Intrinsic::amdgcn_wave_id:
9333 return lowerWaveID(DAG,
Op);
9334 case Intrinsic::amdgcn_lds_kernel_id: {
9336 return getLDSKernelId(DAG,
DL);
9337 return getPreloadedValue(DAG, *MFI, VT,
9340 case Intrinsic::amdgcn_workitem_id_x:
9341 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
9342 case Intrinsic::amdgcn_workitem_id_y:
9343 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
9344 case Intrinsic::amdgcn_workitem_id_z:
9345 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
9346 case Intrinsic::amdgcn_wavefrontsize:
9349 case Intrinsic::amdgcn_s_buffer_load: {
9350 unsigned CPol =
Op.getConstantOperandVal(3);
9357 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
9358 Op.getOperand(3), DAG);
9360 case Intrinsic::amdgcn_fdiv_fast:
9361 return lowerFDIV_FAST(
Op, DAG);
9362 case Intrinsic::amdgcn_sin:
9365 case Intrinsic::amdgcn_cos:
9368 case Intrinsic::amdgcn_mul_u24:
9371 case Intrinsic::amdgcn_mul_i24:
9375 case Intrinsic::amdgcn_log_clamp: {
9381 case Intrinsic::amdgcn_fract:
9384 case Intrinsic::amdgcn_class:
9387 case Intrinsic::amdgcn_div_fmas:
9389 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
9391 case Intrinsic::amdgcn_div_fixup:
9393 Op.getOperand(2),
Op.getOperand(3));
9395 case Intrinsic::amdgcn_div_scale: {
9408 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
9411 Denominator, Numerator);
9413 case Intrinsic::amdgcn_icmp: {
9415 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
9416 Op.getConstantOperandVal(2) == 0 &&
9421 case Intrinsic::amdgcn_fcmp: {
9424 case Intrinsic::amdgcn_ballot:
9426 case Intrinsic::amdgcn_fmed3:
9428 Op.getOperand(2),
Op.getOperand(3));
9429 case Intrinsic::amdgcn_fdot2:
9431 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
9432 case Intrinsic::amdgcn_fmul_legacy:
9435 case Intrinsic::amdgcn_sffbh:
9437 case Intrinsic::amdgcn_sbfe:
9439 Op.getOperand(2),
Op.getOperand(3));
9440 case Intrinsic::amdgcn_ubfe:
9442 Op.getOperand(2),
Op.getOperand(3));
9443 case Intrinsic::amdgcn_cvt_pkrtz:
9444 case Intrinsic::amdgcn_cvt_pknorm_i16:
9445 case Intrinsic::amdgcn_cvt_pknorm_u16:
9446 case Intrinsic::amdgcn_cvt_pk_i16:
9447 case Intrinsic::amdgcn_cvt_pk_u16: {
9449 EVT VT =
Op.getValueType();
9452 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
9454 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
9456 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
9458 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
9464 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
9467 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
9470 case Intrinsic::amdgcn_fmad_ftz:
9472 Op.getOperand(2),
Op.getOperand(3));
9474 case Intrinsic::amdgcn_if_break:
9476 Op->getOperand(1),
Op->getOperand(2)),
9479 case Intrinsic::amdgcn_groupstaticsize: {
9491 case Intrinsic::amdgcn_is_shared:
9492 case Intrinsic::amdgcn_is_private: {
9499 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
9506 AMDGPU::S_MOV_B32,
DL, MVT::i32,
9507 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
9516 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
9519 case Intrinsic::amdgcn_perm:
9521 Op.getOperand(2),
Op.getOperand(3));
9522 case Intrinsic::amdgcn_reloc_constant: {
9526 auto *RelocSymbol = cast<GlobalVariable>(
9532 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
9533 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
9534 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
9535 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
9536 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
9537 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
9538 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
9539 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
9540 if (
Op.getOperand(4).getValueType() == MVT::i32)
9546 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
9547 Op.getOperand(3), IndexKeyi32);
9549 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
9550 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
9551 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
9552 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
9553 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
9554 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
9555 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
9556 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
9557 if (
Op.getOperand(4).getValueType() == MVT::i64)
9563 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9564 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
9567 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
9568 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
9569 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
9570 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
9571 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
9572 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
9573 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
9576 if (
Op.getOperand(6).getValueType() == IndexKeyTy)
9582 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9583 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
9584 IndexKey, Op.getOperand(7),
9587 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
9588 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
9589 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
9590 if (
Op.getOperand(6).getValueType() == MVT::i32)
9596 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9597 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
9598 IndexKeyi32, Op.getOperand(7)});
9600 case Intrinsic::amdgcn_addrspacecast_nonnull:
9601 return lowerADDRSPACECAST(
Op, DAG);
9602 case Intrinsic::amdgcn_readlane:
9603 case Intrinsic::amdgcn_readfirstlane:
9604 case Intrinsic::amdgcn_writelane:
9605 case Intrinsic::amdgcn_permlane16:
9606 case Intrinsic::amdgcn_permlanex16:
9607 case Intrinsic::amdgcn_permlane64:
9608 case Intrinsic::amdgcn_set_inactive:
9609 case Intrinsic::amdgcn_set_inactive_chain_arg:
9610 case Intrinsic::amdgcn_mov_dpp8:
9611 case Intrinsic::amdgcn_update_dpp:
9613 case Intrinsic::amdgcn_dead: {
9615 for (
const EVT ValTy :
Op.getNode()->values())
9622 return lowerImage(
Op, ImageDimIntr, DAG,
false);
9633 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
9639 unsigned NewOpcode)
const {
9643 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9644 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9658 auto *
M = cast<MemSDNode>(
Op);
9662 M->getMemOperand());
9667 unsigned NewOpcode)
const {
9671 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9672 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
9686 auto *
M = cast<MemSDNode>(
Op);
9690 M->getMemOperand());
9695   unsigned IntrID = Op.getConstantOperandVal(1);
9699   case Intrinsic::amdgcn_ds_ordered_add:
9700   case Intrinsic::amdgcn_ds_ordered_swap: {
9705     unsigned IndexOperand = M->getConstantOperandVal(7);
9706     unsigned WaveRelease = M->getConstantOperandVal(8);
9707     unsigned WaveDone = M->getConstantOperandVal(9);
9709     unsigned OrderedCountIndex = IndexOperand & 0x3f;
9710     IndexOperand &= ~0x3f;
9711     unsigned CountDw = 0;
9714       CountDw = (IndexOperand >> 24) & 0xf;
9715       IndexOperand &= ~(0xf << 24);
9717       if (CountDw < 1 || CountDw > 4) {
9720             Fn, "ds_ordered_count: dword count must be between 1 and 4",
9729           Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
9732     if (WaveDone && !WaveRelease) {
9736           Fn, "ds_ordered_count: wave_done requires wave_release",
9740     unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
9741     unsigned ShaderType =
9743     unsigned Offset0 = OrderedCountIndex << 2;
9744     unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
9747       Offset1 |= (CountDw - 1) << 6;
9750       Offset1 |= ShaderType << 2;
9752     unsigned Offset = Offset0 | (Offset1 << 8);
9759                                    M->getVTList(), Ops, M->getMemoryVT(),
9760                                    M->getMemOperand());
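// A minimal standalone sketch of the immediate-offset encoding assembled
// above for ds_ordered_count: the ordered-count index lands in offset0, and
// the control bits (wave_release, wave_done, shader type, instruction kind,
// dword count) are packed into offset1, which occupies the high byte. Note
// that in the lowering above some of these fields are only set on certain
// subtargets; this sketch packs them unconditionally for illustration.
#include <cstdint>

uint32_t encodeDsOrderedOffset(unsigned OrderedCountIndex, bool WaveRelease,
                               bool WaveDone, unsigned ShaderType,
                               bool IsSwap, unsigned CountDw) {
  uint32_t Offset0 = OrderedCountIndex << 2;
  uint32_t Offset1 = (WaveRelease ? 1u : 0u) | ((WaveDone ? 1u : 0u) << 1) |
                     (ShaderType << 2) | ((IsSwap ? 1u : 0u) << 4) |
                     ((CountDw - 1) << 6);
  return Offset0 | (Offset1 << 8);
}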
9762 case Intrinsic::amdgcn_raw_buffer_load:
9763 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9764 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9765 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9766 case Intrinsic::amdgcn_raw_buffer_load_format:
9767 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9768 const bool IsFormat =
9769 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9770 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9772 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9773 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
9786 auto *
M = cast<MemSDNode>(
Op);
9787 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9789 case Intrinsic::amdgcn_struct_buffer_load:
9790 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9791 case Intrinsic::amdgcn_struct_buffer_load_format:
9792 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9793 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9794 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9795 const bool IsFormat =
9796 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9797 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9799 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9800 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9813 return lowerIntrinsicLoad(cast<MemSDNode>(
Op), IsFormat, DAG, Ops);
9815 case Intrinsic::amdgcn_raw_tbuffer_load:
9816 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9818 EVT LoadVT =
Op.getValueType();
9819 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9820 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
9839 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
9842 case Intrinsic::amdgcn_struct_tbuffer_load:
9843 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9845 EVT LoadVT =
Op.getValueType();
9846 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9847 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9866 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
9869 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9870 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9872 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9873 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9874 return lowerStructBufferAtomicIntrin(
Op, DAG,
9876 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9877 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9879 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9880 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9881 return lowerStructBufferAtomicIntrin(
Op, DAG,
9883 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9884 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9886 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9887 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9888 return lowerStructBufferAtomicIntrin(
Op, DAG,
9890 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9891 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9893 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9894 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9896 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9897 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9899 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9900 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9902 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9903 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9905 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9906 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9908 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9909 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9911 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9914 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9915 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9917 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9918 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9920 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9921 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9923 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9924 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9926 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9927 return lowerRawBufferAtomicIntrin(
Op, DAG,
9929 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9930 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9931 return lowerStructBufferAtomicIntrin(
Op, DAG,
9933 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9934 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9936 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9937 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9939 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9940 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9941 return lowerStructBufferAtomicIntrin(
Op, DAG,
9943 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9944 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9945 return lowerStructBufferAtomicIntrin(
Op, DAG,
9947 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9948 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9949 return lowerStructBufferAtomicIntrin(
Op, DAG,
9951 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9952 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9953 return lowerStructBufferAtomicIntrin(
Op, DAG,
9955 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9956 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9958 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9959 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9961 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9962 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9964 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9965 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9967 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9968 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9970 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9971 return lowerStructBufferAtomicIntrin(
Op, DAG,
9974 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9975 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9976 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
9977 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
9991 EVT VT =
Op.getValueType();
9992 auto *
M = cast<MemSDNode>(
Op);
9995 Op->getVTList(), Ops, VT,
9996 M->getMemOperand());
9998 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9999 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10000 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
10001 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(6), DAG);
10015 EVT VT =
Op.getValueType();
10016 auto *
M = cast<MemSDNode>(
Op);
10019 Op->getVTList(), Ops, VT,
10020 M->getMemOperand());
10022 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10023 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10025 SDValue NodePtr =
M->getOperand(2);
10026 SDValue RayExtent =
M->getOperand(3);
10027 SDValue InstanceMask =
M->getOperand(4);
10028 SDValue RayOrigin =
M->getOperand(5);
10029 SDValue RayDir =
M->getOperand(6);
10031 SDValue TDescr =
M->getOperand(8);
10041 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10042 const unsigned NumVDataDwords = 10;
10043 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10045 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10046 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10047 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10054 {DAG.getBitcast(MVT::i32, RayExtent),
10055 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10067 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10069 SDValue NodePtr =
M->getOperand(2);
10070 SDValue RayExtent =
M->getOperand(3);
10071 SDValue RayOrigin =
M->getOperand(4);
10072 SDValue RayDir =
M->getOperand(5);
10073 SDValue RayInvDir =
M->getOperand(6);
10074 SDValue TDescr =
M->getOperand(7);
10091 const unsigned NumVDataDwords = 4;
10092 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10093 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10097 const unsigned BaseOpcodes[2][2] = {
10098 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10099 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10100 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10104 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10105 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10106 : AMDGPU::MIMGEncGfx10NSA,
10107 NumVDataDwords, NumVAddrDwords);
10111 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10112 : AMDGPU::MIMGEncGfx10Default,
10113 NumVDataDwords, NumVAddrDwords);
10119 auto packLanes = [&DAG, &Ops, &
DL](
SDValue Op,
bool IsAligned) {
10122 if (Lanes[0].getValueSizeInBits() == 32) {
10123 for (
unsigned I = 0;
I < 3; ++
I)
10142 if (UseNSA && IsGFX11Plus) {
10150 for (
unsigned I = 0;
I < 3; ++
I) {
10153 {DirLanes[I], InvDirLanes[I]})));
10168 packLanes(RayOrigin,
true);
10169 packLanes(RayDir,
true);
10170 packLanes(RayInvDir,
false);
10175 if (NumVAddrDwords > 12) {
10195 case Intrinsic::amdgcn_global_atomic_fmin_num:
10196 case Intrinsic::amdgcn_global_atomic_fmax_num:
10197 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10198 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10205 unsigned Opcode = 0;
10207 case Intrinsic::amdgcn_global_atomic_fmin_num:
10208 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10212 case Intrinsic::amdgcn_global_atomic_fmax_num:
10213 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10221 Ops,
M->getMemOperand());
10223   case Intrinsic::amdgcn_s_get_barrier_state:
10224   case Intrinsic::amdgcn_s_get_named_barrier_state: {
10229     if (isa<ConstantSDNode>(Op->getOperand(2))) {
10230       uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
10231       if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10232         BarID = (BarID >> 4) & 0x3F;
10233       Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10238       Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10239 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10259 return lowerImage(
Op, ImageDimIntr, DAG,
true);
10267SDValue SITargetLowering::getMemIntrinsicNode(
unsigned Opcode,
const SDLoc &
DL,
10277 bool IsTFE = VTList.
NumVTs == 3;
10280 unsigned NumOpDWords = NumValueDWords + 1;
10285 SDValue Op = getMemIntrinsicNode(Opcode,
DL, OpDWordsVTList, Ops,
10286 OpDWordsVT, OpDWordsMMO, DAG);
10291 NumValueDWords == 1
10301 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10307 WidenedMemVT, WidenedMMO);
10317 bool ImageStore)
const {
10352 for (
unsigned I = 0;
I < Elts.
size() / 2;
I += 1) {
10358 if ((NumElements % 2) == 1) {
10360 unsigned I = Elts.
size() / 2;
10376 if (NumElements == 3) {
10397 unsigned IntrinsicID =
Op.getConstantOperandVal(1);
10400 switch (IntrinsicID) {
10401 case Intrinsic::amdgcn_exp_compr: {
10405 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
10427 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10431 case Intrinsic::amdgcn_struct_tbuffer_store:
10432 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10436 VData = handleD16VData(VData, DAG);
10437 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10438 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10456 M->getMemoryVT(),
M->getMemOperand());
10459 case Intrinsic::amdgcn_raw_tbuffer_store:
10460 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
10464 VData = handleD16VData(VData, DAG);
10465 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10466 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10484 M->getMemoryVT(),
M->getMemOperand());
10487 case Intrinsic::amdgcn_raw_buffer_store:
10488 case Intrinsic::amdgcn_raw_ptr_buffer_store:
10489 case Intrinsic::amdgcn_raw_buffer_store_format:
10490 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
10491 const bool IsFormat =
10492 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
10493 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
10500 VData = handleD16VData(VData, DAG);
10510 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10511 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10531 return handleByteShortBufferStores(DAG, VDataVT,
DL, Ops, M);
10534 M->getMemoryVT(),
M->getMemOperand());
10537 case Intrinsic::amdgcn_struct_buffer_store:
10538 case Intrinsic::amdgcn_struct_ptr_buffer_store:
10539 case Intrinsic::amdgcn_struct_buffer_store_format:
10540 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
10541 const bool IsFormat =
10542 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
10543 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
10551 VData = handleD16VData(VData, DAG);
10561 auto Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10562 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10583 return handleByteShortBufferStores(DAG, VDataType,
DL, Ops, M);
10586 M->getMemoryVT(),
M->getMemOperand());
10588 case Intrinsic::amdgcn_raw_buffer_load_lds:
10589 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
10590 case Intrinsic::amdgcn_struct_buffer_load_lds:
10591 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
10596 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
10597 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
10598 unsigned OpOffset = HasVIndex ? 1 : 0;
10599 SDValue VOffset =
Op.getOperand(5 + OpOffset);
10601 unsigned Size =
Op->getConstantOperandVal(4);
10607 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
10608 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
10609 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
10610 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
10613 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
10614 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
10615 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
10616 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
10619 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
10620 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
10621 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
10622 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
10627 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
10628 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
10629 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
10630 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
10635 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
10636 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
10637 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
10638 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
10646 if (HasVIndex && HasVOffset)
10650 else if (HasVIndex)
10652 else if (HasVOffset)
10655 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10660 unsigned Aux =
Op.getConstantOperandVal(8 + OpOffset);
10672 auto *
M = cast<MemSDNode>(
Op);
10702 case Intrinsic::amdgcn_load_to_lds:
10703 case Intrinsic::amdgcn_global_load_lds: {
10708 unsigned Size =
Op->getConstantOperandVal(4);
10713 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
10716 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
10719 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
10724 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
10729 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10745 if (
LHS->isDivergent())
10749 RHS.getOperand(0).getValueType() == MVT::i32) {
10752 VOffset =
RHS.getOperand(0);
10757 if (!
Addr->isDivergent()) {
10772 auto *
M = cast<MemSDNode>(
Op);
10775 LoadPtrI.
Offset =
Op->getConstantOperandVal(5);
10795 case Intrinsic::amdgcn_end_cf:
10797 Op->getOperand(2), Chain),
10799 case Intrinsic::amdgcn_s_barrier_init:
10800 case Intrinsic::amdgcn_s_barrier_signal_var: {
10807 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10808 ? AMDGPU::S_BARRIER_INIT_M0
10809 : AMDGPU::S_BARRIER_SIGNAL_M0;
10824 constexpr unsigned ShAmt = 16;
10836 case Intrinsic::amdgcn_s_barrier_join: {
10843 if (isa<ConstantSDNode>(BarOp)) {
10844 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
10845 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10848 unsigned BarID = (BarVal >> 4) & 0x3F;
10853 Opc = AMDGPU::S_BARRIER_JOIN_M0;
10869 case Intrinsic::amdgcn_s_prefetch_data: {
10872 return Op.getOperand(0);
10875 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10877 Chain, bufferRsrcPtrToVector(
Op.getOperand(2), DAG),
10884 Op->getVTList(), Ops,
M->getMemoryVT(),
10885 M->getMemOperand());
10890 return lowerImage(
Op, ImageDimIntr, DAG,
true);
10900 Addr->getFlags().hasNoUnsignedWrap()) ||
10915std::pair<SDValue, SDValue>
10922   if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10930     C1 = cast<ConstantSDNode>(N0.getOperand(1));
10945     unsigned Overflow = ImmOffset & ~MaxImm;
10946     ImmOffset -= Overflow;
10947     if ((int32_t)Overflow < 0) {
10948       Overflow += ImmOffset;
10953       auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10957       SDValue Ops[] = {N0, OverflowVal};
10972void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10974                                         Align Alignment) const {
10977   if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10980     if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10991     int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10993         TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
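// A minimal standalone sketch of the offset split performed above: a combined
// byte offset is divided into the part that fits the MUBUF immediate field
// and an overflow that must go into a register (soffset/voffset). MaxImm is a
// placeholder for the subtarget's immediate-field limit, and the extra
// alignment adjustment done by the DAG code is omitted here.
#include <cstdint>
#include <utility>

std::pair<uint32_t, uint32_t> splitBufferOffset(uint32_t CombinedOffset,
                                                uint32_t MaxImm = 4095) {
  uint32_t ImmOffset = CombinedOffset;
  uint32_t Overflow = ImmOffset & ~MaxImm; // bits the immediate cannot hold
  ImmOffset -= Overflow;                   // keep only the encodable part
  return {Overflow, ImmOffset};            // {register part, immediate part}
}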
11010SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11013     return MaybePointer;
11027   SDValue NumRecords = Op->getOperand(3);
11030   auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11033   std::optional<uint32_t> ConstStride = std::nullopt;
11034   if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
11035     ConstStride = ConstNode->getZExtValue();
11038   if (!ConstStride || *ConstStride != 0) {
11041       ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
11052                             NewHighHalf, NumRecords, Flags);
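// A minimal standalone sketch of the descriptor built above: a buffer
// resource is four 32-bit words, where the base address fills word0 and the
// low bits of word1, the stride is shifted into the upper half of word1, and
// numRecords/flags fill words 2 and 3. The field positions are illustrative
// assumptions, not a complete hardware encoding.
#include <array>
#include <cstdint>

std::array<uint32_t, 4> makeBufferRsrc(uint64_t BaseAddr, uint16_t Stride,
                                       uint32_t NumRecords, uint32_t Flags) {
  uint32_t LowHalf = static_cast<uint32_t>(BaseAddr);
  uint32_t HighHalf = static_cast<uint32_t>(BaseAddr >> 32) & 0xffffu;
  uint32_t NewHighHalf = HighHalf | (static_cast<uint32_t>(Stride) << 16);
  return {LowHalf, NewHighHalf, NumRecords, Flags};
}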
11062 bool IsTFE)
const {
11072 SDValue Op = getMemIntrinsicNode(
Opc,
DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11100 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11104 Ops[1] = BufferStoreExt;
11109 M->getMemOperand());
11134 DAGCombinerInfo &DCI)
const {
11150 if ((MemVT.
isSimple() && !DCI.isAfterLegalizeDAG()) ||
11157 "unexpected vector extload");
11170 "unexpected fp extload");
11188 DCI.AddToWorklist(Cvt.
getNode());
11193 DCI.AddToWorklist(Cvt.
getNode());
11204 if (
Info.isEntryFunction())
11205 return Info.getUserSGPRInfo().hasFlatScratchInit();
11213 EVT MemVT =
Load->getMemoryVT();
11226 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11254 assert(
Op.getValueType().getVectorElementType() == MVT::i32 &&
11255 "Custom lowering for non-i32 vectors hasn't been implemented.");
11258 unsigned AS =
Load->getAddressSpace();
11282 Alignment >=
Align(4) && NumElements < 32) {
11296 if (NumElements > 4)
11315 if (NumElements > 2)
11320 if (NumElements > 4)
11332 auto Flags =
Load->getMemOperand()->getFlags();
11334 Load->getAlign(), Flags, &
Fast) &&
11343 MemVT, *
Load->getMemOperand())) {
11352 EVT VT =
Op.getValueType();
11389 EVT VT =
Op.getValueType();
11392 bool AllowInaccurateRcp =
Flags.hasApproximateFuncs();
11398 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11401 if (CLHS->isExactlyValue(1.0)) {
11418 if (CLHS->isExactlyValue(-1.0)) {
11427 if (!AllowInaccurateRcp &&
11428 ((VT != MVT::f16 && VT != MVT::bf16) || !
Flags.hasAllowReciprocal()))
11442 EVT VT =
Op.getValueType();
11445 bool AllowInaccurateDiv =
Flags.hasApproximateFuncs();
11446 if (!AllowInaccurateDiv)
11467 return DAG.
getNode(Opcode, SL, VT,
A,
B, Flags);
11481 return DAG.
getNode(Opcode, SL, VTList,
11490 return DAG.
getNode(Opcode, SL, VT, {
A,
B,
C}, Flags);
11504 return DAG.
getNode(Opcode, SL, VTList,
11510 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
11511 return FastLowered;
11514 EVT VT =
Op.getValueType();
11521 if (VT == MVT::bf16) {
11544 unsigned FMADOpCode =
11551 SDValue Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
11553 Quot = DAG.
getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot,
Op->getFlags());
11554 Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
11577 const APFloat K0Val(0x1p+96f);
11580 const APFloat K1Val(0x1p-32f);
11607   assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
11608   uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
11609   uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
11614   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11615     return FastLowered;
11622   Flags.setNoFPExcept(true);
11643   using namespace AMDGPU::Hwreg;
11644   const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
11652   const bool HasDynamicDenormals =
11658   if (!PreservesDenormals) {
11666     if (HasDynamicDenormals) {
11670       SavedDenormMode = SDValue(GetReg, 0);
11678       const SDValue EnableDenormValue =
11685       const SDValue EnableDenormValue =
11687       EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
11688                                         {EnableDenormValue, BitField, Glue});
11698                              ApproxRcp, One, NegDivScale0, Flags);
11701                              ApproxRcp, Fma0, Flags);
11707                              NumeratorScaled, Mul, Flags);
11713                              NumeratorScaled, Fma3, Flags);
11715   if (!PreservesDenormals) {
11727     assert(HasDynamicDenormals == (bool)SavedDenormMode);
11728     const SDValue DisableDenormValue =
11729         HasDynamicDenormals
11734         AMDGPU::S_SETREG_B32, SL, MVT::Other,
11745                     {Fma4, Fma1, Fma3, Scale}, Flags);
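// A minimal standalone sketch (plain C++, not the DAG lowering) of the
// Newton-Raphson style refinement that the FMA chain above implements: start
// from an approximate reciprocal of the denominator, improve it with fused
// multiply-adds, form an approximate quotient, and correct its residual error
// once. The initial approximation here is just 1/denominator, standing in for
// the hardware reciprocal, and the final fixup step is omitted.
#include <cmath>

float refineDivide(float Num, float Den) {
  float Rcp = 1.0f / Den;                 // stand-in for the rcp approximation
  float Fma0 = std::fma(-Den, Rcp, 1.0f); // error of the reciprocal
  float Fma1 = std::fma(Fma0, Rcp, Rcp);  // refined reciprocal
  float Mul = Num * Fma1;                 // approximate quotient
  float Fma3 = std::fma(-Den, Mul, Num);  // residual of the quotient
  float Fma4 = std::fma(Fma3, Fma1, Mul); // corrected quotient
  return Fma4;                            // div_fixup would finish this
}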
11751 if (
SDValue FastLowered = lowerFastUnsafeFDIV64(
Op, DAG))
11752 return FastLowered;
11820 EVT VT =
Op.getValueType();
11822 if (VT == MVT::f32)
11823 return LowerFDIV32(
Op, DAG);
11825 if (VT == MVT::f64)
11826 return LowerFDIV64(
Op, DAG);
11828 if (VT == MVT::f16 || VT == MVT::bf16)
11829 return LowerFDIV16(
Op, DAG);
11838 EVT ResultExpVT =
Op->getValueType(1);
11839 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11869 if (VT == MVT::i1) {
11873 Store->getBasePtr(), MVT::i1,
Store->getMemOperand());
11877 Store->getValue().getValueType().getScalarType() == MVT::i32);
11879 unsigned AS =
Store->getAddressSpace();
11898 if (NumElements > 4)
11905 VT, *
Store->getMemOperand()))
11915 if (NumElements > 2)
11919 if (NumElements > 4 ||
11928 auto Flags =
Store->getMemOperand()->getFlags();
11963 MVT VT =
Op.getValueType().getSimpleVT();
12134 EVT VT =
Op.getValueType();
12151 switch (
Op.getOpcode()) {
12178 EVT VT =
Op.getValueType();
12186 Op->getVTList(), Ops, VT,
12195SITargetLowering::performUCharToFloatCombine(
SDNode *
N,
12196 DAGCombinerInfo &DCI)
const {
12197 EVT VT =
N->getValueType(0);
12199 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12206 EVT SrcVT = Src.getValueType();
12212 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12215 DCI.AddToWorklist(Cvt.
getNode());
12218 if (ScalarVT != MVT::f32) {
12230 DAGCombinerInfo &DCI)
const {
12231 SDValue MagnitudeOp =
N->getOperand(0);
12232 SDValue SignOp =
N->getOperand(1);
12260 for (
unsigned I = 0;
I != NumElts; ++
I) {
12284 if (NewElts.
size() == 1)
12306 for (
unsigned I = 0;
I != NumElts; ++
I) {
12341SDValue SITargetLowering::performSHLPtrCombine(
SDNode *
N,
unsigned AddrSpace,
12343 DAGCombinerInfo &DCI)
const {
12373 AM.HasBaseReg =
true;
12374 AM.BaseOffs =
Offset.getSExtValue();
12379 EVT VT =
N->getValueType(0);
12385 Flags.setNoUnsignedWrap(
12386 N->getFlags().hasNoUnsignedWrap() &&
12396 switch (
N->getOpcode()) {
12407 DAGCombinerInfo &DCI)
const {
12415 SDValue NewPtr = performSHLPtrCombine(
Ptr.getNode(),
N->getAddressSpace(),
12416 N->getMemoryVT(), DCI);
12420 NewOps[PtrIdx] = NewPtr;
12429 return (
Opc ==
ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
12430 (
Opc ==
ISD::OR && (Val == 0xffffffff || Val == 0)) ||
12439SDValue SITargetLowering::splitBinaryBitConstantOp(
12465 if (V.getValueType() != MVT::i1)
12467 switch (V.getOpcode()) {
12484 return V.getResNo() == 1;
12486 unsigned IntrinsicID = V.getConstantOperandVal(0);
12487 switch (IntrinsicID) {
12488 case Intrinsic::amdgcn_is_shared:
12489 case Intrinsic::amdgcn_is_private:
12506   if (!(C & 0x000000ff))
12507     ZeroByteMask |= 0x000000ff;
12508   if (!(C & 0x0000ff00))
12509     ZeroByteMask |= 0x0000ff00;
12510   if (!(C & 0x00ff0000))
12511     ZeroByteMask |= 0x00ff0000;
12512   if (!(C & 0xff000000))
12513     ZeroByteMask |= 0xff000000;
12514   uint32_t NonZeroByteMask = ~ZeroByteMask;
12515   if ((NonZeroByteMask & C) != NonZeroByteMask)
12528   assert(V.getValueSizeInBits() == 32);
12530   if (V.getNumOperands() != 2)
12539   switch (V.getOpcode()) {
12544     return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
12549     return (0x03020100 & ~ConstMask) | ConstMask;
12556     return uint32_t((0x030201000c0c0c0cull << C) >> 32);
12562     return uint32_t(0x0c0c0c0c03020100ull >> C);
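// A minimal standalone model, under assumed operand ordering, of the byte
// selectors being computed above for V_PERM_B32: each of the four selector
// bytes picks one byte out of the two 32-bit sources (values 0-3 from the
// second source, 4-7 from the first), and the selector 0x0c produces a zero
// byte. This illustrates how masks like 0x03020100 / 0x0c0c0c0c behave; it is
// not an authoritative ISA description.
#include <cstdint>

uint32_t permBytes(uint32_t Src0, uint32_t Src1, uint32_t Selector) {
  uint64_t Combined = (static_cast<uint64_t>(Src0) << 32) | Src1;
  uint32_t Result = 0;
  for (int I = 0; I < 4; ++I) {
    uint32_t Sel = (Selector >> (8 * I)) & 0xff;
    uint32_t Byte = 0;                       // 0x0c reads as a zero byte here
    if (Sel <= 7)
      Byte = (Combined >> (8 * Sel)) & 0xff; // byte Sel of {Src0:Src1}
    Result |= Byte << (8 * I);
  }
  return Result;
}

// With Selector = 0x03020100 this returns Src1 unchanged, and replacing a
// selector byte with 0x0c zeroes the corresponding result byte, which is how
// the AND combine above folds a constant byte mask into a single permute.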
12569 DAGCombinerInfo &DCI)
const {
12570 if (DCI.isBeforeLegalize())
12574 EVT VT =
N->getValueType(0);
12579 if (VT == MVT::i64 && CRHS) {
12585 if (CRHS && VT == MVT::i32) {
12594 if (
auto *CShift = dyn_cast<ConstantSDNode>(
LHS->getOperand(1))) {
12595 unsigned Shift = CShift->getZExtValue();
12597 unsigned Offset = NB + Shift;
12598 if ((
Offset & (Bits - 1)) == 0) {
12616 isa<ConstantSDNode>(
LHS.getOperand(2))) {
12622 Sel = (
LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
12637 if (
Y.getOpcode() !=
ISD::FABS ||
Y.getOperand(0) !=
X ||
12642 if (
X !=
LHS.getOperand(1))
12647 dyn_cast<ConstantFPSDNode>(
RHS.getOperand(1));
12680 (
RHS.getOperand(0) ==
LHS.getOperand(0) &&
12681 LHS.getOperand(0) ==
LHS.getOperand(1))) {
12683 unsigned NewMask = LCC ==
ISD::SETO ?
Mask->getZExtValue() & ~OrdMask
12684 :
Mask->getZExtValue() & OrdMask;
12705 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12708 if (LHSMask != ~0u && RHSMask != ~0u) {
12711 if (LHSMask > RHSMask) {
12718 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12719 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12722 if (!(LHSUsedLanes & RHSUsedLanes) &&
12725 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12732 for (
unsigned I = 0;
I < 32;
I += 8) {
12734 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
12735 Mask &= (0x0c <<
I) & 0xffffffff;
12793static const std::optional<ByteProvider<SDValue>>
12795 unsigned Depth = 0) {
12798 return std::nullopt;
12800 if (
Op.getValueSizeInBits() < 8)
12801 return std::nullopt;
12803 if (
Op.getValueType().isVector())
12806 switch (
Op->getOpcode()) {
12817 auto *VTSign = cast<VTSDNode>(
Op->getOperand(1));
12818 NarrowVT = VTSign->getVT();
12821 return std::nullopt;
12824 if (SrcIndex >= NarrowByteWidth)
12825 return std::nullopt;
12831 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12833 return std::nullopt;
12835 uint64_t BitShift = ShiftOp->getZExtValue();
12837 if (BitShift % 8 != 0)
12838 return std::nullopt;
12840 SrcIndex += BitShift / 8;
12858static const std::optional<ByteProvider<SDValue>>
12860 unsigned StartingIndex = 0) {
12864 return std::nullopt;
12866 unsigned BitWidth =
Op.getScalarValueSizeInBits();
12868 return std::nullopt;
12870 return std::nullopt;
12872 bool IsVec =
Op.getValueType().isVector();
12873 switch (
Op.getOpcode()) {
12876 return std::nullopt;
12881 return std::nullopt;
12885 return std::nullopt;
12888 if (!
LHS->isConstantZero() && !
RHS->isConstantZero())
12889 return std::nullopt;
12890 if (!
LHS ||
LHS->isConstantZero())
12892 if (!
RHS ||
RHS->isConstantZero())
12894 return std::nullopt;
12899 return std::nullopt;
12901 auto *BitMaskOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12903 return std::nullopt;
12905 uint32_t BitMask = BitMaskOp->getZExtValue();
12907 uint32_t IndexMask = 0xFF << (Index * 8);
12909 if ((IndexMask & BitMask) != IndexMask) {
12912 if (IndexMask & BitMask)
12913 return std::nullopt;
12922 return std::nullopt;
12925 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(2));
12926 if (!ShiftOp ||
Op.getValueType().isVector())
12927 return std::nullopt;
12929 uint64_t BitsProvided =
Op.getValueSizeInBits();
12930 if (BitsProvided % 8 != 0)
12931 return std::nullopt;
12933 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
12935 return std::nullopt;
12937 uint64_t ConcatSizeInBytes = BitsProvided / 4;
12938 uint64_t ByteShift = BitShift / 8;
12940 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12941 uint64_t BytesProvided = BitsProvided / 8;
12942 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
12943 NewIndex %= BytesProvided;
12950 return std::nullopt;
12952 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12954 return std::nullopt;
12956 uint64_t BitShift = ShiftOp->getZExtValue();
12958 return std::nullopt;
12960 auto BitsProvided =
Op.getScalarValueSizeInBits();
12961 if (BitsProvided % 8 != 0)
12962 return std::nullopt;
12964 uint64_t BytesProvided = BitsProvided / 8;
12965 uint64_t ByteShift = BitShift / 8;
12970 return BytesProvided - ByteShift > Index
12978 return std::nullopt;
12980 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12982 return std::nullopt;
12984 uint64_t BitShift = ShiftOp->getZExtValue();
12985 if (BitShift % 8 != 0)
12986 return std::nullopt;
12987 uint64_t ByteShift = BitShift / 8;
12993 return Index < ByteShift
12996 Depth + 1, StartingIndex);
13005 return std::nullopt;
13012 auto *VTSign = cast<VTSDNode>(
Op->getOperand(1));
13013 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13015 if (NarrowBitWidth % 8 != 0)
13016 return std::nullopt;
13017 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13019 if (Index >= NarrowByteWidth)
13021 ? std::optional<ByteProvider<SDValue>>(
13029 return std::nullopt;
13033 if (NarrowByteWidth >= Index) {
13038 return std::nullopt;
13045 return std::nullopt;
13049 auto *L = cast<LoadSDNode>(
Op.getNode());
13051 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13052 if (NarrowBitWidth % 8 != 0)
13053 return std::nullopt;
13054 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13059 if (Index >= NarrowByteWidth) {
13061 ? std::optional<ByteProvider<SDValue>>(
13066 if (NarrowByteWidth > Index) {
13070 return std::nullopt;
13075 return std::nullopt;
13078 Depth + 1, StartingIndex);
13082 auto *IdxOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
13084 return std::nullopt;
13085 auto VecIdx = IdxOp->getZExtValue();
13086 auto ScalarSize =
Op.getScalarValueSizeInBits();
13087 if (ScalarSize < 32)
13088 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13090 StartingIndex, Index);
13095 return std::nullopt;
13097 auto *PermMask = dyn_cast<ConstantSDNode>(
Op->getOperand(2));
13099 return std::nullopt;
13102 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13103 if (IdxMask > 0x07 && IdxMask != 0x0c)
13104 return std::nullopt;
13106 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13107 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13109 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
13115 return std::nullopt;
13130 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13134 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13137 auto MemVT = L->getMemoryVT();
13140 return L->getMemoryVT().getSizeInBits() == 16;
13150 int Low8 = Mask & 0xff;
13151 int Hi8 = (Mask & 0xff00) >> 8;
13153 assert(Low8 < 8 && Hi8 < 8);
13155 bool IsConsecutive = (Hi8 - Low8 == 1);
13160 bool Is16Aligned = !(Low8 % 2);
13162 return IsConsecutive && Is16Aligned;
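// [Illustrative aside, not from the original source] addresses16Bits holds exactly
// when the two byte selectors pick one aligned 16-bit half: consecutive bytes with
// an even low selector. A standalone restatement of that predicate:
static bool selectsAligned16BitHalf(int Mask) { // hypothetical name
  int Low8 = Mask & 0xff;          // selector for the low byte of the pair
  int Hi8 = (Mask & 0xff00) >> 8;  // selector for the high byte of the pair
  return (Hi8 - Low8 == 1) && (Low8 % 2 == 0);
}
// e.g. 0x0302 selects bytes 2 and 3 (one aligned half); 0x0201 straddles two halves.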
13170 int Low16 = PermMask & 0xffff;
13171 int Hi16 = (PermMask & 0xffff0000) >> 16;
13181 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13183 if (!OtherOpIs16Bit)
13191 unsigned DWordOffset) {
13194 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
13196 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13201 if (Src.getValueType().isVector()) {
13202 auto ScalarTySize = Src.getScalarValueSizeInBits();
13203 auto ScalarTy = Src.getValueType().getScalarType();
13204 if (ScalarTySize == 32) {
13208 if (ScalarTySize > 32) {
13211 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13212 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13219 assert(ScalarTySize < 32);
13220 auto NumElements = TypeSize / ScalarTySize;
13221 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13222 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13223 auto NumElementsIn32 = 32 / ScalarTySize;
13224 auto NumAvailElements = DWordOffset < Trunc32Elements
13226 : NumElements - NormalizedTrunc;
13239 auto ShiftVal = 32 * DWordOffset;
13247 [[maybe_unused]] EVT VT = N->getValueType(0);
13252 for (int i = 0; i < 4; i++) {
13254 std::optional<ByteProvider<SDValue>> P =
13257 if (!P || P->isConstantZero())
13262 if (PermNodes.size() != 4)
13265 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13266 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13268 for (size_t i = 0; i < PermNodes.size(); i++) {
13269 auto PermOp = PermNodes[i];
13272 int SrcByteAdjust = 4;
13276 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13277 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13279 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13280 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13284 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13285 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13288 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13290 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13293 SDValue Op = *PermNodes[FirstSrc.first].Src;
13295 assert(Op.getValueSizeInBits() == 32);
13299 int Low16 = PermMask & 0xffff;
13300 int Hi16 = (PermMask & 0xffff0000) >> 16;
13302 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13303 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13306 if (WellFormedLow && WellFormedHi)
13310 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13319 assert(Op.getValueType().isByteSized() &&
13337 DAGCombinerInfo &DCI) const {
13342 EVT VT = N->getValueType(0);
13343 if (VT == MVT::i1) {
13348 if (Src != RHS.getOperand(0))
13353 if (!CLHS || !CRHS)
13357 static const uint32_t MaxMask = 0x3ff;
13372 isa<ConstantSDNode>(LHS.getOperand(2))) {
13377 Sel |= LHS.getConstantOperandVal(2);
13386 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13390 auto usesCombinedOperand = [](SDNode *OrUse) {
13393 !OrUse->getValueType(0).isVector())
13397 for (auto *VUser : OrUse->users()) {
13398 if (!VUser->getValueType(0).isVector())
13405 if (VUser->getOpcode() == VectorwiseOp)
13411 if (!any_of(N->users(), usesCombinedOperand))
13417 if (LHSMask != ~0u && RHSMask != ~0u) {
13420 if (LHSMask > RHSMask) {
13427 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13428 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13431 if (!(LHSUsedLanes & RHSUsedLanes) &&
13434 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13436 LHSMask &= ~RHSUsedLanes;
13437 RHSMask &= ~LHSUsedLanes;
13439 LHSMask |= LHSUsedLanes & 0x04040404;
13449 if (LHSMask == ~0u || RHSMask == ~0u) {
13455 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
13470 if (SrcVT == MVT::i32) {
13475 DCI.AddToWorklist(LowOr.getNode());
13476 DCI.AddToWorklist(HiBits.getNode());
13484 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
13487 N->getOperand(0), CRHS))
13495 DAGCombinerInfo &DCI) const {
13496 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
13505 EVT VT = N->getValueType(0);
13506 if (CRHS && VT == MVT::i64) {
13528 LHS->getOperand(0), FNegLHS, FNegRHS);
13537 DAGCombinerInfo &DCI) const {
13542 EVT VT = N->getValueType(0);
13543 if (VT != MVT::i32)
13547 if (Src.getValueType() != MVT::i16)
13554SITargetLowering::performSignExtendInRegCombine(SDNode *N,
13555 DAGCombinerInfo &DCI) const {
13557 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
13562 VTSign->getVT() == MVT::i8) ||
13564 VTSign->getVT() == MVT::i16))) {
13566 "s_buffer_load_{u8, i8} are supported "
13567 "in GFX12 (or newer) architectures.");
13568 EVT VT = Src.getValueType();
13573 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
13579 auto *M = cast<MemSDNode>(Src);
13580 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
13581 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
13586 VTSign->getVT() == MVT::i8) ||
13588 VTSign->getVT() == MVT::i16)) &&
13590 auto *M = cast<MemSDNode>(Src);
13591 SDValue Ops[] = {Src.getOperand(0),
13597 Src.getOperand(6), Src.getOperand(7)};
13600 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
13604 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
13605 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
13606 return DCI.DAG.getMergeValues(
13613 DAGCombinerInfo &DCI) const {
13621 if (N->getOperand(0).isUndef())
13628 DAGCombinerInfo &DCI) const {
13629 EVT VT = N->getValueType(0);
13654 unsigned MaxDepth) const {
13655 unsigned Opcode = Op.getOpcode();
13659 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
13660 const auto &F = CFP->getValueAPF();
13661 if (F.isNaN() && F.isSignaling())
13663 if (!F.isDenormal())
13726 if (Op.getValueType() == MVT::i32) {
13731 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
13732 if (RHS->getZExtValue() == 0xffff0000) {
13742 return Op.getValueType().getScalarType() != MVT::f16;
13812 if (Op.getValueType() == MVT::i16) {
13823 unsigned IntrinsicID = Op.getConstantOperandVal(0);
13825 switch (IntrinsicID) {
13826 case Intrinsic::amdgcn_cvt_pkrtz:
13827 case Intrinsic::amdgcn_cubeid:
13828 case Intrinsic::amdgcn_frexp_mant:
13829 case Intrinsic::amdgcn_fdot2:
13830 case Intrinsic::amdgcn_rcp:
13831 case Intrinsic::amdgcn_rsq:
13832 case Intrinsic::amdgcn_rsq_clamp:
13833 case Intrinsic::amdgcn_rcp_legacy:
13834 case Intrinsic::amdgcn_rsq_legacy:
13835 case Intrinsic::amdgcn_trig_preop:
13836 case Intrinsic::amdgcn_tanh:
13837 case Intrinsic::amdgcn_log:
13838 case Intrinsic::amdgcn_exp2:
13839 case Intrinsic::amdgcn_sqrt:
13857 unsigned MaxDepth) const {
13860 unsigned Opcode = MI->getOpcode();
13862 if (Opcode == AMDGPU::G_FCANONICALIZE)
13865 std::optional<FPValueAndVReg> FCR;
13868 if (FCR->Value.isSignaling())
13870 if (!FCR->Value.isDenormal())
13881 case AMDGPU::G_FADD:
13882 case AMDGPU::G_FSUB:
13883 case AMDGPU::G_FMUL:
13884 case AMDGPU::G_FCEIL:
13885 case AMDGPU::G_FFLOOR:
13886 case AMDGPU::G_FRINT:
13887 case AMDGPU::G_FNEARBYINT:
13888 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13889 case AMDGPU::G_INTRINSIC_TRUNC:
13890 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13891 case AMDGPU::G_FMA:
13892 case AMDGPU::G_FMAD:
13893 case AMDGPU::G_FSQRT:
13894 case AMDGPU::G_FDIV:
13895 case AMDGPU::G_FREM:
13896 case AMDGPU::G_FPOW:
13897 case AMDGPU::G_FPEXT:
13898 case AMDGPU::G_FLOG:
13899 case AMDGPU::G_FLOG2:
13900 case AMDGPU::G_FLOG10:
13901 case AMDGPU::G_FPTRUNC:
13902 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13903 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13904 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13905 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13906 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13908 case AMDGPU::G_FNEG:
13909 case AMDGPU::G_FABS:
13910 case AMDGPU::G_FCOPYSIGN:
13912 case AMDGPU::G_FMINNUM:
13913 case AMDGPU::G_FMAXNUM:
13914 case AMDGPU::G_FMINNUM_IEEE:
13915 case AMDGPU::G_FMAXNUM_IEEE:
13916 case AMDGPU::G_FMINIMUM:
13917 case AMDGPU::G_FMAXIMUM:
13918 case AMDGPU::G_FMINIMUMNUM:
13919 case AMDGPU::G_FMAXIMUMNUM: {
13927 case AMDGPU::G_BUILD_VECTOR:
13932 case AMDGPU::G_INTRINSIC:
13933 case AMDGPU::G_INTRINSIC_CONVERGENT:
13935 case Intrinsic::amdgcn_fmul_legacy:
13936 case Intrinsic::amdgcn_fmad_ftz:
13937 case Intrinsic::amdgcn_sqrt:
13938 case Intrinsic::amdgcn_fmed3:
13939 case Intrinsic::amdgcn_sin:
13940 case Intrinsic::amdgcn_cos:
13941 case Intrinsic::amdgcn_log:
13942 case Intrinsic::amdgcn_exp2:
13943 case Intrinsic::amdgcn_log_clamp:
13944 case Intrinsic::amdgcn_rcp:
13945 case Intrinsic::amdgcn_rcp_legacy:
13946 case Intrinsic::amdgcn_rsq:
13947 case Intrinsic::amdgcn_rsq_clamp:
13948 case Intrinsic::amdgcn_rsq_legacy:
13949 case Intrinsic::amdgcn_div_scale:
13950 case Intrinsic::amdgcn_div_fmas:
13951 case Intrinsic::amdgcn_div_fixup:
13952 case Intrinsic::amdgcn_fract:
13953 case Intrinsic::amdgcn_cvt_pkrtz:
13954 case Intrinsic::amdgcn_cubeid:
13955 case Intrinsic::amdgcn_cubema:
13956 case Intrinsic::amdgcn_cubesc:
13957 case Intrinsic::amdgcn_cubetc:
13958 case Intrinsic::amdgcn_frexp_mant:
13959 case Intrinsic::amdgcn_fdot2:
13960 case Intrinsic::amdgcn_trig_preop:
13961 case Intrinsic::amdgcn_tanh:
13980 if (C.isDenormal()) {
13994 if (C.isSignaling()) {
14013 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
14017SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14018 DAGCombinerInfo &DCI) const {
14021 EVT VT = N->getValueType(0);
14030 EVT VT = N->getValueType(0);
14031 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14047 EVT EltVT = Lo.getValueType();
14050 for (unsigned I = 0; I != 2; ++I) {
14054 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14055 } else if (Op.isUndef()) {
14067 if (isa<ConstantFPSDNode>(NewElts[1]))
14068 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14074 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14128 if (!MinK || !MaxK)
14141 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14142 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14207 if (Info->getMode().DX10Clamp) {
14216 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14250 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
14261 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
14270 DAGCombinerInfo &DCI) const {
14273 EVT VT = N->getValueType(0);
14274 unsigned Opc = N->getOpcode();
14303 if (SDValue Med3 = performIntMed3ImmCombine(
14308 if (SDValue Med3 = performIntMed3ImmCombine(
14314 if (SDValue Med3 = performIntMed3ImmCombine(
14319 if (SDValue Med3 = performIntMed3ImmCombine(
14334 (VT == MVT::f32 || VT == MVT::f64 ||
14340 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
14362 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
14363 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
14372 DAGCombinerInfo &DCI)
const {
14373 EVT VT =
N->getValueType(0);
14396 if (
Info->getMode().DX10Clamp) {
14399 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
14402 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
14405 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
14416 DAGCombinerInfo &DCI)
const {
14420 return DCI.DAG.getUNDEF(
N->getValueType(0));
14428 bool IsDivergentIdx,
14433 unsigned VecSize = EltSize * NumElem;
14436 if (VecSize <= 64 && EltSize < 32)
14445 if (IsDivergentIdx)
14449 unsigned NumInsts = NumElem +
14450 ((EltSize + 31) / 32) * NumElem ;
14455 return NumInsts <= 16;
14460 return NumInsts <= 15;
14467 if (isa<ConstantSDNode>(
Idx))
14481SITargetLowering::performExtractVectorEltCombine(
SDNode *
N,
14482 DAGCombinerInfo &DCI)
const {
14488 EVT ResVT =
N->getValueType(0);
14507 if (Vec.
hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
14535 DCI.AddToWorklist(Elt0.
getNode());
14536 DCI.AddToWorklist(Elt1.
getNode());
14558 if (!DCI.isBeforeLegalize())
14564 auto *
Idx = dyn_cast<ConstantSDNode>(
N->getOperand(1));
14565 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.
isByteSized() &&
14566 VecSize > 32 && VecSize % 32 == 0 &&
Idx) {
14569 unsigned BitIndex =
Idx->getZExtValue() * VecEltSize;
14570 unsigned EltIdx = BitIndex / 32;
14571 unsigned LeftoverBitIdx = BitIndex % 32;
14575 DCI.AddToWorklist(Cast.
getNode());
14579 DCI.AddToWorklist(Elt.
getNode());
14582 DCI.AddToWorklist(Srl.
getNode());
14586 DCI.AddToWorklist(Trunc.
getNode());
14588 if (VecEltVT == ResVT) {
14600SITargetLowering::performInsertVectorEltCombine(
SDNode *
N,
14601 DAGCombinerInfo &DCI)
const {
14615 EVT IdxVT =
Idx.getValueType();
14632 Src.getOperand(0).getValueType() == MVT::f16) {
14633 return Src.getOperand(0);
14636 if (
auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
14637 APFloat Val = CFP->getValueAPF();
14638 bool LosesInfo =
true;
14648 DAGCombinerInfo &DCI)
const {
14650 "combine only useful on gfx8");
14652 SDValue TruncSrc =
N->getOperand(0);
14653 EVT VT =
N->getValueType(0);
14654 if (VT != MVT::f16)
14692unsigned SITargetLowering::getFusedOpcode(
const SelectionDAG &DAG,
14694 const SDNode *N1)
const {
14699 if (((VT == MVT::f32 &&
14701 (VT == MVT::f16 && Subtarget->
hasMadF16() &&
14721 EVT VT =
N->getValueType(0);
14722 if (VT != MVT::i32 && VT != MVT::i64)
14728 unsigned Opc =
N->getOpcode();
14783 if (!Const ||
Hi_32(Const->getZExtValue()) !=
uint32_t(-1))
14802 DAGCombinerInfo &DCI)
const {
14806 EVT VT =
N->getValueType(0);
14816 if (!
N->isDivergent() && Subtarget->
hasSMulHi())
14820 if (NumBits <= 32 || NumBits > 64)
14832 unsigned NumUsers = 0;
14836 if (!
User->isAnyAdd())
14860 bool MulSignedLo =
false;
14861 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
14870 if (VT != MVT::i64) {
14893 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
14895 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
14896 auto [AccumLo, AccumHi] = DAG.
SplitScalar(Accum, SL, MVT::i32, MVT::i32);
14898 if (!MulLHSUnsigned32) {
14905 if (!MulRHSUnsigned32) {
14916 if (VT != MVT::i64)
14922SITargetLowering::foldAddSub64WithZeroLowBitsTo32(
SDNode *
N,
14923 DAGCombinerInfo &DCI)
const {
14925 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14948 unsigned Opcode =
N->getOpcode();
14952 DAG.
getNode(Opcode, SL, MVT::i32,
Hi, ConstHi32,
N->getFlags());
14963static std::optional<ByteProvider<SDValue>>
14966 if (!Byte0 || Byte0->isConstantZero()) {
14967 return std::nullopt;
14970 if (Byte1 && !Byte1->isConstantZero()) {
14971 return std::nullopt;
14977 unsigned FirstCs = First & 0x0c0c0c0c;
14978 unsigned SecondCs = Second & 0x0c0c0c0c;
14979 unsigned FirstNoCs = First & ~0x0c0c0c0c;
14980 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14982 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14983 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14984 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14985 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14987 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
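// [Illustrative aside, not part of the original source] addPermMasks assumes the
// two selectors partition the byte lanes: wherever one picks a real byte, the
// other holds the constant-zero selector 0x0c. Under that precondition the merge
// keeps every real selector and leaves 0x0c only where both inputs are zero.
// A standalone sketch (hypothetical name):
static unsigned mergePermMasks(unsigned First, unsigned Second) {
  unsigned FirstCs = First & 0x0c0c0c0c;   // zero lanes contributed by First
  unsigned SecondCs = Second & 0x0c0c0c0c; // zero lanes contributed by Second
  return ((First & ~0x0c0c0c0cu) | (Second & ~0x0c0c0c0cu)) | (FirstCs & SecondCs);
}
// e.g. merging 0x0c0c0100 with 0x07060c0c yields 0x07060100.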
15011 for (
int BPI = 0; BPI < 2; BPI++) {
15014 BPP = {Src1, Src0};
15016 unsigned ZeroMask = 0x0c0c0c0c;
15017 unsigned FMask = 0xFF << (8 * (3 - Step));
15019 unsigned FirstMask =
15020 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15021 unsigned SecondMask =
15022 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15026 int FirstGroup = -1;
15027 for (
int I = 0;
I < 2;
I++) {
15029 auto MatchesFirst = [&BPP](
DotSrc &IterElt) {
15030 return IterElt.SrcOp == *BPP.first.Src &&
15031 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15035 if (Match != Srcs.
end()) {
15036 Match->PermMask =
addPermMasks(FirstMask, Match->PermMask);
15041 if (FirstGroup != -1) {
15043 auto MatchesSecond = [&BPP](
DotSrc &IterElt) {
15044 return IterElt.SrcOp == *BPP.second.Src &&
15045 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15048 if (Match != Srcs.
end()) {
15049 Match->PermMask =
addPermMasks(SecondMask, Match->PermMask);
15051 Srcs.
push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15059 unsigned ZeroMask = 0x0c0c0c0c;
15060 unsigned FMask = 0xFF << (8 * (3 - Step));
15064 ((Src0.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15068 ((Src1.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15077 if (Srcs.
size() == 1) {
15078 auto *Elt = Srcs.
begin();
15082 if (Elt->PermMask == 0x3020100)
15089 auto *FirstElt = Srcs.
begin();
15090 auto *SecondElt = std::next(FirstElt);
15097 auto FirstMask = FirstElt->PermMask;
15098 auto SecondMask = SecondElt->PermMask;
15100 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15101 unsigned FirstPlusFour = FirstMask | 0x04040404;
15104 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15116 FirstElt = std::next(SecondElt);
15117 if (FirstElt == Srcs.
end())
15120 SecondElt = std::next(FirstElt);
15123 if (SecondElt == Srcs.
end()) {
15129 DAG.
getConstant(FirstElt->PermMask, SL, MVT::i32)));
15135 return Perms.
size() == 2
15141 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15142 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15143 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15144 EntryMask += ZeroMask;
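// [Illustrative aside, not from the original source] fixMasks slides the populated
// selector lanes down by the number of unused steps and marks the vacated high
// lanes as constant zero. For example, with ChainLength == 2 an entry mask of
// 0x05040c0c becomes (0x05040c0c >> 16) + 0x0c0c0000 == 0x0c0c0504.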
15149 auto Opcode = Op.getOpcode();
15155static std::optional<bool>
15166 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15169 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15171 assert(!(S0IsUnsigned && S0IsSigned));
15172 assert(!(S1IsUnsigned && S1IsSigned));
15180 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15186 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15187 return std::nullopt;
15199 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15200 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15205 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15211 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15212 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15213 return std::nullopt;
15219 DAGCombinerInfo &DCI)
const {
15221 EVT VT =
N->getValueType(0);
15228 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
15233 if (
SDValue V = reassociateScalarOps(
N, DAG)) {
15237 if (VT == MVT::i64) {
15238 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
15245 std::optional<bool> IsSigned;
15251 int ChainLength = 0;
15252 for (
int I = 0;
I < 4;
I++) {
15253 auto MulIdx =
isMul(LHS) ? 0 :
isMul(RHS) ? 1 : -1;
15256 auto Src0 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15259 auto Src1 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15264 TempNode->getOperand(MulIdx), *Src0, *Src1,
15265 TempNode->getOperand(MulIdx)->getOperand(0),
15266 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15270 IsSigned = *IterIsSigned;
15271 if (*IterIsSigned != *IsSigned)
15274 auto AddIdx = 1 - MulIdx;
15277 if (
I == 2 &&
isMul(TempNode->getOperand(AddIdx))) {
15278 Src2s.
push_back(TempNode->getOperand(AddIdx));
15288 TempNode->getOperand(AddIdx), *Src0, *Src1,
15289 TempNode->getOperand(AddIdx)->getOperand(0),
15290 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15294 if (*IterIsSigned != *IsSigned)
15298 ChainLength =
I + 2;
15302 TempNode = TempNode->getOperand(AddIdx);
15304 ChainLength =
I + 1;
15305 if (TempNode->getNumOperands() < 2)
15307 LHS = TempNode->getOperand(0);
15308 RHS = TempNode->getOperand(1);
15311 if (ChainLength < 2)
15317 if (ChainLength < 4) {
15327 bool UseOriginalSrc =
false;
15328 if (ChainLength == 4 && Src0s.
size() == 1 && Src1s.
size() == 1 &&
15329 Src0s.
begin()->PermMask == Src1s.
begin()->PermMask &&
15330 Src0s.
begin()->SrcOp.getValueSizeInBits() >= 32 &&
15331 Src1s.
begin()->SrcOp.getValueSizeInBits() >= 32) {
15333 auto Src0Mask = Src0s.
begin()->PermMask;
15334 SrcBytes.
push_back(Src0Mask & 0xFF000000);
15335 bool UniqueEntries =
true;
15336 for (
auto I = 1;
I < 4;
I++) {
15337 auto NextByte = Src0Mask & (0xFF << ((3 -
I) * 8));
15340 UniqueEntries =
false;
15346 if (UniqueEntries) {
15347 UseOriginalSrc =
true;
15349 auto *FirstElt = Src0s.
begin();
15353 auto *SecondElt = Src1s.
begin();
15355 SecondElt->DWordOffset);
15364 if (!UseOriginalSrc) {
15371 DAG.
getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
15374 : Intrinsic::amdgcn_udot4,
15384 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
15389 unsigned Opc =
LHS.getOpcode();
15401 auto Cond =
RHS.getOperand(0);
15423 DAGCombinerInfo &DCI)
const {
15426 EVT VT =
N->getValueType(0);
15448 DCI.AddToWorklist(Inner.
getNode());
15456 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
15465 if (VT == MVT::i64) {
15466 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
15475 dyn_cast<GlobalAddressSDNode>(GAValue)) {
15482 DCI.AddToWorklist(Inner.
getNode());
15513 if (ZIsConstant != YIsConstant) {
15517 DCI.AddToWorklist(Inner.
getNode());
15525 assert(!YIsConstant && !ZIsConstant);
15527 if (!
X->isDivergent() &&
Y->isDivergent() !=
Z->isDivergent()) {
15536 if (
Y->isDivergent())
15539 DCI.AddToWorklist(UniformInner.
getNode());
15547 DAGCombinerInfo &DCI)
const {
15549 EVT VT =
N->getValueType(0);
15551 if (VT == MVT::i64) {
15552 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
15556 if (VT != MVT::i32)
15565 unsigned Opc =
RHS.getOpcode();
15572 auto Cond =
RHS.getOperand(0);
15595SITargetLowering::performAddCarrySubCarryCombine(
SDNode *
N,
15596 DAGCombinerInfo &DCI)
const {
15598 if (
N->getValueType(0) != MVT::i32)
15609 unsigned LHSOpc =
LHS.getOpcode();
15610 unsigned Opc =
N->getOpcode();
15620 DAGCombinerInfo &DCI)
const {
15625 EVT VT =
N->getValueType(0);
15637 if (
A ==
LHS.getOperand(1)) {
15638 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
15639 if (FusedOp != 0) {
15641 return DAG.
getNode(FusedOp, SL, VT,
A, Two, RHS);
15649 if (
A ==
RHS.getOperand(1)) {
15650 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
15651 if (FusedOp != 0) {
15653 return DAG.
getNode(FusedOp, SL, VT,
A, Two, LHS);
15662 DAGCombinerInfo &DCI)
const {
15668 EVT VT =
N->getValueType(0);
15681 if (
A ==
LHS.getOperand(1)) {
15682 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
15683 if (FusedOp != 0) {
15687 return DAG.
getNode(FusedOp, SL, VT,
A, Two, NegRHS);
15696 if (
A ==
RHS.getOperand(1)) {
15697 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
15698 if (FusedOp != 0) {
15700 return DAG.
getNode(FusedOp, SL, VT,
A, NegTwo, LHS);
15709 DAGCombinerInfo &DCI)
const {
15712 EVT VT =
N->getValueType(0);
15713 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->
has16BitInsts())
15726 bool IsNegative =
false;
15727 if (CLHS->isExactlyValue(1.0) ||
15728 (IsNegative = CLHS->isExactlyValue(-1.0))) {
15744 DAGCombinerInfo &DCI)
const {
15746 EVT VT =
N->getValueType(0);
15750 if (!
N->isDivergent() &&
getSubtarget()->hasSALUFloatInsts() &&
15751 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
15766 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
15781 if (ScalarVT == MVT::f32 &&
15787 if (TrueNodeExpVal == INT_MIN)
15790 if (FalseNodeExpVal == INT_MIN)
15810 DAGCombinerInfo &DCI)
const {
15812 EVT VT =
N->getValueType(0);
15833 (
N->getFlags().hasAllowContract() &&
15834 FMA->getFlags().hasAllowContract())) {
15868 if (Vec1 == Vec2 || Vec3 == Vec4)
15874 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
15883 DAGCombinerInfo &DCI)
const {
15889 EVT VT =
LHS.getValueType();
15890 ISD::CondCode CC = cast<CondCodeSDNode>(
N->getOperand(2))->get();
15892 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15894 CRHS = dyn_cast<ConstantSDNode>(LHS);
15918 return LHS.getOperand(0);
15924 isa<ConstantSDNode>(
LHS.getOperand(1)) &&
15925 isa<ConstantSDNode>(
LHS.getOperand(2)) &&
15926 LHS.getConstantOperandVal(1) !=
LHS.getConstantOperandVal(2) &&
15933 const APInt &CT =
LHS.getConstantOperandAPInt(1);
15934 const APInt &CF =
LHS.getConstantOperandAPInt(2);
15942 return LHS.getOperand(0);
15946 if (VT != MVT::f32 && VT != MVT::f64 &&
15962 const unsigned IsInfMask =
15964 const unsigned IsFiniteMask =
15978SITargetLowering::performCvtF32UByteNCombine(
SDNode *
N,
15979 DAGCombinerInfo &DCI)
const {
15997 if (
auto *
C = dyn_cast<ConstantSDNode>(Shift.
getOperand(1))) {
16001 unsigned ShiftOffset = 8 *
Offset;
16003 ShiftOffset -=
C->getZExtValue();
16005 ShiftOffset +=
C->getZExtValue();
16007 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16009 MVT::f32, Shifted);
16020 DCI.AddToWorklist(
N);
16027 return DAG.
getNode(
N->getOpcode(), SL, MVT::f32, DemandedSrc);
16033 DAGCombinerInfo &DCI)
const {
16043 return DCI.DAG.getConstantFP(Zero,
SDLoc(
N),
N->getValueType(0));
16046 APFloat One(
F.getSemantics(),
"1.0");
16048 return DCI.DAG.getConstantFP(One,
SDLoc(
N),
N->getValueType(0));
16054 DAGCombinerInfo &DCI)
const {
16075 bool isFloatingPoint =
LHS.getValueType().isFloatingPoint();
16076 bool isInteger =
LHS.getValueType().isInteger();
16079 if (!isFloatingPoint && !isInteger)
16084 if (!isEquality && !isNonEquality)
16088 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
16089 (isInteger && isa<ConstantSDNode>(RHS))) {
16092 }
else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
16093 (isInteger && isa<ConstantSDNode>(LHS))) {
16101 if (isFloatingPoint) {
16102 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
16107 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
16114 if (!(isEquality && TrueVal == ConstVal) &&
16115 !(isNonEquality && FalseVal == ConstVal))
16118 SDValue SelectLHS = (isEquality &&
TrueVal == ConstVal) ? ArgVal : TrueVal;
16120 (isNonEquality &&
FalseVal == ConstVal) ? ArgVal : FalseVal;
16122 SelectLHS, SelectRHS);
16127 switch (
N->getOpcode()) {
16143 if (
auto Res = promoteUniformOpToI32(
SDValue(
N, 0), DCI))
16153 switch (
N->getOpcode()) {
16155 return performAddCombine(
N, DCI);
16157 return performPtrAddCombine(
N, DCI);
16159 return performSubCombine(
N, DCI);
16162 return performAddCarrySubCarryCombine(
N, DCI);
16164 return performFAddCombine(
N, DCI);
16166 return performFSubCombine(
N, DCI);
16168 return performFDivCombine(
N, DCI);
16170 return performFMulCombine(
N, DCI);
16172 return performSetCCCombine(
N, DCI);
16174 if (
auto Res = performSelectCombine(
N, DCI))
16191 return performMinMaxCombine(
N, DCI);
16193 return performFMACombine(
N, DCI);
16195 return performAndCombine(
N, DCI);
16197 return performOrCombine(
N, DCI);
16200 if (
N->getValueType(0) == MVT::i32 &&
N->isDivergent() &&
16201 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16207 return performXorCombine(
N, DCI);
16209 return performZeroExtendCombine(
N, DCI);
16211 return performSignExtendInRegCombine(
N, DCI);
16213 return performClassCombine(
N, DCI);
16215 return performFCanonicalizeCombine(
N, DCI);
16217 return performRcpCombine(
N, DCI);
16232 return performUCharToFloatCombine(
N, DCI);
16234 return performFCopySignCombine(
N, DCI);
16239 return performCvtF32UByteNCombine(
N, DCI);
16241 return performFMed3Combine(
N, DCI);
16243 return performCvtPkRTZCombine(
N, DCI);
16245 return performClampCombine(
N, DCI);
16248 EVT VT =
N->getValueType(0);
16251 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16254 EVT EltVT = Src.getValueType();
16255 if (EltVT != MVT::i16)
16265 return performExtractVectorEltCombine(
N, DCI);
16267 return performInsertVectorEltCombine(
N, DCI);
16269 return performFPRoundCombine(
N, DCI);
16271 if (
SDValue Widened = widenLoad(cast<LoadSDNode>(
N), DCI))
16277 if (
MemSDNode *MemNode = dyn_cast<MemSDNode>(
N))
16278 return performMemSDNodeCombine(MemNode, DCI);
16309 unsigned Opcode =
Node->getMachineOpcode();
16312 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16313 if (D16Idx >= 0 &&
Node->getConstantOperandVal(D16Idx))
16318 unsigned DmaskIdx =
16319 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16320 unsigned OldDmask =
Node->getConstantOperandVal(DmaskIdx);
16321 unsigned NewDmask = 0;
16322 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16323 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16324 bool UsesTFC = (int(TFEIdx) >= 0 &&
Node->getConstantOperandVal(TFEIdx)) ||
16325 (
int(LWEIdx) >= 0 &&
Node->getConstantOperandVal(LWEIdx));
16326 unsigned TFCLane = 0;
16327 bool HasChain =
Node->getNumValues() > 1;
16329 if (OldDmask == 0) {
16337 TFCLane = OldBitsSet;
16344 if (Use.getResNo() != 0)
16350 if (!User->isMachineOpcode() ||
16351 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
16363 if (UsesTFC && Lane == TFCLane) {
16368 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
16370 Dmask &= ~(1 << Comp);
16378 NewDmask |= 1 << Comp;
16383 bool NoChannels = !NewDmask;
16390 if (OldBitsSet == 1)
16396 if (NewDmask == OldDmask)
16405 unsigned NewChannels = BitsSet + UsesTFC;
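// [Illustrative aside, not part of the original source] Result lane k of the image
// load corresponds to the k-th set bit of the old dmask, so the rebuilt dmask keeps
// only the components whose lanes are actually extracted. A standalone sketch of
// that lane-to-component mapping (hypothetical helper):
static unsigned laneToDmaskBit(unsigned OldDmask, unsigned Lane) {
  unsigned Comp = 0;
  unsigned Dmask = OldDmask;
  for (unsigned i = 0; i <= Lane && Dmask != 0; ++i) {
    Comp = 0;
    while (!(Dmask & (1u << Comp)))
      ++Comp;                   // lowest remaining enabled component
    Dmask &= ~(1u << Comp);
  }
  return 1u << Comp;
}
// e.g. with OldDmask 0b1011, a use of lane 2 keeps only component 3 (bit 0b1000).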
16409 assert(NewOpcode != -1 &&
16410 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
16411 "failed to find equivalent MIMG op");
16419 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
16421 MVT ResultVT = NewChannels == 1
16424 : NewChannels == 5 ? 8
16438 if (NewChannels == 1) {
16448 for (
unsigned i = 0,
Idx = AMDGPU::sub0; i < 5; ++i) {
16453 if (i || !NoChannels)
16458 if (NewUser !=
User) {
16468 Idx = AMDGPU::sub1;
16471 Idx = AMDGPU::sub2;
16474 Idx = AMDGPU::sub3;
16477 Idx = AMDGPU::sub4;
16488 Op =
Op.getOperand(0);
16490 return isa<FrameIndexSDNode>(
Op);
16500 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
16501 SDValue SrcVal = Node->getOperand(2);
16509 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
16511 SDNode *Glued = Node->getGluedNode();
16513 Node->getOperand(0), SL, VReg, SrcVal,
16519 return ToResultReg.
getNode();
16524 for (
unsigned i = 0; i < Node->getNumOperands(); ++i) {
16532 Node->getOperand(i).getValueType(),
16533 Node->getOperand(i)),
16545 unsigned Opcode = Node->getMachineOpcode();
16547 if (
TII->isImage(Opcode) && !
TII->get(Opcode).mayStore() &&
16548 !
TII->isGather4(Opcode) &&
16550 return adjustWritemask(Node, DAG);
16553 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
16559 case AMDGPU::V_DIV_SCALE_F32_e64:
16560 case AMDGPU::V_DIV_SCALE_F64_e64: {
16564 SDValue Src0 = Node->getOperand(1);
16565 SDValue Src1 = Node->getOperand(3);
16566 SDValue Src2 = Node->getOperand(5);
16570 (Src0 == Src1 || Src0 == Src2))
16626 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::vdata);
16627 unsigned InitIdx = 0;
16629 if (
TII->isImage(
MI)) {
16637 unsigned TFEVal = TFE ? TFE->
getImm() : 0;
16638 unsigned LWEVal = LWE ? LWE->
getImm() : 0;
16639 unsigned D16Val = D16 ? D16->getImm() : 0;
16641 if (!TFEVal && !LWEVal)
16652 assert(MO_Dmask &&
"Expected dmask operand in instruction");
16654 unsigned dmask = MO_Dmask->
getImm();
16661 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
16667 TRI.getRegSizeInBits(*
TII->getOpRegClass(
MI, DstIdx)) / 32;
16668 if (DstSize < InitIdx)
16671 InitIdx =
TRI.getRegSizeInBits(*
TII->getOpRegClass(
MI, DstIdx)) / 32;
16679 Register PrevDst =
MRI.cloneVirtualRegister(
MI.getOperand(DstIdx).getReg());
16680 unsigned NewDst = 0;
16689 for (; SizeLeft; SizeLeft--, CurrIdx++) {
16690 NewDst =
MRI.createVirtualRegister(
TII->getOpRegClass(
MI, DstIdx));
16710 MI.tieOperands(DstIdx,
MI.getNumOperands() - 1);
16723 if (
TII->isVOP3(
MI.getOpcode())) {
16725 TII->legalizeOperandsVOP3(
MRI,
MI);
16730 if (!
MI.getDesc().operands().empty()) {
16731 unsigned Opc =
MI.getOpcode();
16732 bool HasAGPRs =
Info->mayNeedAGPRs();
16734 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src2);
16736 {AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src0),
16737 AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src1), Src2Idx}) {
16740 if ((
I == Src2Idx) && (HasAGPRs))
16743 if (!
Op.isReg() || !
Op.getReg().isVirtual())
16745 auto *RC =
TRI->getRegClassForReg(
MRI,
Op.getReg());
16746 if (!
TRI->hasAGPRs(RC))
16748 auto *Src =
MRI.getUniqueVRegDef(
Op.getReg());
16749 if (!Src || !Src->isCopy() ||
16750 !
TRI->isSGPRReg(
MRI, Src->getOperand(1).getReg()))
16752 auto *NewRC =
TRI->getEquivalentVGPRClass(RC);
16756 MRI.setRegClass(
Op.getReg(), NewRC);
16759 if (
TII->isMAI(
MI)) {
16764 int Src0Idx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
16765 AMDGPU::OpName::scale_src0);
16766 if (Src0Idx != -1) {
16767 int Src1Idx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
16768 AMDGPU::OpName::scale_src1);
16769 if (
TII->usesConstantBus(
MRI,
MI, Src0Idx) &&
16770 TII->usesConstantBus(
MRI,
MI, Src1Idx))
16771 TII->legalizeOpWithMove(
MI, Src1Idx);
16779 if (
auto *Src2 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src2)) {
16780 if (Src2->isReg() && Src2->getReg().isVirtual()) {
16781 auto *RC =
TRI->getRegClassForReg(
MRI, Src2->getReg());
16782 if (
TRI->isVectorSuperClass(RC)) {
16783 auto *NewRC =
TRI->getEquivalentAGPRClass(RC);
16784 MRI.setRegClass(Src2->getReg(), NewRC);
16785 if (Src2->isTied())
16786 MRI.setRegClass(
MI.getOperand(0).getReg(), NewRC);
16795 if (
TII->isImage(
MI))
16796 TII->enforceOperandRCAlignment(
MI, AMDGPU::OpName::vaddr);
16870std::pair<unsigned, const TargetRegisterClass *>
16877 if (Constraint.
size() == 1) {
16881 if (VT == MVT::Other)
16884 switch (Constraint[0]) {
16891 RC = &AMDGPU::SReg_32RegClass;
16894 RC = &AMDGPU::SGPR_64RegClass;
16899 return std::pair(0U,
nullptr);
16907 : &AMDGPU::VGPR_32RegClass;
16912 return std::pair(0U,
nullptr);
16921 RC = &AMDGPU::AGPR_32RegClass;
16926 return std::pair(0U,
nullptr);
16935 RC = &AMDGPU::AV_32RegClass;
16938 RC =
TRI->getVectorSuperClassForBitWidth(
BitWidth);
16940 return std::pair(0U,
nullptr);
16949 return std::pair(0U, RC);
16952 if (Kind !=
'\0') {
16954 RC = &AMDGPU::VGPR_32RegClass;
16955 }
else if (Kind ==
's') {
16956 RC = &AMDGPU::SGPR_32RegClass;
16957 }
else if (Kind ==
'a') {
16958 RC = &AMDGPU::AGPR_32RegClass;
16964 return std::pair(0U,
nullptr);
16970 return std::pair(0U,
nullptr);
16974 RC =
TRI->getVGPRClassForBitWidth(Width);
16976 RC =
TRI->getSGPRClassForBitWidth(Width);
16978 RC =
TRI->getAGPRClassForBitWidth(Width);
16980 Reg =
TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
16985 return std::pair(0U,
nullptr);
16987 return std::pair(Reg, RC);
16993 return std::pair(0U,
nullptr);
16994 if (Idx < RC->getNumRegs())
17001 Ret.second =
TRI->getPhysRegBaseClass(Ret.first);
17007 if (Constraint.
size() == 1) {
17008 switch (Constraint[0]) {
17018 }
else if (Constraint ==
"DA" || Constraint ==
"DB") {
17026 if (Constraint.
size() == 1) {
17027 switch (Constraint[0]) {
17035 }
else if (Constraint.
size() == 2) {
17036 if (Constraint ==
"VA")
17047 Val = Val & maskTrailingOnes<uint64_t>(
Size);
17054 std::vector<SDValue> &Ops,
17069 unsigned Size =
Op.getScalarValueSizeInBits();
17077 Val =
C->getSExtValue();
17081 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
17087 if (
Op.getOperand(0).isUndef() ||
Op.getOperand(1).isUndef())
17090 Val =
C->getSExtValue();
17094 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
17104 if (Constraint.
size() == 1) {
17105 switch (Constraint[0]) {
17109 return isInt<16>(Val);
17113 return isInt<32>(Val);
17120 }
else if (Constraint.
size() == 2) {
17121 if (Constraint ==
"DA") {
17122 int64_t HiBits =
static_cast<int32_t
>(Val >> 32);
17123 int64_t LoBits =
static_cast<int32_t
>(Val);
17127 if (Constraint ==
"DB") {
17135 unsigned MaxSize)
const {
17136 unsigned Size = std::min<unsigned>(
Op.getScalarValueSizeInBits(), MaxSize);
17139 MVT VT =
Op.getSimpleValueType();
17164 switch (UnalignedClassID) {
17165 case AMDGPU::VReg_64RegClassID:
17166 return AMDGPU::VReg_64_Align2RegClassID;
17167 case AMDGPU::VReg_96RegClassID:
17168 return AMDGPU::VReg_96_Align2RegClassID;
17169 case AMDGPU::VReg_128RegClassID:
17170 return AMDGPU::VReg_128_Align2RegClassID;
17171 case AMDGPU::VReg_160RegClassID:
17172 return AMDGPU::VReg_160_Align2RegClassID;
17173 case AMDGPU::VReg_192RegClassID:
17174 return AMDGPU::VReg_192_Align2RegClassID;
17175 case AMDGPU::VReg_224RegClassID:
17176 return AMDGPU::VReg_224_Align2RegClassID;
17177 case AMDGPU::VReg_256RegClassID:
17178 return AMDGPU::VReg_256_Align2RegClassID;
17179 case AMDGPU::VReg_288RegClassID:
17180 return AMDGPU::VReg_288_Align2RegClassID;
17181 case AMDGPU::VReg_320RegClassID:
17182 return AMDGPU::VReg_320_Align2RegClassID;
17183 case AMDGPU::VReg_352RegClassID:
17184 return AMDGPU::VReg_352_Align2RegClassID;
17185 case AMDGPU::VReg_384RegClassID:
17186 return AMDGPU::VReg_384_Align2RegClassID;
17187 case AMDGPU::VReg_512RegClassID:
17188 return AMDGPU::VReg_512_Align2RegClassID;
17189 case AMDGPU::VReg_1024RegClassID:
17190 return AMDGPU::VReg_1024_Align2RegClassID;
17191 case AMDGPU::AReg_64RegClassID:
17192 return AMDGPU::AReg_64_Align2RegClassID;
17193 case AMDGPU::AReg_96RegClassID:
17194 return AMDGPU::AReg_96_Align2RegClassID;
17195 case AMDGPU::AReg_128RegClassID:
17196 return AMDGPU::AReg_128_Align2RegClassID;
17197 case AMDGPU::AReg_160RegClassID:
17198 return AMDGPU::AReg_160_Align2RegClassID;
17199 case AMDGPU::AReg_192RegClassID:
17200 return AMDGPU::AReg_192_Align2RegClassID;
17201 case AMDGPU::AReg_256RegClassID:
17202 return AMDGPU::AReg_256_Align2RegClassID;
17203 case AMDGPU::AReg_512RegClassID:
17204 return AMDGPU::AReg_512_Align2RegClassID;
17205 case AMDGPU::AReg_1024RegClassID:
17206 return AMDGPU::AReg_1024_Align2RegClassID;
17222 if (
Info->isEntryFunction()) {
17229 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17231 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17232 :
TRI->getAlignedHighSGPRForRC(MF, 2,
17233 &AMDGPU::SGPR_64RegClass);
17234 Info->setSGPRForEXECCopy(SReg);
17237 Info->getStackPtrOffsetReg()));
17238 if (
Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17239 MRI.replaceRegWith(AMDGPU::SP_REG,
Info->getStackPtrOffsetReg());
17243 if (
Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17244 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG,
Info->getScratchRSrcReg());
17246 if (
Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17247 MRI.replaceRegWith(AMDGPU::FP_REG,
Info->getFrameOffsetReg());
17249 Info->limitOccupancy(MF);
17251 if (ST.isWave32() && !MF.
empty()) {
17252 for (
auto &
MBB : MF) {
17253 for (
auto &
MI :
MBB) {
17254 TII->fixImplicitOperands(
MI);
17264 if (ST.needsAlignedVGPRs()) {
17265 for (
unsigned I = 0, E =
MRI.getNumVirtRegs();
I != E; ++
I) {
17271 if (NewClassID != -1)
17272 MRI.setRegClass(Reg,
TRI->getRegClass(NewClassID));
17281 const APInt &DemandedElts,
17283 unsigned Depth)
const {
17285 unsigned Opc =
Op.getOpcode();
17288 unsigned IID =
Op.getConstantOperandVal(0);
17290 case Intrinsic::amdgcn_mbcnt_lo:
17291 case Intrinsic::amdgcn_mbcnt_hi: {
17297 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17307 Op, Known, DemandedElts, DAG,
Depth);
17323 unsigned MaxValue =
17330 unsigned BFEWidth, bool SExt, unsigned Depth) {
17334 unsigned Src1Cst = 0;
17335 if (Src1.isImm()) {
17336 Src1Cst = Src1.getImm();
17337 } else if (Src1.isReg()) {
17341 Src1Cst = Cst->Value.getZExtValue();
17349 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
17350 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
17352 if (Width >= BFEWidth)
17361 Known = Known.sext(BFEWidth);
17363 Known = Known.zext(BFEWidth);
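// [Illustrative aside, not from the original source] The S_BFE source-1 immediate
// packs the field offset in its low bits (5 bits for the 32-bit forms, 6 for the
// 64-bit forms) and the field width in bits [21:16]. A minimal decoder under that
// encoding (hypothetical helper):
struct BFEField { unsigned Offset, Width; };
static BFEField decodeBFEImm(unsigned Imm, unsigned BFEWidth) {
  unsigned OffsetBits = (BFEWidth == 32) ? 5 : 6;
  return {Imm & ((1u << OffsetBits) - 1), (Imm >> 16) & 0x3fu};
}
// e.g. an immediate of 0x00080008 on S_BFE_U32 describes an 8-bit field at offset 8.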
17369 unsigned Depth)
const {
17372 switch (
MI->getOpcode()) {
17373 case AMDGPU::S_BFE_I32:
17376 case AMDGPU::S_BFE_U32:
17379 case AMDGPU::S_BFE_I64:
17382 case AMDGPU::S_BFE_U64:
17385 case AMDGPU::G_INTRINSIC:
17386 case AMDGPU::G_INTRINSIC_CONVERGENT: {
17389 case Intrinsic::amdgcn_workitem_id_x:
17392 case Intrinsic::amdgcn_workitem_id_y:
17395 case Intrinsic::amdgcn_workitem_id_z:
17398 case Intrinsic::amdgcn_mbcnt_lo:
17399 case Intrinsic::amdgcn_mbcnt_hi: {
17411 case Intrinsic::amdgcn_groupstaticsize: {
17422 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
17425 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
17428 case AMDGPU::G_AMDGPU_SMED3:
17429 case AMDGPU::G_AMDGPU_UMED3: {
17430 auto [Dst, Src0, Src1, Src2] =
MI->getFirst4Regs();
17457 unsigned Depth)
const {
17459 if (
auto *GI = dyn_cast<GIntrinsic>(
MI)) {
17466 if (
MaybeAlign RetAlign = Attrs.getRetAlignment())
17493 if (Header->getAlignment() != PrefAlign)
17494 return Header->getAlignment();
17496 unsigned LoopSize = 0;
17504 LoopSize +=
TII->getInstSizeInBytes(
MI);
17505 if (LoopSize > 192)
17510 if (LoopSize <= 64)
17513 if (LoopSize <= 128)
17514 return CacheLineAlign;
17520 auto I = Exit->getFirstNonDebugInstr();
17521 if (
I != Exit->end() &&
I->getOpcode() == AMDGPU::S_INST_PREFETCH)
17522 return CacheLineAlign;
17531 if (PreTerm == Pre->
begin() ||
17532 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
17536 auto ExitHead = Exit->getFirstNonDebugInstr();
17537 if (ExitHead == Exit->end() ||
17538 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
17543 return CacheLineAlign;
17551 N =
N->getOperand(0).getNode();
17561 switch (
N->getOpcode()) {
17569 if (Reg.isPhysical() ||
MRI.isLiveIn(Reg))
17570 return !
TRI->isSGPRReg(
MRI, Reg);
17576 return !
TRI->isSGPRReg(
MRI, Reg);
17580 unsigned AS = L->getAddressSpace();
17611 if (
auto *
A = dyn_cast<AtomicSDNode>(
N)) {
17613 return A->readMem() &&
A->writeMem();
17646 const APInt &DemandedElts,
17649 unsigned Depth)
const {
17654 if (
Info->getMode().DX10Clamp)
17666 if (RMW->
hasMetadata(
"amdgpu.ignore.denormal.mode"))
17686 <<
"Hardware instruction generated for atomic "
17688 <<
" operation at memory scope " << MemScope;
17692 if (
auto *VT = dyn_cast<FixedVectorType>(Ty)) {
17693 Type *EltTy = VT->getElementType();
17694 return VT->getNumElements() == 2 &&
17713 if (
auto *
IT = dyn_cast<IntegerType>(Ty)) {
17714 unsigned BW =
IT->getBitWidth();
17715 return BW == 32 || BW == 64;
17727 if (
PointerType *PT = dyn_cast<PointerType>(Ty)) {
17729 unsigned BW =
DL.getPointerSizeInBits(PT->getAddressSpace());
17730 return BW == 32 || BW == 64;
17737 return VT->getNumElements() == 2 &&
17738 VT->getElementType()->getPrimitiveSizeInBits() == 16;
17748 bool HasSystemScope) {
17755 if (HasSystemScope) {
17764 return RMW->
hasMetadata(
"amdgpu.no.fine.grained.memory");
17777 const MDNode *MD =
I->getMetadata(LLVMContext::MD_noalias_addrspace);
17803 DL.getTypeSizeInBits(RMW->
getType()) == 64 &&
17816 bool HasSystemScope =
17858 if (!HasSystemScope &&
17871 if (RMW->
hasMetadata(
"amdgpu.no.fine.grained.memory"))
17879 ConstVal && ConstVal->isNullValue())
18099 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18100 return Subtarget->
isWave64() ? &AMDGPU::SReg_64RegClass
18101 : &AMDGPU::SReg_32RegClass;
18102 if (!
TRI->isSGPRClass(RC) && !isDivergent)
18103 return TRI->getEquivalentSGPRClass(RC);
18104 if (
TRI->isSGPRClass(RC) && isDivergent)
18105 return TRI->getEquivalentVGPRClass(RC);
18117 unsigned WaveSize) {
18122 if (!
IT ||
IT->getBitWidth() != WaveSize)
18125 if (!isa<Instruction>(V))
18127 if (!Visited.
insert(V).second)
18129 bool Result =
false;
18130 for (
const auto *U : V->users()) {
18131 if (
const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
18132 if (V == U->getOperand(1)) {
18133 switch (Intrinsic->getIntrinsicID()) {
18137 case Intrinsic::amdgcn_if_break:
18138 case Intrinsic::amdgcn_if:
18139 case Intrinsic::amdgcn_else:
18144 if (V == U->getOperand(0)) {
18145 switch (Intrinsic->getIntrinsicID()) {
18149 case Intrinsic::amdgcn_end_cf:
18150 case Intrinsic::amdgcn_loop:
18156 Result =
hasCFUser(U, Visited, WaveSize);
18165 const Value *V)
const {
18166 if (
const CallInst *CI = dyn_cast<CallInst>(V)) {
18167 if (CI->isInlineAsm()) {
18176 for (
auto &TC : TargetConstraints) {
18218 return MRI.hasOneNonDBGUse(N0);
18225 if (
I.getMetadata(
"amdgpu.noclobber"))
18227 if (
I.getMetadata(
"amdgpu.last.use"))
18237 if (!Def->isMachineOpcode())
18247 if (
II.isCompare() &&
II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18248 PhysReg = AMDGPU::SCC;
18250 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18305 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
18316 Alignment = RMW->getAlign();
18329 bool FullFlatEmulation =
18333 RMW->getType()->isDoubleTy()));
18336 bool ReturnValueIsUsed = !AI->
use_empty();
18345 if (FullFlatEmulation) {
18356 std::prev(BB->
end())->eraseFromParent();
18359 Value *LoadedShared =
nullptr;
18360 if (FullFlatEmulation) {
18362 {
Addr},
nullptr,
"is.shared");
18363 Builder.
CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
18371 LoadedShared = Clone;
18378 {
Addr},
nullptr,
"is.private");
18386 Value *LoadedPrivate;
18389 RMW->getType(), CastToPrivate, RMW->getAlign(),
"loaded.private");
18392 LoadedPrivate, RMW->getValOperand());
18396 auto [ResultLoad, Equal] =
18411 if (FullFlatEmulation) {
18421 if (!FullFlatEmulation) {
18426 MDNode *RangeNotPrivate =
18429 LoadedGlobal->
setMetadata(LLVMContext::MD_noalias_addrspace,
18437 if (ReturnValueIsUsed) {
18440 if (FullFlatEmulation)
18451 unsigned PtrOpIdx) {
18452 Value *PtrOp =
I->getOperand(PtrOpIdx);
18459 I->setOperand(PtrOpIdx, ASCast);
18470 if (
const auto *ConstVal = dyn_cast<Constant>(AI->
getValOperand());
18471 ConstVal && ConstVal->isNullValue()) {
18501 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
18509 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
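The packing that description refers to amounts to placing one 16-bit component in the low half of a dword and the next in the high half; a minimal sketch of that arithmetic (plain C++, illustrative only, not the GlobalISel helper itself, and assuming low-component-first packing):
static unsigned packTwo16BitComponents(unsigned short Lo, unsigned short Hi) {
  return (unsigned)Lo | ((unsigned)Hi << 16); // Lo -> bits [15:0], Hi -> bits [31:16]
}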
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)

static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic operations.
const HexagonInstrInfo * TII
iv Induction Variable Users
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static cl::opt< bool > UseSelectionDAGPTRADD("amdgpu-use-sdag-ptradd", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " "SelectionDAG ISel"), cl::init(false))
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
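A minimal sketch of the idea behind that check, assuming the excluded address spaces arrive as half-open [Low, High) ranges (as with !noalias.addrspace metadata) and that AMDGPU's private address space is number 5:
#include <utility>
#include <vector>
static bool mayAccessPrivate(const std::vector<std::pair<unsigned, unsigned>> &Excluded) {
  const unsigned PrivateAS = 5; // AMDGPUAS private address space
  for (const auto &R : Excluded)
    if (R.first <= PrivateAS && PrivateAS < R.second)
      return false; // private memory is excluded, so the flat access cannot reach it
  return true;      // no range rules it out; conservatively assume it may
}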
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static uint32_t getIdentityValueForWaveReduction(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsic ID.
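In SelectionDAG terms, operand 0 of such a node is the chain, so a plain load finds its pointer at operand 1, while stores and chained intrinsic nodes carry an extra leading operand (the stored value or the intrinsic ID) that pushes the pointer one slot later. A hedged sketch of that operand-index rule (illustrative only):
static unsigned basePtrOperandIndex(bool HasExtraLeadingOperand) {
  return HasExtraLeadingOperand ? 2 : 1; // chain is always operand 0
}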
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
bool hasBF16PackedInsts() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
bool hasCvtPkF16F32Inst() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool hasBF16ConversionInsts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
bool hasBF16TransInsts() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
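As an illustration of how a hook of this shape is usually filled in, here is a hedged sketch (MyTargetLowering and MyISD::LOW16_OP are hypothetical names, not the in-tree implementation) that reports the upper half of a 32-bit result as known zero for a node that only ever defines the low 16 bits:

// Hypothetical sketch of a computeKnownBitsForTargetNode override.
void MyTargetLowering::computeKnownBitsForTargetNode(
    const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
    const SelectionDAG &DAG, unsigned Depth) const {
  Known.resetAll();                       // start from "nothing known"
  if (Op.getOpcode() == MyISD::LOW16_OP)  // hypothetical target opcode
    Known.Zero.setHighBits(16);           // bits [16, 32) are known zero
}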
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
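A small self-contained sketch of the APFloat factories and conversion routine listed above (the specific values are chosen only for illustration):

#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
using namespace llvm;

static uint64_t qnanF32Bits() {
  // Build a single-precision quiet NaN and inspect its raw encoding.
  APFloat QNaN = APFloat::getQNaN(APFloat::IEEEsingle());
  APInt Bits = QNaN.bitcastToAPInt();   // 32-bit pattern
  return Bits.getZExtValue();           // 0x7fc00000 for the default QNaN
}

static APFloat largestHalfAsFloat() {
  // Largest finite f16, widened to f32 (exact, so LosesInfo stays false).
  APFloat V = APFloat::getLargest(APFloat::IEEEhalf());
  bool LosesInfo = false;
  V.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &LosesInfo);
  return V;
}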
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
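A brief sketch of the APInt mask helpers referenced here, with purely illustrative widths and values:

#include "llvm/ADT/APInt.h"
using namespace llvm;

static void apintMaskExamples() {
  // Top 8 bits of a 32-bit value set: 0xFF000000.
  APInt HighMask = APInt::getHighBitsSet(32, 8);

  // Bits [4, 12) set: 0x00000FF0.
  APInt MidMask = APInt::getBitsSet(32, 4, 12);

  // Start from zero and set everything from bit 16 upward: 0xFFFF0000.
  APInt FromBit(32, 0);
  FromBit.setBitsFrom(16);

  (void)HighMask.countr_zero(); // 24 trailing zeros in 0xFF000000
  (void)MidMask.isZero();       // false
}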
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
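The BinOp enumeration above is what a shouldExpandAtomicRMWInIR-style hook typically switches over. The fragment below is a simplified, hedged sketch of that pattern, not the actual AMDGPU policy, which also consults the subtarget, address space, ordering, and metadata:

#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Simplified sketch: choose an expansion strategy from the rmw operation.
static TargetLowering::AtomicExpansionKind
classifyRMW(const AtomicRMWInst *RMW) {
  switch (RMW->getOperation()) {
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax:
    // Floating-point ops often need a compare-and-swap loop when no
    // native instruction is available.
    return TargetLowering::AtomicExpansionKind::CmpXChg;
  default:
    return TargetLowering::AtomicExpansionKind::None;
  }
}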
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents the known origin of an individual byte in a combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
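To show how CCState and CCValAssign fit together, here is a hedged sketch of the usual formal-argument analysis loop; CallConv, IsVarArg, Ins, and DAG are assumed to come from the surrounding LowerFormalArguments, and the register/stack handling is elided:

// Sketch of the standard calling-convention analysis pattern.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
               *DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, IsVarArg));

for (const CCValAssign &VA : ArgLocs) {
  if (VA.isRegLoc()) {
    Register Reg = VA.getLocReg();        // argument arrives in a register
    // ... copy the physical register into a virtual register ...
  } else {
    int64_t Off = VA.getLocMemOffset();   // argument lives on the stack
    // ... create a fixed frame index at Off and load from it ...
  }
}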
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
bool isFPPredicate() const
bool isIntPredicate() const
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMemoryAtomicFaddF32DenormalSupport() const
bool hasD16Images() const
bool hasMinimum3Maximum3F32() const
bool useVGPRIndexMode() const
bool hasAtomicDsPkAdd16Insts() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool hasAtomicFMinFMaxF64FlatInsts() const
bool hasDot7Insts() const
bool hasApertureRegs() const
bool hasFlatInstOffsets() const
bool hasAtomicFMinFMaxF32FlatInsts() const
bool hasIEEEMinimumMaximumInsts() const
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasRelaxedBufferOOBMode() const
bool hasBCNT(unsigned Size) const
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
bool hasMultiDwordFlatScratchAddressing() const
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
const SIInstrInfo * getInstrInfo() const override
bool hasDot1Insts() const
bool hasAtomicFaddRtnInsts() const
bool hasSafeSmemPrefetch() const
Align getStackAlignment() const
bool hasScalarSubwordLoads() const
bool enableFlatScratch() const
bool hasMin3Max3PKF16() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
bool hasFmaMixBF16Insts() const
bool hasVMemToLDSLoad() const
bool supportsGetDoorbellID() const
bool hasFlatAtomicFaddF32Inst() const
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasBVHDualAndBVH8Insts() const
bool hasMinimum3Maximum3PKF16() const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
bool hasGloballyAddressableScratch() const
bool has64BitLiterals() const
TrapHandlerAbi getTrapHandlerAbi() const
const SIFrameLowering * getFrameLowering() const override
bool hasMinimum3Maximum3F16() const
bool hasAtomicFMinFMaxF32GlobalInsts() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasGFX10_AEncoding() const
unsigned getMaxNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
bool hasEmulatedSystemScopeAtomics() const
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
bool getScalarizeGlobalBehavior() const
bool hasScalarSMulU64() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasIntMinMax64() const
bool hasShaderCyclesHiLoRegisters() const
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
bool hasVmemPrefInsts() const
bool hasAtomicFMinFMaxF64GlobalInsts() const
bool hasUnalignedScratchAccessEnabled() const
bool hasAtomicFlatPkAdd16Insts() const
bool hasUnalignedBufferAccessEnabled() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasImageGather4D16Bug() const
bool hasDot10Insts() const
bool supportsMinMaxDenormModes() const
bool hasAtomicFaddInsts() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
bool hasAtomicBufferPkAddBF16Inst() const
bool hasAtomicFaddNoRtnInsts() const
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDot8Insts() const
bool hasVectorMulU64() const
bool hasDS96AndDS128() const
bool useFlatForGlobal() const
Generation getGeneration() const
bool hasAtomicBufferGlobalPkAddF16Insts() const
bool hasScalarAddSub64() const
bool hasUnpackedD16VMem() const
bool hasAtomicGlobalPkAddBF16Inst() const
bool hasFmaMixInsts() const
bool hasPackedTID() const
bool hasAddNoCarry() const
bool hasGWSAutoReplay() const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
BasicBlock::iterator GetInsertPoint() const
BasicBlock * GetInsertBlock() const
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
LLVMContext & getContext() const
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
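A compact, self-contained sketch tying the IRBuilder calls above together: it builds a two-way branch that merges through a PHI. All names are illustrative, and the entry block is assumed not to have a terminator yet; A and B are assumed to dominate the merge point (e.g. function arguments).

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Emit: X = Cond ? A : B, via explicit control flow and a PHI.
static PHINode *emitSelectViaBranch(Function *F, Value *Cond, Value *A,
                                    Value *B) {
  LLVMContext &Ctx = F->getContext();
  BasicBlock *Entry = &F->getEntryBlock();
  BasicBlock *Then  = BasicBlock::Create(Ctx, "then", F);
  BasicBlock *Else  = BasicBlock::Create(Ctx, "else", F);
  BasicBlock *Merge = BasicBlock::Create(Ctx, "merge", F);

  IRBuilder<> IRB(Entry);                 // insert at the end of Entry
  IRB.CreateCondBr(Cond, Then, Else);

  IRB.SetInsertPoint(Then);
  IRB.CreateBr(Merge);
  IRB.SetInsertPoint(Else);
  IRB.CreateBr(Merge);

  IRB.SetInsertPoint(Merge);
  PHINode *Phi = IRB.CreatePHI(A->getType(), /*NumReservedValues=*/2, "merged");
  Phi->addIncoming(A, Then);
  Phi->addIncoming(B, Else);
  return Phi;
}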
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
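A short sketch of the LLT helpers above as they are typically used on the GlobalISel side; the address-space number is illustrative, and the header path differs between LLVM releases:

#include "llvm/CodeGenTypes/LowLevelType.h" // older releases: llvm/CodeGen/LowLevelType.h
using namespace llvm;

static void lltExamples() {
  LLT S32 = LLT::scalar(32);         // plain 32-bit scalar
  LLT P1  = LLT::pointer(1, 64);     // 64-bit pointer in address space 1
  LLT S16 = S32.changeElementSize(16);

  (void)S32.isScalar();              // true
  (void)P1.getSizeInBits();          // TypeSize of 64
  (void)S16.getScalarSizeInBits();   // 16
}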
This is an important class for using LLVM in a threaded context.
LLVM_ABI std::optional< StringRef > getSyncScopeName(SyncScope::ID Id) const
getSyncScopeName - Returns the name of a SyncScope::ID registered with LLVMContext,...
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
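Likewise, a tiny sketch of the MVT helpers above (header path varies across LLVM releases):

#include "llvm/CodeGenTypes/MachineValueType.h" // older releases: llvm/Support/MachineValueType.h
using namespace llvm;

static void mvtExamples() {
  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);   // v4i32
  MVT I64   = MVT::getIntegerVT(64);           // i64

  (void)V4I32.getVectorNumElements();          // 4
  (void)V4I32.getScalarType();                 // i32
  (void)V4I32.getSizeInBits().getFixedValue(); // 128
  (void)I64.getStoreSize();                    // 8 bytes
  (void)V4I32.isPow2VectorType();              // true
}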
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
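These MachineBasicBlock primitives, together with the MachineFunction block-creation calls listed further below, are what the splitBlockForLoop-style helpers above are built from. The fragment below is a hedged outline of that pattern (names illustrative, not a verbatim copy of the in-tree code):

// Outline: split MBB after MI into  MBB -> LoopBB -> RemainderBB.
MachineFunction *MF = MBB.getParent();
const BasicBlock *LLVMBB = MBB.getBasicBlock();

MachineBasicBlock *LoopBB      = MF->CreateMachineBasicBlock(LLVMBB);
MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(LLVMBB);

MachineFunction::iterator InsertPt = std::next(MBB.getIterator());
MF->insert(InsertPt, LoopBB);
MF->insert(InsertPt, RemainderBB);

LoopBB->addSuccessor(LoopBB);        // back edge
LoopBB->addSuccessor(RemainderBB);

// Successors (and any PHIs in them) now belong to the remainder block,
// and every instruction after MI is moved over to it.
RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
RemainderBB->splice(RemainderBB->begin(), &MBB,
                    std::next(MI.getIterator()), MBB.end());
MBB.addSuccessor(LoopBB);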
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
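The addImm/addReg/addMBB helpers above chain off BuildMI. A minimal hedged example in the custom-inserter style; MBB, I, DL, TII, DstReg, and TargetMBB are assumed to come from the surrounding code:

// Materialize an immediate into an SGPR, then branch to a target block.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
    .addImm(0x1234);                 // immediate operand

BuildMI(MBB, I, DL, TII->get(AMDGPU::S_BRANCH))
    .addMBB(TargetMBB);              // basic-block operand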
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
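A hedged sketch of creating a MachineMemOperand with the flags listed above; the pointer info, type, and alignment are illustrative, and DAG is assumed to be the surrounding SelectionDAG:

// Describe a 4-byte invariant, dereferenceable load for a lowered node.
MachineFunction &MF = DAG.getMachineFunction();
MachineMemOperand *MMO = MF.getMachineMemOperand(
    MachinePointerInfo(),            // unknown location
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
        MachineMemOperand::MOInvariant,
    LLT::scalar(32), Align(4));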
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
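The MemSDNode accessors listed above are typically consulted before a combine rewrites a memory node. A small hedged fragment, with N assumed to be the SDNode under inspection:

// Inspect a memory node before rewriting it in a DAG combine.
if (auto *M = dyn_cast<MemSDNode>(N)) {
  EVT MemVT = M->getMemoryVT();        // in-memory type
  unsigned AS = M->getAddressSpace();  // address space of the pointer
  SDValue Chain = M->getChain();       // token chain to preserve
  if (MemVT == MVT::i32 && AS == AMDGPUAS::GLOBAL_ADDRESS) {
    // ... safe to fold; reuse M->getMemOperand() and Chain on the new node ...
  }
}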
A Module instance is used to store all the information related to an LLVM module.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
bool isInlineConstant(const APInt &Imm) const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
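These class predicates are commonly used to decide whether a virtual register holds a uniform (scalar) or divergent (vector) value. A short hedged fragment, with MRI and Reg assumed from the surrounding code:

// Decide between scalar and vector handling for a virtual register.
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
if (SIRegisterInfo::isSGPRClass(RC)) {
  // uniform value: scalar instructions are fine
} else if (SIRegisterInfo::isVGPRClass(RC) || SIRegisterInfo::isAGPRClass(RC)) {
  // divergent value: must stay in vector/accumulator registers
}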
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined with another node to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store using a target-specific method.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load using a target-specific method.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific method.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
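As a hedged illustration of how these SelectionDAG helpers compose, the sketch below clamps a negative value to zero using getConstant, getSetCC and getSelect; the function name and the i1 condition type are assumptions made for the example, not code from this file.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Sketch: build select (Op < 0) ? 0 : Op with the helpers listed above.
static SDValue clampToZeroExample(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();                 // e.g. MVT::i32
  SDValue Zero = DAG.getConstant(0, DL, VT);  // constant 0 of the same type
  // Compare Op < 0; i1 is used as the condition type for simplicity.
  SDValue IsNeg = DAG.getSetCC(DL, MVT::i1, Op, Zero, ISD::SETLT);
  // Yield 0 when negative, otherwise pass Op through unchanged.
  return DAG.getSelect(DL, VT, IsNeg, Zero, Op);
}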
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
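A hedged sketch of getMemIntrinsicNode: wrap a chained target intrinsic that reads four bytes through Ptr. The opcode choice, operand list, memory type and alignment are placeholders for illustration only.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Sketch: build a chained target intrinsic that loads an i32 through Ptr.
static SDValue memIntrinsicExample(SelectionDAG &DAG, const SDLoc &DL,
                                   SDValue Chain, SDValue Ptr, unsigned IID) {
  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); // result + out-chain
  SDValue Ops[] = {Chain, DAG.getTargetConstant(IID, DL, MVT::i32), Ptr};
  return DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
                                 /*MemVT=*/MVT::i32, MachinePointerInfo(),
                                 Align(4), MachineMemOperand::MOLoad);
}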
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
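To show how the splitting helpers above are typically used, here is a minimal hedged sketch that lowers a wide binary vector operation as two half-width operations; it mirrors the general split-and-concat pattern rather than this file's exact code.
#include "llvm/CodeGen/SelectionDAG.h"
#include <tuple>
using namespace llvm;

// Sketch: split both operands of a binary vector op, operate on the halves,
// and reassemble the result with CONCAT_VECTORS.
static SDValue splitBinaryVectorExample(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  EVT LoVT, HiVT;
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(Op.getValueType());

  SDValue Lo0, Hi0, Lo1, Hi1;
  std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
  std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);

  SDValue Lo = DAG.getNode(Op.getOpcode(), DL, LoVT, Lo0, Lo1);
  SDValue Hi = DAG.getNode(Op.getOpcode(), DL, HiVT, Hi0, Hi1);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, Op.getValueType(), Lo, Hi);
}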
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
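A short hedged sketch of the load/store helpers: load an i32, advance the pointer with getObjectPtrOffset, and store the value back out. The 4-byte offset and alignment are assumptions made for the example.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Sketch: load an i32 from Ptr and store it 4 bytes further along, threading
// the load's output chain into the store to preserve ordering.
static SDValue copyWordExample(SelectionDAG &DAG, const SDLoc &DL,
                               SDValue Chain, SDValue Ptr) {
  SDValue Val = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
  SDValue NewPtr = DAG.getObjectPtrOffset(DL, Ptr, TypeSize::getFixed(4));
  return DAG.getStore(Val.getValue(1), DL, Val, NewPtr, MachinePointerInfo(),
                      Align(4));
}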
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
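A hedged sketch of the known-bits queries: check whether the two low bits of a value are provably zero, first with MaskedValueIsZero and then via the KnownBits result itself; the helper name is invented for the example.
#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;

// Sketch: return true if the low two bits of Op are known to be zero.
static bool lowTwoBitsKnownZeroExample(SelectionDAG &DAG, SDValue Op) {
  unsigned BitWidth = Op.getScalarValueSizeInBits();
  if (DAG.MaskedValueIsZero(Op, APInt::getLowBitsSet(BitWidth, 2)))
    return true;
  // The same query spelled out through computeKnownBits.
  KnownBits Known = DAG.computeKnownBits(Op);
  return Known.countMinTrailingZeros() >= 2;
}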
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
StringRef - Represent a constant reference to a string, i.e.
constexpr size_t size() const
size - Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
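A minimal usage sketch for StringSwitch, mapping a hypothetical register-class tag to a small integer code; the tag names are placeholders.
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

// Sketch: classify a textual tag, defaulting to -1 for unknown strings.
static int classifyTagExample(StringRef Tag) {
  return StringSwitch<int>(Tag)
      .Case("sgpr", 0)
      .Case("vgpr", 1)
      .Case("agpr", 2)
      .Default(-1);
}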
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
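The legalization hooks above are normally called from a target's lowering-info constructor. The hedged sketch below shows the general shape for a hypothetical MyTargetLowering; the chosen opcodes, types and actions are placeholders, not what this file actually configures.
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

namespace {
// Sketch: a made-up target configuring a few legalization actions.
class MyTargetLowering : public TargetLowering {
public:
  explicit MyTargetLowering(const TargetMachine &TM) : TargetLowering(TM) {
    // No native i64 multiply: let legalization expand it.
    setOperationAction(ISD::MUL, MVT::i64, Expand);
    // i16 adds are done in i32 registers: promote and record the wider type.
    setOperationAction(ISD::ADD, MVT::i16, Promote);
    AddPromotedToType(ISD::ADD, MVT::i16, MVT::i32);
    // Truncating f64 -> f32 stores need custom handling.
    setTruncStoreAction(MVT::f64, MVT::f32, Custom);
  }
};
} // end anonymous namespace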
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparisons with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op: at this point, we know that only the DemandedBits bits of the result of Op are ever used downstream.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isIntegerTy() const
True if this is an instance of IntegerType.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr bool isZero() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Overflow-aware nodes for subtraction (the subtraction analogue of [SU]ADDO).
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Overflow-aware nodes for multiplication (the multiplication analogue of [SU]ADDO).
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
@ System
Synchronized with respect to all concurrently executing threads.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int popcount(T Value) noexcept
Count the number of set bits in a value.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew modulo Align.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
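To ground the bit-manipulation helpers listed here, a small hedged sketch that splits a 64-bit immediate into halves and classifies it; the function is illustrative only.
#include "llvm/Support/MathExtras.h"
#include <cstdint>
using namespace llvm;

// Sketch: split a 64-bit immediate and, when it is a 32-bit power of two,
// return its log2; otherwise return -1.
static int classifyImmExample(uint64_t Imm) {
  uint32_t Lo = Lo_32(Imm); // low 32 bits
  uint32_t Hi = Hi_32(Imm); // high 32 bits
  if (Hi == 0 && isPowerOf2_32(Lo))
    return Log2_32(Lo);     // exact log2 since Lo is a power of two
  return -1;
}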
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Mul
Product of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
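A hedged sketch of the usual combine-style use of isConstOrConstSplat: recognise a shift by zero (scalar or splatted vector amount) and fold it away. This is a generic textbook fold, not necessarily one performed in this file.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Sketch: fold (shl X, 0) -> X; returns an empty SDValue when no fold applies.
static SDValue foldShiftByZeroExample(SDNode *N) {
  if (N->getOpcode() != ISD::SHL)
    return SDValue();
  if (ConstantSDNode *Amt = isConstOrConstSplat(N->getOperand(1)))
    if (Amt->isZero())
      return N->getOperand(0);
  return SDValue();
}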
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
constexpr unsigned BitWidth
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
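A brief hedged sketch of the EVT queries: derive the integer vector type with the same shape as a fixed-width floating-point vector type. The helper name is invented, and EVT::changeTypeToInteger performs the same conversion in one step.
#include "llvm/CodeGen/ValueTypes.h"
#include <cassert>
using namespace llvm;

// Sketch: map e.g. v4f32 -> v4i32 for use in a bitcast.
static EVT equivalentIntVectorExample(LLVMContext &Ctx, EVT VT) {
  assert(VT.isVector() && VT.isFloatingPoint() && "expected an FP vector type");
  EVT EltVT = EVT::getIntegerVT(Ctx, VT.getScalarSizeInBits());
  return EVT::getVectorVT(Ctx, EltVT, VT.getVectorNumElements());
}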
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
void resetAll()
Resets the known state of all bits.
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const