#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"
    cl::desc("Do not align and prefetch loops"),

    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),

    cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
                   {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                    MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                    MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                    MVT::i1, MVT::v32i32},
                   {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                    MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                    MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                    MVT::i1, MVT::v32i32},

                   {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

                   {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                    MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                    MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
                   {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                    MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                    MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
                   {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                    MVT::v3i16, MVT::v4i16, MVT::Other},
                   {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);

       {MVT::v8i32,  MVT::v8f32,  MVT::v9i32,  MVT::v9f32,  MVT::v10i32,
        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32, MVT::v16f32, MVT::v2i64,  MVT::v2f64,  MVT::v4i16,
        MVT::v4f16,  MVT::v4bf16, MVT::v3i64,  MVT::v3f64,  MVT::v6i32,
        MVT::v6f32,  MVT::v4i64,  MVT::v4f64,  MVT::v8i64,  MVT::v8f64,
        MVT::v8i16,  MVT::v8f16,  MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
        MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {

                   {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
                    MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
                   {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                    MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
                   {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
                   {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                    MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                    MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                    MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                   {MVT::f32, MVT::f64}, Custom);
                   {MVT::f32, MVT::f64}, Legal);

       {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
        MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
        MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {

                   {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
                    MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
                    MVT::v32f16, MVT::v32bf16},
                   {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                   {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
       {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
                   {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
                   {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
                    MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
                    MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},

  for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
  for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})

                   {MVT::v2f16, MVT::v4f16}, Custom);
                   {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
                   {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                    MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                    MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                    MVT::v32f16, MVT::v32bf16},
                   {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
                   {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                   {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                    MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
                   {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
                    MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
                    MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
                    MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
                   {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
                    MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
                    MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                    MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
                   {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
static const MCPhysReg RCRegs[] = {AMDGPU::MODE};

                                         EVT DestVT, EVT SrcVT) const {

                                         LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&

      return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
    return VT.isInteger() ? MVT::i32 : MVT::f32;

      return (NumElts + 1) / 2;
    return NumElts * ((Size + 31) / 32);
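// For calling-convention purposes, 16-bit elements are packed two per 32-bit
// register ((NumElts + 1) / 2 registers above); wider element types take one
// 32-bit register per 32 bits of element size (NumElts * ceil(Size / 32)).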
    unsigned &NumIntermediates, MVT &RegisterVT) const {

    if (ScalarVT == MVT::bf16) {
      RegisterVT = MVT::i32;
      IntermediateVT = MVT::v2bf16;
      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;
    NumIntermediates = (NumElts + 1) / 2;
    return NumIntermediates;

    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

  if (Size < 16 && Subtarget->has16BitInsts()) {
    RegisterVT = MVT::i16;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
                                 unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);
  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

                                 unsigned MaxNumLanes) {
  auto *ST = dyn_cast<StructType>(Ty);
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

    return MVT::amdgpuBufferFatPointer;
      DL.getPointerSizeInBits(AS) == 192)
    return MVT::amdgpuBufferStridedPointer;
          DL.getPointerSizeInBits(AS) == 160) ||
          DL.getPointerSizeInBits(AS) == 192))
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
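  // The async global<->LDS copy intrinsics above are grouped by transfer
  // width (8, 32, 64 or 128 bits); the load and store form of each width
  // fall through to the same case body.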
                                          unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

  if (RsrcIntr->IsImage) {
    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
    Info.ptrVal = RsrcArg;

  bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
    if (RsrcIntr->IsImage) {
      unsigned MaxNumLanes = 4;
          std::numeric_limits<unsigned>::max());
    if (RsrcIntr->IsImage) {
      unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
    if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
      Info.memVT = MVT::i32;

  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
        std::numeric_limits<unsigned>::max());
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
    Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
    Info.memVT = MVT::i64;
  case Intrinsic::amdgcn_global_atomic_csub: {
  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
        MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
                       : cast<StructType>(CI.getType())->getElementType(0));
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_atomic_cond_sub_u32: {
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64: {
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.memVT = MVT::i32;
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128: {
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
    Info.memVT = MVT::i32;
  case Intrinsic::amdgcn_s_prefetch_data:
  case Intrinsic::amdgcn_flat_prefetch:
  case Intrinsic::amdgcn_global_prefetch: {

  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
                                            Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_global_atomic_csub:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
    Ptr = II->getArgOperand(0);
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds:
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
    Ptr = II->getArgOperand(1);
  AccessTy = II->getType();
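  // The switch above reports which pointer each intrinsic addresses
  // (argument 0 for most of them, argument 1 for the to-LDS copy forms)
  // together with the accessed type, so addressing-mode legality can be
  // checked against the real memory operand.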
                                              unsigned AddrSpace) const {
  return AM.Scale == 0 &&
                                   AM.BaseOffs, AddrSpace, FlatVariant));

    return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
  if (AM.HasBaseReg) {
    return isLegalMUBUFAddressingMode(AM);

  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
               : isLegalMUBUFAddressingMode(AM);
    unsigned Size, unsigned AddrSpace, Align Alignment,

    Align RequiredAlignment(
        Alignment < RequiredAlignment)
      RequiredAlignment = Align(4);
        *IsFast = (Alignment >= RequiredAlignment) ? 64
                  : (Alignment < Align(4))         ? 32
        *IsFast = (Alignment >= RequiredAlignment) ? 96
                  : (Alignment < Align(4))         ? 32
      RequiredAlignment = Align(8);
        *IsFast = (Alignment >= RequiredAlignment) ? 128
                  : (Alignment < Align(4))         ? 32
        *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
      return Alignment >= RequiredAlignment ||

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;
    return Alignment >= Align(4) ||

  return Size >= 32 && Alignment >= Align(4);

    unsigned *IsFast) const {
                                            Alignment, Flags, IsFast);
  if (Op.size() >= 16 &&
  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                           unsigned DestAS) const {
  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);

                                               unsigned Index) const {
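// Kernel argument lowering below: arguments are fetched relative to the
// kernarg segment pointer. Sub-dword arguments are loaded with an aligned
// 4-byte load of the containing dword and then shifted/extended, which is
// what the AlignDownOffset / OffsetDiff computation implements.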
  auto [InputPtrReg, RC, ArgTy] =
      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                        const SDLoc &SL) const {

                                        const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(
    int64_t OffsetDiff = Offset - AlignDownOffset;
    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);

      ExtType, SL, VA.getLocVT(), Chain, FIN,
SDValue SITargetLowering::getPreloadedValue(
    Reg = &WorkGroupIDX;
    RC = &AMDGPU::SReg_32RegClass;
    Reg = &WorkGroupIDY;
    RC = &AMDGPU::SReg_32RegClass;
    Reg = &WorkGroupIDZ;
    RC = &AMDGPU::SReg_32RegClass;

  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
           "vector type argument should have been split");
    bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
           "unexpected vector split in ps argument type");
      Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);
  if (Info.hasWorkItemIDX()) {
  if (Info.hasWorkItemIDY()) {
      Info.setWorkItemIDY(
      unsigned Reg = AMDGPU::VGPR1;
  if (Info.hasWorkItemIDZ()) {
      Info.setWorkItemIDZ(
      unsigned Reg = AMDGPU::VGPR2;

  if (RegIdx == ArgVGPRs.size()) {
  unsigned Reg = ArgVGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

                                     unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())
  unsigned Reg = ArgSGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

  assert(Reg != AMDGPU::NoRegister);
  const unsigned Mask = 0x3ff;
  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);
  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);
  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;

  if (Info.hasImplicitArgPtr())
  if (Info.hasWorkGroupIDX())
  if (Info.hasWorkGroupIDY())
  if (Info.hasWorkGroupIDZ())
  if (Info.hasLDSKernelId())

    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
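  // Kernarg preloading below: 'inreg' arguments are assigned to consecutive
  // user SGPRs in argument order. The sequence stops at the first argument
  // that cannot be preloaded, and hidden (implicit) arguments are first
  // aligned up to the implicit-arg base via ImplicitArgOffset.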
  bool InPreloadSequence = true;
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    unsigned ArgIdx = Arg.getArgNo();
    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        ArgOffset += ImplicitArgOffset;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;

  if (Info.hasLDSKernelId()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
                                           bool IsShader) const {
    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    if (Info.hasWorkGroupIDY()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    if (Info.hasWorkGroupIDZ()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

         Info.getNumPreloadedSGPRs() >= 16);
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);
    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

    for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
      if (!MRI.isLiveIn(Reg)) {
        Info.setStackPtrOffsetReg(Reg);

  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);

  return !Info->isEntryFunction();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);
    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)
  bool IsError = false;

        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));

           !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
           !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());

           !Info->hasWorkGroupIDZ());

  bool IsWholeWaveFunc = Info->isWholeWaveFunction();

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
    if ((PsInputBits & 0x7F) == 0 ||
        ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
  } else if (IsKernel) {
    Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
  } else if (!IsGraphics) {
    Info->setNumWaveDispatchSGPRs(
    Info->setNumWaveDispatchVGPRs(
  } else if (Info->getNumKernargPreloadedSGPRs()) {
    Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
  if (IsWholeWaveFunc) {
                              {MVT::i1, MVT::Other}, Chain);

  for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
    if (IsEntryFunc && VA.isMemLoc()) {
      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
          int64_t OffsetDiff = Offset - AlignDownOffset;
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
                                 TRI->getRegSizeInBits(*RC)));
            for (auto Reg : PreloadRegs) {
                                             PreloadRegs.size()),
          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);
              "hidden argument in kernel signature was not preloaded",
            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));

    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

    if (AMDGPU::VGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::VGPR_32RegClass;
    else if (AMDGPU::SGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::SGPR_32RegClass;

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
    if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    SDValue Arg = OutVals[RealRVLocIdx];
                      ReadFirstLane, Arg);

  if (!Info->isEntryFunction()) {
      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))
    auto &ArgUsageInfo =
    CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

    const auto [OutgoingArg, ArgRC, ArgTy] =
    const auto [IncomingArg, IncomingArgRC, Ty] =
    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
      InputReg = getImplicitArgPtr(DAG, DL);
      std::optional<uint32_t> Id =
      if (Id.has_value()) {

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      unsigned SpecialArgOffset =

  auto [OutgoingArg, ArgRC, Ty] =
    std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
                       : IncomingArgY ? *IncomingArgY

  if (OutgoingArg->isRegister()) {
    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);

  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CallerPreserved)
  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
    if (!CCVA.isRegLoc())
    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
      dbgs() << "Cannot tail call due to divergent outgoing argument in "
enum ChainCallArgIdx {

  bool UsesDynamicVGPRs = false;
  if (IsChainCallConv) {
    auto RequestedExecIt =
          return Arg.OrigArgIndex == 2;
    assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");

    size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
    CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
           "Haven't popped all the special args");

        CLI.Args[ChainCallArgIdx::Exec];
      if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
            ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
        ChainCallSpecialArgs.push_back(Arg.Node);

    PushNodeOrTargetConstant(RequestedExecArg);

    const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
    if (FlagsValue.isZero()) {
      if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
                                 "no additional args allowed if flags == 0");
      if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
            CLI, InVals, "dynamic VGPR mode is only supported for wave32");
      UsesDynamicVGPRs = true;
      std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
                    CLI.Args.end(), PushNodeOrTargetConstant);
  bool IsSibCall = false;

                          "unsupported call to variadic function ");
                          "unsupported required tail call to function ");
        Outs, OutVals, Ins, DAG);
           "site marked musttail or on llvm.amdgcn.cs.chain");
  if (!TailCallOpt && IsTailCall)

  if (!IsSibCall || IsChainCallConv) {
    RegsToPass.emplace_back(IsChainCallConv
                                ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  const unsigned NumSpecialInputs = RegsToPass.size();

  MVT PtrVT = MVT::i32;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
      int32_t Offset = LocMemOffset;
      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
                            ? Flags.getNonZeroByValAlign()
      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
            Outs[i].Flags.getNonZeroByValAlign(),
            nullptr, std::nullopt, DstInfo,
            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {

  if (IsTailCall && !IsSibCall) {
  std::vector<SDValue> Ops({Chain});
    Ops.push_back(Callee);
    Ops.push_back(Callee);
    if (IsChainCallConv)
  for (auto &[Reg, Val] : RegsToPass)
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
                            MVT::Glue, GlueOps),
    Ops.push_back(InGlue);
    return DAG.getNode(OPC, DL, MVT::Other, Ops);
  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);
  uint64_t CalleePopBytes = NumBytes;
  EVT VT = Op.getValueType();
  Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
         "Stack grows upwards for AMDGPU");
  Chain = BaseAddr.getValue(1);
  if (Alignment > StackAlign) {
    uint64_t StackAlignMask = ScaledAlignment - 1;

  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
  if (isa<ConstantSDNode>(Size)) {

  if (Op.getValueType() != MVT::i32)

  assert(Op.getValueType() == MVT::i32);
      Op.getOperand(0), IntrinID, GetRoundBothImm);

  SDValue RoundModeTimesNumBits =
                                  TableEntry, EnumOffset);

  if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
        static_cast<uint32_t>(ConstMode->getZExtValue()),
    if (UseReducedTable) {
      SDValue RoundModeTimesNumBits =
      SDValue RoundModeTimesNumBits =
      NewMode = TruncTable;
                          ReadFirstLaneID, NewMode);
                     IntrinID, RoundBothImm, NewMode);
  if (Op->isDivergent() &&

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();

  if (Op.getValueType() != MVT::i64)
      Op.getOperand(0), IntrinID, ModeHwRegImm);
      Op.getOperand(0), IntrinID, TrapHwRegImm);

  if (Op.getOperand(1).getValueType() != MVT::i64)
                          ReadFirstLaneID, NewModeReg);
                          ReadFirstLaneID, NewTrapReg);
  unsigned ModeHwReg =
  unsigned TrapHwReg =
                     IntrinID, ModeHwRegImm, NewModeReg);
                     IntrinID, TrapHwRegImm, NewTrapReg);
          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
                 "\" for subtarget."));

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
  case AMDGPU::FLAT_SCR:

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
static std::pair<MachineBasicBlock *, MachineBasicBlock *>

  auto Next = std::next(I);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

    Src->setIsKill(false);

  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
    unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
    unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
      MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                 : AMDGPU::S_AND_SAVEEXEC_B64),

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;
      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
          TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                 : AMDGPU::S_XOR_B64_term),

    unsigned InitResultReg, unsigned PhiReg, int Offset,
    bool UseGPRIdxMode, Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

                        InitResultReg, DstReg, PhiReg, TmpExec,
                        Offset, UseGPRIdxMode, SGPRIdxReg);

  LoopBB->removeSuccessor(RemainderBB);
  LoopBB->addSuccessor(LandingPad);
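// The structure built above is a waterfall loop: V_READFIRSTLANE_B32 picks
// the index value of one active lane, V_CMP_EQ_U32 + S_AND_SAVEEXEC restrict
// EXEC to the lanes sharing that value, the indexed access executes for them,
// and the S_XOR_*_term clears those lanes so the loop repeats until every
// distinct index has been processed.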
static std::pair<unsigned, int>
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
    return std::pair(AMDGPU::sub0, Offset);

  assert(Idx->getReg() != AMDGPU::NoRegister);
    return Idx->getReg();
  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {
    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);
    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(VecRC);
                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)

  MI.eraseFromParent();
  case AMDGPU::S_MIN_U32:
    return std::numeric_limits<uint32_t>::max();
  case AMDGPU::S_MIN_I32:
    return std::numeric_limits<int32_t>::max();
  case AMDGPU::S_MAX_U32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_MAX_I32:
    return std::numeric_limits<int32_t>::min();
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_AND_B32:
    return std::numeric_limits<uint32_t>::max();
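  // Each value above is the identity element of its reduction: ~0u for
  // unsigned min and for AND, INT32_MAX for signed min, 0 for unsigned max
  // and for ADD/SUB/OR/XOR, and INT32_MIN for signed max.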
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));

    case AMDGPU::S_MIN_U32:
    case AMDGPU::S_MIN_I32:
    case AMDGPU::S_MAX_U32:
    case AMDGPU::S_MAX_I32:
    case AMDGPU::S_AND_B32:
    case AMDGPU::S_OR_B32: {

    case AMDGPU::S_XOR_B32:
    case AMDGPU::S_ADD_I32:
    case AMDGPU::S_SUB_I32: {
      Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
      Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);

      bool IsWave32 = ST.isWave32();
      unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
          IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;

      auto NewAccumulator =
          BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
              .addReg(Exec->getOperand(0).getReg());

      case AMDGPU::S_XOR_B32: {
        Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
            .addReg(NewAccumulator->getOperand(0).getReg())
            .addReg(ParityReg->getOperand(0).getReg());
      case AMDGPU::S_SUB_I32: {
        Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
        auto InvertedValReg =
            .addReg(InvertedValReg->getOperand(0).getReg())
            .addReg(NewAccumulator->getOperand(0).getReg());
      case AMDGPU::S_ADD_I32: {
            .addReg(NewAccumulator->getOperand(0).getReg());
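      // Uniform (SGPR) sources avoid the lane loop entirely: min/max/and/or
      // simply forward the value, while add/sub/xor scale it using S_BCNT1
      // of EXEC (xor keeps only the parity of the active-lane count, and sub
      // negates the product).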
    Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
    Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
    Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
    Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
    Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
    Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

    bool IsWave32 = ST.isWave32();
    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)

    I = ComputeLoop->end();
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
        .addReg(TmpSReg->getOperand(0).getReg())

    unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
    auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
                   .addReg(ActiveBits->getOperand(0).getReg());
    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                             TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
                         .addReg(FF1->getOperand(0).getReg());
        .addReg(LaneValue->getOperand(0).getReg());

    unsigned BITSETOpc =
        IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
    auto NewActiveBits =
        BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
            .addReg(FF1->getOperand(0).getReg())
            .addReg(ActiveBits->getOperand(0).getReg());

    Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
        .addMBB(ComputeLoop);
    ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
        .addMBB(ComputeLoop);

    unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
        .addReg(NewActiveBits->getOperand(0).getReg())
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))

  MI.eraseFromParent();
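  // Divergent sources use the loop built above: S_FF1 finds the next active
  // lane, V_READLANE_B32 fetches that lane's value, it is folded into the
  // accumulator, S_BITSET0 clears the lane in the working copy of EXEC, and
  // S_CBRANCH_SCC1 repeats until no active bits remain.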
  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       : AMDGPU::S_SUB_I32;
    MI.eraseFromParent();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {
    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
      unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
    MI.eraseFromParent();
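    // The expansion above splits the scalar 64-bit pseudo into a 32-bit low
    // half (S_ADD_U32 / S_SUB_U32) and a high half that consumes the carry
    // (S_ADDC_U32 / S_SUBB_U32) when no native S_ADD_U64/S_SUB_U64 exists.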
  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {
    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);

    if (ST.hasAddSubU64Insts()) {
                   TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
                                  : AMDGPU::V_SUB_U64_e64),
      TII->legalizeOperands(*I);
      MI.eraseFromParent();
    if (IsAdd && ST.hasLshlAddU64Inst()) {
      TII->legalizeOperands(*Add);
      MI.eraseFromParent();

    const auto *CarryRC = TRI->getWaveMaskRegClass();
    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
                                   : &AMDGPU::VReg_64RegClass;
                                   : &AMDGPU::VReg_64RegClass;
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    TII->legalizeOperands(*LoHalf);
    TII->legalizeOperands(*HiHalf);
    MI.eraseFromParent();
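    // The VALU expansion above prefers a native 64-bit add/sub when the
    // subtarget provides one; otherwise the pseudo becomes V_ADD_CO_U32 /
    // V_SUB_CO_U32 for the low half and V_ADDC_U32 / V_SUBB_U32 for the high
    // half, with the carry held in a wave-mask register.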
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;
      Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
      Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
      Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)

    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
    assert(WaveSize == 64 || WaveSize == 32);

    if (WaveSize == 64) {
      if (ST.hasScalarCompareEq64()) {
            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)

        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    MI.eraseFromParent();
  case AMDGPU::SI_INIT_M0: {
            TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
    MI.eraseFromParent();
  case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
            TII->get(AMDGPU::S_CMP_EQ_U32))
  case AMDGPU::GET_GROUPSTATICSIZE: {
        .add(MI.getOperand(0))
    MI.eraseFromParent();
  case AMDGPU::GET_SHADERCYCLESHILO: {
    using namespace AMDGPU::Hwreg;
    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .add(MI.getOperand(0))
    MI.eraseFromParent();
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    Register SrcCond = MI.getOperand(3).getReg();

    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getWaveMaskRegClass();
    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
                                   : &AMDGPU::VReg_64RegClass;
                                   : &AMDGPU::VReg_64RegClass;
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
    MI.eraseFromParent();
  case AMDGPU::SI_BR_UNDEF: {
            .add(MI.getOperand(0));
    MI.eraseFromParent();
  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {
  case AMDGPU::SI_CALL_ISEL: {
    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
    MI.eraseFromParent();
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {
    unsigned Opc = MI.getOpcode();
    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      NeedClampOperand = true;
    if (TII->isVOP3(*I)) {
    I.add(MI.getOperand(1)).add(MI.getOperand(2));
    if (NeedClampOperand)
    TII->legalizeOperands(*I);
    MI.eraseFromParent();
  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBBREV_U32_e32:
    TII->legalizeOperands(MI);
  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_BARRIER:
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
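  // S_SETREG_B32 writes (handled below) that only touch the FP rounding or
  // denormal fields of the MODE register, with a known immediate operand, can
  // be rewritten into the dedicated S_ROUND_MODE / S_DENORM_MODE
  // instructions; anything else keeps the side-effecting S_SETREG_B32_mode
  // form.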
  case AMDGPU::S_SETREG_B32: {
    const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
    const unsigned SetMask = WidthMask << Offset;

    unsigned SetDenormOp = 0;
    unsigned SetRoundOp = 0;
        SetRoundOp = AMDGPU::S_ROUND_MODE;
        SetDenormOp = AMDGPU::S_DENORM_MODE;
        SetRoundOp = AMDGPU::S_ROUND_MODE;
        SetDenormOp = AMDGPU::S_DENORM_MODE;

    if (SetRoundOp || SetDenormOp) {
      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
        unsigned ImmVal = Def->getOperand(1).getImm();
          MI.eraseFromParent();

    MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64:
    MI.setDesc(TII->get(AMDGPU::COPY));
  case AMDGPU::ENDPGM_TRAP: {
      MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
    MI.eraseFromParent();
  case AMDGPU::SIMULATED_TRAP: {
    TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
    MI.eraseFromParent();
  case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
    Register OriginalExec = Setup->getOperand(0).getReg();
    assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
    MI.getOperand(0).setReg(OriginalExec);
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;

  EVT VT = N->getValueType(0);
  if (VT == MVT::f16) {

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
         VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
         VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
         VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
         VT == MVT::v32bf16);
      DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
      DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
         VT == MVT::v32bf16);

          : std::pair(Op0, Op0);
      DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
      DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
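// The helpers above legalize wide vector operations that have no native
// encoding: the operands are split into low and high halves, the operation
// is applied to each half, and the two results are concatenated back
// together.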
  switch (Op.getOpcode()) {
    return LowerBRCOND(Op, DAG);
    return LowerRETURNADDR(Op, DAG);
    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    EVT VT = Op.getValueType();
      return lowerFSQRTF32(Op, DAG);
      return lowerFSQRTF64(Op, DAG);
    return LowerTrig(Op, DAG);
    return LowerSELECT(Op, DAG);
    return LowerFDIV(Op, DAG);
    return LowerFFREXP(Op, DAG);
    return LowerATOMIC_CMP_SWAP(Op, DAG);
    return LowerSTORE(Op, DAG);
    return LowerGlobalAddress(MFI, Op, DAG);
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
    return LowerINTRINSIC_VOID(Op, DAG);
    return lowerADDRSPACECAST(Op, DAG);
    return lowerINSERT_SUBVECTOR(Op, DAG);
    return lowerINSERT_VECTOR_ELT(Op, DAG);
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    return lowerVECTOR_SHUFFLE(Op, DAG);
    return lowerSCALAR_TO_VECTOR(Op, DAG);
    return lowerBUILD_VECTOR(Op, DAG);
    return lowerFP_ROUND(Op, DAG);
    return lowerTRAP(Op, DAG);
    return lowerDEBUGTRAP(Op, DAG);
    return lowerFMINNUM_FMAXNUM(Op, DAG);
    return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
    return lowerFMINIMUM_FMAXIMUM(Op, DAG);
    return lowerFLDEXP(Op, DAG);
    return lowerFCOPYSIGN(Op, DAG);
    return lowerMUL(Op, DAG);
    return lowerXMULO(Op, DAG);
    return lowerXMUL_LOHI(Op, DAG);
  EVT FittingLoadVT = LoadVT;

SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
                                              bool IsIntrinsic) const {
  EVT LoadVT = M->getValueType(0);
  EVT EquivLoadVT = LoadVT;
                 M->getMemoryVT(), M->getMemOperand());

  EVT LoadVT = M->getValueType(0);
  assert(M->getNumValues() == 2 || M->getNumValues() == 3);
  bool IsTFE = M->getNumValues() == 3;
    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
  return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
                             M->getMemOperand(), DAG);

  SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
                                        M->getMemOperand(), DAG);
  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);
  EVT CmpVT = LHS.getValueType();
  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
    unsigned PromoteOp =

  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);
  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {

  EVT VT = N->getValueType(0);
                     Src.getOperand(1), Src.getOperand(2));
    Exec = AMDGPU::EXEC_LO;
    Exec = AMDGPU::EXEC;

  EVT VT = N->getValueType(0);
  unsigned IID = N->getConstantOperandVal(0);
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
      ST->hasDPALU_DPP() &&

  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_update_dpp:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane64:

  if (SDNode *GL = N->getGluedNode()) {
    GL = GL->getOperand(0).getNode();

  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_mov_dpp8 ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = N->getOperand(2);
    if (IID == Intrinsic::amdgcn_writelane ||
        IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
      Src2 = N->getOperand(3);

  if (ValSize == SplitSize) {
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    if (IID == Intrinsic::amdgcn_writelane) {
    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
    return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;

  if (ValSize % SplitSize != 0)

    EVT VT = N->getValueType(0);
    unsigned NumOperands = N->getNumOperands();
    SDNode *GL = N->getGluedNode();
    for (unsigned i = 0; i != NE; ++i) {
      for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
        SDValue Operand = N->getOperand(j);

    if (SplitSize == 32) {
      return unrollLaneOp(LaneOp.getNode());

    unsigned SubVecNumElt =
    SDValue Src0SubVec, Src1SubVec, Src2SubVec;
    for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
      if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
      if (IID == Intrinsic::amdgcn_writelane)
          IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
              ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
              : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
      EltIdx += SubVecNumElt;

  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
  if (IID == Intrinsic::amdgcn_writelane)
  SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
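  // The lowering above handles lane and DPP intrinsics that only operate on
  // 32-bit pieces (or 64-bit pieces when the subtarget has DPALU DPP): wider
  // values are bitcast to a vector of SplitSize-bit elements, the operation
  // is emitted piecewise, and the pieces are reassembled into the original
  // type.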
6810  switch (N->getOpcode()) {
6822    unsigned IID = N->getConstantOperandVal(0);
6824    case Intrinsic::amdgcn_make_buffer_rsrc:
6825      Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6827 case Intrinsic::amdgcn_cvt_pkrtz: {
6836 case Intrinsic::amdgcn_cvt_pknorm_i16:
6837 case Intrinsic::amdgcn_cvt_pknorm_u16:
6838 case Intrinsic::amdgcn_cvt_pk_i16:
6839 case Intrinsic::amdgcn_cvt_pk_u16: {
6845 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6847 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6849 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6854  EVT VT = N->getValueType(0);
6863  case Intrinsic::amdgcn_s_buffer_load: {
6875    EVT VT = Op.getValueType();
6876    assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6888    if (!Offset->isDivergent()) {
6907      LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6912 case Intrinsic::amdgcn_dead: {
6913    for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
6924    for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6925      Results.push_back(Res.getOperand(I));
6929    Results.push_back(Res.getValue(1));
6938  EVT VT = N->getValueType(0);
6943  EVT SelectVT = NewVT;
6944  if (NewVT.bitsLT(MVT::i32)) {
6947 SelectVT = MVT::i32;
6953 if (NewVT != SelectVT)
6959  if (N->getValueType(0) != MVT::v2f16)
6971  if (N->getValueType(0) != MVT::v2f16)
6983  if (N->getValueType(0) != MVT::f16)
6998    if (U.get() != Value)
7001 if (U.getUser()->getOpcode() == Opcode)
7007 unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7009  switch (Intr->getConstantOperandVal(1)) {
7010 case Intrinsic::amdgcn_if:
7012 case Intrinsic::amdgcn_else:
7014 case Intrinsic::amdgcn_loop:
7016 case Intrinsic::amdgcn_end_cf:
7063  SDNode *Intr = BRCOND.getOperand(1).getNode();
7076  assert(BR && "brcond missing unconditional branch user");
7077  Target = BR->getOperand(1);
7080  unsigned CFNode = isCFIntrinsic(Intr);
7099    Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7123  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7136 Intr->getOperand(0));
7142  MVT VT = Op.getSimpleValueType();
7145  if (Op.getConstantOperandVal(0) != 0)
7151  if (Info->isEntryFunction())
7168  return Op.getValueType().bitsLE(VT)
7176  EVT DstVT = Op.getValueType();
7183  unsigned Opc = Op.getOpcode();
7195  EVT SrcVT = Src.getValueType();
7196  EVT DstVT = Op.getValueType();
7202    return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7209  if (DstVT == MVT::f16) {
7219  if (Op->getFlags().hasApproximateFuncs()) {
7230 "custom lower FP_ROUND for f16 or bf16");
7244  EVT VT = Op.getValueType();
7247  bool IsIEEEMode = Info->getMode().IEEE;
7256  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7263 SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7265  EVT VT = Op.getValueType();
7268  bool IsIEEEMode = Info->getMode().IEEE;
7273  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7281  EVT VT = Op.getValueType();
7288         "should not need to widen f16 minimum/maximum to v2f16");
7302      DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7310  EVT VT = Op.getValueType();
7314  EVT ExpVT = Exp.getValueType();
7315  if (ExpVT == MVT::i16)
7336                     {Op.getOperand(0), Op.getOperand(1), TruncExp});
7343  switch (Op->getOpcode()) {
7373                                              DAGCombinerInfo &DCI) const {
7374  const unsigned Opc = Op.getOpcode();
7382                    : Op->getOperand(0).getValueType();
7385  if (DCI.isBeforeLegalizeOps() ||
7389  auto &DAG = DCI.DAG;
7395    LHS = Op->getOperand(1);
7396    RHS = Op->getOperand(2);
7398    LHS = Op->getOperand(0);
7399    RHS = Op->getOperand(1);
7414  return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
7438 if (MagVT == SignVT)
7455  EVT VT = Op.getValueType();
7461  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
7488  if (Op->isDivergent())
7501  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7503        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7506  if (Op0SignBits >= 33 && Op1SignBits >= 33)
7508        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
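// --- Illustrative sketch (editorial addition, not part of the LLVM source). ---
// Rationale for the checks above: if both 64-bit operands have at least 32
// leading zero bits (i.e. they really fit in 32 bits), the full product fits
// in 64 bits and a 32x32->64 unsigned multiply (S_MUL_U64_U32) is exact; the
// signed pseudo uses the analogous sign-bit count. A host-side check of the
// same condition:
static bool fitsUnsignedMul32(uint64_t A, uint64_t B) {
  // __builtin_clzll is undefined for 0, so treat 0 as 64 leading zeros.
  auto clz64 = [](uint64_t V) { return V ? __builtin_clzll(V) : 64; };
  return clz64(A) >= 32 && clz64(B) >= 32; // both operands are really u32
}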
7514  EVT VT = Op.getValueType();
7521    const APInt &C = RHSC->getAPIntValue();
7523    if (C.isPowerOf2()) {
7525      bool UseArithShift = isSigned && !C.isMinSignedValue();
7552  if (Op->isDivergent()) {
7569    return lowerTrapEndpgm(Op, DAG);
7572               : lowerTrapHsaQueuePtr(Op, DAG);
7582 SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
7584                                              ImplicitParameter Param) const {
7604      loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
7610  if (UserSGPR == AMDGPU::NoRegister) {
7653                                   "debugtrap handler not supported",
7664 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
7668                                 ? AMDGPU::SRC_SHARED_BASE
7669                                 : AMDGPU::SRC_PRIVATE_BASE;
7670  assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
7672         "Cannot use src_private_base with globally addressable scratch!");
7695        {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7704    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7710  if (UserSGPR == AMDGPU::NoRegister) {
7740  if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
7743  if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7744    return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7758 unsigned DestAS, SrcAS;
7760  bool IsNonNull = false;
7761  if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7762    SrcAS = ASC->getSrcAddressSpace();
7763    Src = ASC->getOperand(0);
7764    DestAS = ASC->getDestAddressSpace();
7767                Op.getConstantOperandVal(0) ==
7768                    Intrinsic::amdgcn_addrspacecast_nonnull);
7769    Src = Op->getOperand(1);
7770    SrcAS = Op->getConstantOperandVal(2);
7771    DestAS = Op->getConstantOperandVal(3);
7789              AMDGPU::S_MOV_B32, SL, MVT::i32,
7790              DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
7798    unsigned NullVal = TM.getNullPointerValue(DestAS);
7836              AMDGPU::S_MOV_B64, SL, MVT::i64,
7837              DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
7839      CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
7841  SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7849    unsigned NullVal = TM.getNullPointerValue(SrcAS);
7861 Op.getValueType() == MVT::i64) {
7870 Src.getValueType() == MVT::i64)
7890  EVT InsVT = Ins.getValueType();
7893  unsigned IdxVal = Idx->getAsZExtVal();
7898  assert(InsNumElts % 2 == 0 && "expect legal vector types");
7903    EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7905                                       MVT::i32, InsNumElts / 2);
7910    for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7912      if (InsNumElts == 2) {
7925  for (unsigned I = 0; I != InsNumElts; ++I) {
7947  auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
7948  if (NumElts == 4 && EltSize == 16 && KIdx) {
7959    unsigned Idx = KIdx->getZExtValue();
7960    bool InsertLo = Idx < 2;
7977  if (isa<ConstantSDNode>(Idx))
7983  assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7989 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
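// --- Illustrative sketch (editorial addition, not part of the LLVM source). ---
// For small vectors (<= 64 bits) with a variable index, the element is
// inserted with plain bit arithmetic on the scalar image of the vector:
// clear the destination field, then OR in the shifted element. Assuming the
// element occupies EltSize bits at bit offset Idx * EltSize:
static uint64_t insertEltBitwise(uint64_t Vec, uint64_t Elt, unsigned Idx,
                                 unsigned EltSize) {
  uint64_t EltMask = (EltSize == 64) ? ~0ull : ((1ull << EltSize) - 1);
  unsigned Shift = Idx * EltSize;
  return (Vec & ~(EltMask << Shift)) | ((Elt & EltMask) << Shift);
}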
8018  EVT ResultVT = Op.getValueType();
8031  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8034  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8038    if (VecSize == 128) {
8046    } else if (VecSize == 256) {
8049      for (unsigned P = 0; P < 4; ++P) {
8055                                Parts[0], Parts[1]));
8057                                Parts[2], Parts[3]));
8063      for (unsigned P = 0; P < 8; ++P) {
8070                                Parts[0], Parts[1], Parts[2], Parts[3]));
8073                                Parts[4], Parts[5], Parts[6], Parts[7]));
8076    EVT IdxVT = Idx.getValueType();
8093  Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8108 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8118 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8123 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8124 !(Mask[Elt + 1] & 1);
8130  EVT ResultVT = Op.getValueType();
8133  const int NewSrcNumElts = 2;
8135  int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8151  const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8173    if (ShouldUseConsecutiveExtract &&
8176      int VecIdx = Idx < SrcNumElts ? 0 : 1;
8177      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8189 if (Idx0 >= SrcNumElts) {
8194 if (Idx1 >= SrcNumElts) {
8199 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8200 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8208 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8209 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8214 if (SubVec0 != SubVec1) {
8215 NewMaskIdx1 += NewSrcNumElts;
8222 {NewMaskIdx0, NewMaskIdx1});
8227 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8228 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8229 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8230 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
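// --- Illustrative sketch (editorial addition, not part of the LLVM source). ---
// Shuffle mask entries index into the concatenation of the two sources, so an
// entry Idx in [0, 2*SrcNumElts) is resolved to (which source, which element)
// exactly as in the VecIdx/EltIdx computations above:
static void resolveShuffleIndex(int Idx, int SrcNumElts, int &VecIdx,
                                int &EltIdx) {
  VecIdx = Idx < SrcNumElts ? 0 : 1;                   // first or second source
  EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;  // element within it
}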
8249  EVT ResultVT = Op.getValueType();
8265  EVT VT = Op.getValueType();
8267  if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8302    for (unsigned P = 0; P < NumParts; ++P) {
8304          PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8337  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8383  EVT PtrVT = Op.getValueType();
8399  assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8477 SDValue Param = lowerKernargMemParameter(
8488 "non-hsa intrinsic with hsa target",
DL.getDebugLoc()));
8496 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
8504 unsigned NumElts = Elts.
size();
8506 if (NumElts <= 12) {
8515 for (
unsigned i = 0; i < Elts.
size(); ++i) {
8521 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
8531 EVT SrcVT = Src.getValueType();
8552                              bool Unpacked, bool IsD16, int DMaskPop,
8553                              int NumVDataDwords, bool IsAtomicPacked16Bit,
8557  EVT ReqRetVT = ResultTypes[0];
8559  int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
8560                          ? (ReqRetNumElts + 1) / 2
8563  int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
8574  if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
8585  if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
8587                                          NumDataDwords - MaskPopDwords);
8592  EVT LegalReqRetVT = ReqRetVT;
8594    if (!Data.getValueType().isInteger())
8596                         Data.getValueType().changeTypeToInteger(), Data);
8617  if (Result->getNumValues() == 1)
8624                          SDValue *LWE, bool &IsTexFail) {
8625  auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
8644                                     unsigned DimIdx, unsigned EndIdx,
8645                                     unsigned NumGradients) {
8647  for (unsigned I = DimIdx; I < EndIdx; I++) {
8655    if (((I + 1) >= EndIdx) ||
8656        ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
8657                                         I == DimIdx + NumGradients - 1))) {
8658      if (Addr.getValueType() != MVT::i16)
8679  unsigned IntrOpcode = Intr->BaseOpcode;
8690  int NumVDataDwords = 0;
8691  bool AdjustRetType = false;
8692  bool IsAtomicPacked16Bit = false;
8695  const unsigned ArgOffset = WithChain ? 2 : 1;
8698  unsigned DMaskLanes = 0;
8700  if (BaseOpcode->Atomic) {
8701    VData = Op.getOperand(2);
8703    IsAtomicPacked16Bit =
8704        (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8705         Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8708    if (BaseOpcode->AtomicX2) {
8715      ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8716      DMask = Is64Bit ? 0xf : 0x3;
8717      NumVDataDwords = Is64Bit ? 4 : 2;
8719      DMask = Is64Bit ? 0x3 : 0x1;
8720      NumVDataDwords = Is64Bit ? 2 : 1;
8723    DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
8726    if (BaseOpcode->Store) {
8727      VData = Op.getOperand(2);
8735        VData = handleD16VData(VData, DAG, true);
8739    } else if (!BaseOpcode->NoReturn) {
8752          (!LoadVT.isVector() && DMaskLanes > 1))
8760        NumVDataDwords = (DMaskLanes + 1) / 2;
8762        NumVDataDwords = DMaskLanes;
8764      AdjustRetType = true;
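// --- Illustrative sketch (editorial addition, not part of the LLVM source). ---
// The number of result dwords for an image load is derived from the number of
// enabled DMask lanes; with packed D16 data two 16-bit channels share a dword,
// which is the (DMaskLanes + 1) / 2 rounding above:
static int numVDataDwords(unsigned DMaskLanes, bool IsD16Packed) {
  return IsD16Packed ? (static_cast<int>(DMaskLanes) + 1) / 2
                     : static_cast<int>(DMaskLanes);
}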
8768  unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
8773      Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8775  MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8776  IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8778  VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8780 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8781 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8784  for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8785    if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8786      assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8791                     {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
8795             "Bias needs to be converted to 16 bit in A16 mode");
8800  if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8804        dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8805                  "require 16 bit args for both gradients and addresses");
8810    if (!ST->hasA16()) {
8811      LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8812                           "support 16 bit addresses\n");
8822  if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8826    IntrOpcode = G16MappingInfo->G16;
8834                              ArgOffset + Intr->GradientStart,
8835                              ArgOffset + Intr->CoordStart, Intr->NumGradients);
8837    for (unsigned I = ArgOffset + Intr->GradientStart;
8838         I < ArgOffset + Intr->CoordStart; I++)
8845                              ArgOffset + Intr->CoordStart, VAddrEnd,
8849    for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8867  const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8868  const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8869  const bool UseNSA = ST->hasNSAEncoding() &&
8870                      VAddrs.size() >= ST->getNSAThreshold(MF) &&
8871                      (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8872  const bool UsePartialNSA =
8873      UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
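// --- Illustrative sketch (editorial addition, not part of the LLVM source). ---
// The NSA (non-sequential address) decision above, restated as a standalone
// predicate over the same inputs:
static bool shouldUseNSA(bool HasNSA, bool HasPartialNSA, size_t NumVAddrs,
                         size_t NSAThreshold, size_t NSAMaxSize,
                         bool &UsePartialNSA) {
  bool UseNSA = HasNSA && NumVAddrs >= NSAThreshold &&
                (NumVAddrs <= NSAMaxSize || HasPartialNSA);
  // Partial NSA: only the addresses beyond the encoding limit get packed back
  // into a contiguous register tuple.
  UsePartialNSA = UseNSA && HasPartialNSA && NumVAddrs > NSAMaxSize;
  return UseNSA;
}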
8876 if (UsePartialNSA) {
8878 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8879  } else if (!UseNSA) {
8886  if (!BaseOpcode->Sampler) {
8890        Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8892 Unorm = UnormConst ? True : False;
8897  SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8898  bool IsTexFail = false;
8899  if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8910      NumVDataDwords += 1;
8911      AdjustRetType = true;
8916 if (AdjustRetType) {
8919 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8922      if (isa<MemSDNode>(Op))
8928                           MVT::i32, NumVDataDwords)
8931    ResultTypes[0] = NewVT;
8932    if (ResultTypes.size() == 3) {
8936      ResultTypes.erase(&ResultTypes[1]);
8940  unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8941  if (BaseOpcode->Atomic)
8948  if (BaseOpcode->Store || BaseOpcode->Atomic)
8950  if (UsePartialNSA) {
8959  if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
8962  if (BaseOpcode->Sampler) {
8971  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8975      ST->hasFeature(AMDGPU::FeatureR128A16)
8986          "TFE is not supported on this GPU", DL.getDebugLoc()));
8989  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8993  if (BaseOpcode->HasD16)
8995  if (isa<MemSDNode>(Op))
8998 int NumVAddrDwords =
9004 NumVDataDwords, NumVAddrDwords);
9005  } else if (IsGFX11Plus) {
9007                                   UseNSA ? AMDGPU::MIMGEncGfx11NSA
9008                                          : AMDGPU::MIMGEncGfx11Default,
9009                                   NumVDataDwords, NumVAddrDwords);
9010  } else if (IsGFX10Plus) {
9012                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
9013                                          : AMDGPU::MIMGEncGfx10Default,
9014                                   NumVDataDwords, NumVAddrDwords);
9018                                   NumVDataDwords, NumVAddrDwords);
9022        "requested image instruction is not supported on this GPU",
9027    for (EVT VT : OrigResultTypes) {
9028      if (VT == MVT::Other)
9029        RetValues[Idx++] = Op.getOperand(0);
9040                                   NumVDataDwords, NumVAddrDwords);
9043                                   NumVDataDwords, NumVAddrDwords);
9049  if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9054  if (BaseOpcode->AtomicX2) {
9059  if (BaseOpcode->NoReturn)
9063                           NumVDataDwords, IsAtomicPacked16Bit, DL);
9081  if (!Offset->isDivergent()) {
9126      return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9130  unsigned NumLoads = 1;
9136  if (NumElts == 8 || NumElts == 16) {
9137    NumLoads = NumElts / 4;
9145  setBufferOffsets(Offset, DAG, &Ops[3],
9146                   NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9149  for (unsigned i = 0; i < NumLoads; ++i) {
9155  if (NumElts == 8 || NumElts == 16)
9207  EVT VT = Op.getValueType();
9209  unsigned IntrinsicID = Op.getConstantOperandVal(0);
9213 switch (IntrinsicID) {
9214 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9217    return getPreloadedValue(DAG, *MFI, VT,
9220  case Intrinsic::amdgcn_dispatch_ptr:
9221  case Intrinsic::amdgcn_queue_ptr: {
9224          MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9229    auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9232    return getPreloadedValue(DAG, *MFI, VT, RegID);
9234  case Intrinsic::amdgcn_implicitarg_ptr: {
9236      return getImplicitArgPtr(DAG, DL);
9237    return getPreloadedValue(DAG, *MFI, VT,
9240  case Intrinsic::amdgcn_kernarg_segment_ptr: {
9246    return getPreloadedValue(DAG, *MFI, VT,
9252 case Intrinsic::amdgcn_rcp:
9254 case Intrinsic::amdgcn_rsq:
9256 case Intrinsic::amdgcn_rsq_legacy:
9260 case Intrinsic::amdgcn_rcp_legacy:
9264 case Intrinsic::amdgcn_rsq_clamp: {
9278  case Intrinsic::r600_read_ngroups_x:
9282    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9285  case Intrinsic::r600_read_ngroups_y:
9289    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9292  case Intrinsic::r600_read_ngroups_z:
9296    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9299  case Intrinsic::r600_read_local_size_x:
9303    return lowerImplicitZextParam(DAG, Op, MVT::i16,
9305  case Intrinsic::r600_read_local_size_y:
9309    return lowerImplicitZextParam(DAG, Op, MVT::i16,
9311  case Intrinsic::r600_read_local_size_z:
9315    return lowerImplicitZextParam(DAG, Op, MVT::i16,
9317  case Intrinsic::amdgcn_workgroup_id_x:
9318    return getPreloadedValue(DAG, *MFI, VT,
9320  case Intrinsic::amdgcn_workgroup_id_y:
9321    return getPreloadedValue(DAG, *MFI, VT,
9323  case Intrinsic::amdgcn_workgroup_id_z:
9324    return getPreloadedValue(DAG, *MFI, VT,
9326  case Intrinsic::amdgcn_wave_id:
9327    return lowerWaveID(DAG, Op);
9328  case Intrinsic::amdgcn_lds_kernel_id: {
9330      return getLDSKernelId(DAG, DL);
9331    return getPreloadedValue(DAG, *MFI, VT,
9334  case Intrinsic::amdgcn_workitem_id_x:
9335    return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
9336  case Intrinsic::amdgcn_workitem_id_y:
9337    return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
9338  case Intrinsic::amdgcn_workitem_id_z:
9339    return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
9340 case Intrinsic::amdgcn_wavefrontsize:
9343 case Intrinsic::amdgcn_s_buffer_load: {
9344    unsigned CPol = Op.getConstantOperandVal(3);
9351    return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
9352                        Op.getOperand(3), DAG);
9354  case Intrinsic::amdgcn_fdiv_fast:
9355    return lowerFDIV_FAST(Op, DAG);
9356 case Intrinsic::amdgcn_sin:
9359 case Intrinsic::amdgcn_cos:
9362 case Intrinsic::amdgcn_mul_u24:
9365 case Intrinsic::amdgcn_mul_i24:
9369 case Intrinsic::amdgcn_log_clamp: {
9375 case Intrinsic::amdgcn_fract:
9378 case Intrinsic::amdgcn_class:
9381  case Intrinsic::amdgcn_div_fmas:
9383                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9385  case Intrinsic::amdgcn_div_fixup:
9387                       Op.getOperand(2), Op.getOperand(3));
9389  case Intrinsic::amdgcn_div_scale: {
9402    SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
9405                       Denominator, Numerator);
9407  case Intrinsic::amdgcn_icmp: {
9409    if (Op.getOperand(1).getValueType() == MVT::i1 &&
9410        Op.getConstantOperandVal(2) == 0 &&
9415  case Intrinsic::amdgcn_fcmp: {
9418  case Intrinsic::amdgcn_ballot:
9420  case Intrinsic::amdgcn_fmed3:
9422                       Op.getOperand(2), Op.getOperand(3));
9423  case Intrinsic::amdgcn_fdot2:
9425                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9426  case Intrinsic::amdgcn_fmul_legacy:
9429  case Intrinsic::amdgcn_sffbh:
9431  case Intrinsic::amdgcn_sbfe:
9433                       Op.getOperand(2), Op.getOperand(3));
9434  case Intrinsic::amdgcn_ubfe:
9436                       Op.getOperand(2), Op.getOperand(3));
9437 case Intrinsic::amdgcn_cvt_pkrtz:
9438 case Intrinsic::amdgcn_cvt_pknorm_i16:
9439 case Intrinsic::amdgcn_cvt_pknorm_u16:
9440 case Intrinsic::amdgcn_cvt_pk_i16:
9441 case Intrinsic::amdgcn_cvt_pk_u16: {
9443    EVT VT = Op.getValueType();
9446    if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
9448    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
9450    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
9452    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
9458      return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
9461        DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
9464  case Intrinsic::amdgcn_fmad_ftz:
9466                       Op.getOperand(2), Op.getOperand(3));
9468  case Intrinsic::amdgcn_if_break:
9470                                      Op->getOperand(1), Op->getOperand(2)),
9473  case Intrinsic::amdgcn_groupstaticsize: {
9485  case Intrinsic::amdgcn_is_shared:
9486  case Intrinsic::amdgcn_is_private: {
9493    unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
9500                  AMDGPU::S_MOV_B32, DL, MVT::i32,
9501                  DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
9510    SDValue Aperture = getSegmentAperture(AS, SL, DAG);
9513  case Intrinsic::amdgcn_perm:
9515                       Op.getOperand(2), Op.getOperand(3));
9516 case Intrinsic::amdgcn_reloc_constant: {
9520 auto *RelocSymbol = cast<GlobalVariable>(
9526 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
9527 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
9528 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
9529 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
9530 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
9531 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
9532 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
9533 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
9534    if (Op.getOperand(4).getValueType() == MVT::i32)
9540                       Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9541                       Op.getOperand(3), IndexKeyi32);
9543 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
9544 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
9545 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
9546 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
9547 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
9548 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
9549 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
9550 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
9551    if (Op.getOperand(4).getValueType() == MVT::i64)
9557 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9558 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
9561 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
9562 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
9563 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
9564 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
9565 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
9566 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
9567 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
9570    if (Op.getOperand(6).getValueType() == IndexKeyTy)
9576 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9577 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
9578 IndexKey, Op.getOperand(7),
9581 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
9582 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
9583 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
9584    if (Op.getOperand(6).getValueType() == MVT::i32)
9590 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9591 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
9592 IndexKeyi32, Op.getOperand(7)});
9594  case Intrinsic::amdgcn_addrspacecast_nonnull:
9595    return lowerADDRSPACECAST(Op, DAG);
9596 case Intrinsic::amdgcn_readlane:
9597 case Intrinsic::amdgcn_readfirstlane:
9598 case Intrinsic::amdgcn_writelane:
9599 case Intrinsic::amdgcn_permlane16:
9600 case Intrinsic::amdgcn_permlanex16:
9601 case Intrinsic::amdgcn_permlane64:
9602 case Intrinsic::amdgcn_set_inactive:
9603 case Intrinsic::amdgcn_set_inactive_chain_arg:
9604 case Intrinsic::amdgcn_mov_dpp8:
9605 case Intrinsic::amdgcn_update_dpp:
9607 case Intrinsic::amdgcn_dead: {
9609    for (const EVT ValTy : Op.getNode()->values())
9616      return lowerImage(Op, ImageDimIntr, DAG, false);
9627    return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
9633                                                  unsigned NewOpcode) const {
9637  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9638  auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9652  auto *M = cast<MemSDNode>(Op);
9656                                 M->getMemOperand());
9661                                                     unsigned NewOpcode) const {
9665  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9666  auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9680  auto *M = cast<MemSDNode>(Op);
9684                                 M->getMemOperand());
9689  unsigned IntrID = Op.getConstantOperandVal(1);
9693  case Intrinsic::amdgcn_ds_ordered_add:
9694  case Intrinsic::amdgcn_ds_ordered_swap: {
9699    unsigned IndexOperand = M->getConstantOperandVal(7);
9700    unsigned WaveRelease = M->getConstantOperandVal(8);
9701    unsigned WaveDone = M->getConstantOperandVal(9);
9703    unsigned OrderedCountIndex = IndexOperand & 0x3f;
9704    IndexOperand &= ~0x3f;
9705    unsigned CountDw = 0;
9708      CountDw = (IndexOperand >> 24) & 0xf;
9709      IndexOperand &= ~(0xf << 24);
9711      if (CountDw < 1 || CountDw > 4) {
9714            Fn, "ds_ordered_count: dword count must be between 1 and 4",
9723          Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
9726    if (WaveDone && !WaveRelease) {
9730          Fn, "ds_ordered_count: wave_done requires wave_release",
9734    unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
9735    unsigned ShaderType =
9737    unsigned Offset0 = OrderedCountIndex << 2;
9738    unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
9741      Offset1 |= (CountDw - 1) << 6;
9744      Offset1 |= ShaderType << 2;
9746    unsigned Offset = Offset0 | (Offset1 << 8);
9753                                   M->getVTList(), Ops, M->getMemoryVT(),
9754                                   M->getMemOperand());
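// --- Illustrative sketch (editorial addition, not part of the LLVM source). ---
// The immediate offset of ds_ordered_count packs several fields; this mirrors
// the Offset0/Offset1 computation above (the dword-count and shader-type
// fields are shown unconditionally here, whereas the code above applies them
// only on the subtargets that use them):
static unsigned packDsOrderedOffset(unsigned OrderedCountIndex,
                                    unsigned WaveRelease, unsigned WaveDone,
                                    unsigned Instruction, unsigned ShaderType,
                                    unsigned CountDw) {
  unsigned Offset0 = OrderedCountIndex << 2;
  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
  Offset1 |= (CountDw - 1) << 6; // dword count, biased by one
  Offset1 |= ShaderType << 2;    // shader-type field on older encodings
  return Offset0 | (Offset1 << 8);
}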
9756 case Intrinsic::amdgcn_raw_buffer_load:
9757 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9758 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9759 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9760 case Intrinsic::amdgcn_raw_buffer_load_format:
9761 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9762 const bool IsFormat =
9763 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9764 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9766    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9767    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9780    auto *M = cast<MemSDNode>(Op);
9781    return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9783 case Intrinsic::amdgcn_struct_buffer_load:
9784 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9785 case Intrinsic::amdgcn_struct_buffer_load_format:
9786 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9787 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9788 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9789 const bool IsFormat =
9790 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9791 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9793    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9794    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9807    return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
9809  case Intrinsic::amdgcn_raw_tbuffer_load:
9810  case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9812    EVT LoadVT = Op.getValueType();
9813    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9814    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9833                                   Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9836  case Intrinsic::amdgcn_struct_tbuffer_load:
9837  case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9839    EVT LoadVT = Op.getValueType();
9840    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9841    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9860                                   Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9863 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9864 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9866 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9867 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9868 return lowerStructBufferAtomicIntrin(
Op, DAG,
9870 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9871 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9873 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9874 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9875 return lowerStructBufferAtomicIntrin(
Op, DAG,
9877 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9878 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9880 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9881 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9882 return lowerStructBufferAtomicIntrin(
Op, DAG,
9884 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9885 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9887 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9888 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9890 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9891 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9893 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9894 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9896 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9897 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9899 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9900 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9902 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9903 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9905 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9906 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9908 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9909 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9911 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9914 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9915 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9917 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9918 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9920 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9921 return lowerRawBufferAtomicIntrin(
Op, DAG,
9923 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9924 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9925 return lowerStructBufferAtomicIntrin(
Op, DAG,
9927 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9928 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9930 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9931 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9933 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9934 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9935 return lowerStructBufferAtomicIntrin(
Op, DAG,
9937 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9938 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9939 return lowerStructBufferAtomicIntrin(
Op, DAG,
9941 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9942 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9943 return lowerStructBufferAtomicIntrin(
Op, DAG,
9945 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9946 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9947 return lowerStructBufferAtomicIntrin(
Op, DAG,
9949 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9950 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9952 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9953 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9955 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9956 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9958 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9959 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9961 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9962 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9964 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9965 return lowerStructBufferAtomicIntrin(
Op, DAG,
9968 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9969 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9970    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9971    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9985    EVT VT = Op.getValueType();
9986    auto *M = cast<MemSDNode>(Op);
9989                                   Op->getVTList(), Ops, VT,
9990                                   M->getMemOperand());
9992 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9993 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9994    SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9995    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10009   EVT VT = Op.getValueType();
10010   auto *M = cast<MemSDNode>(Op);
10013 Op->getVTList(), Ops, VT,
10014 M->getMemOperand());
10016 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10017 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10019   SDValue NodePtr = M->getOperand(2);
10020   SDValue RayExtent = M->getOperand(3);
10021   SDValue InstanceMask = M->getOperand(4);
10022   SDValue RayOrigin = M->getOperand(5);
10023   SDValue RayDir = M->getOperand(6);
10025   SDValue TDescr = M->getOperand(8);
10035 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10036 const unsigned NumVDataDwords = 10;
10037 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10039 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10040 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10041 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10048 {DAG.getBitcast(MVT::i32, RayExtent),
10049 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10061 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10063   SDValue NodePtr = M->getOperand(2);
10064   SDValue RayExtent = M->getOperand(3);
10065   SDValue RayOrigin = M->getOperand(4);
10066   SDValue RayDir = M->getOperand(5);
10067   SDValue RayInvDir = M->getOperand(6);
10068   SDValue TDescr = M->getOperand(7);
10085 const unsigned NumVDataDwords = 4;
10086 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10087 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
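// --- Illustrative sketch (editorial addition, not part of the LLVM source). ---
// Address dword counts for image_bvh_intersect_ray, as selected above: a
// 64-bit node pointer costs one extra dword, and with a16 the ray
// origin/dir/inv_dir components pack into roughly half as many dwords:
static unsigned bvhIntersectRayVAddrDwords(bool Is64, bool IsA16) {
  return IsA16 ? (Is64 ? 9u : 8u) : (Is64 ? 12u : 11u);
}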
10091 const unsigned BaseOpcodes[2][2] = {
10092 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10093 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10094 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10098 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10099 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10100 : AMDGPU::MIMGEncGfx10NSA,
10101 NumVDataDwords, NumVAddrDwords);
10105 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10106 : AMDGPU::MIMGEncGfx10Default,
10107 NumVDataDwords, NumVAddrDwords);
10113   auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10116     if (Lanes[0].getValueSizeInBits() == 32) {
10117       for (unsigned I = 0; I < 3; ++I)
10136   if (UseNSA && IsGFX11Plus) {
10144     for (unsigned I = 0; I < 3; ++I) {
10147                                  {DirLanes[I], InvDirLanes[I]})));
10162     packLanes(RayOrigin, true);
10163     packLanes(RayDir, true);
10164     packLanes(RayInvDir, false);
10169 if (NumVAddrDwords > 12) {
10189 case Intrinsic::amdgcn_global_atomic_fmin_num:
10190 case Intrinsic::amdgcn_global_atomic_fmax_num:
10191 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10192 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10199 unsigned Opcode = 0;
10201 case Intrinsic::amdgcn_global_atomic_fmin_num:
10202 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10206 case Intrinsic::amdgcn_global_atomic_fmax_num:
10207 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10215 Ops,
M->getMemOperand());
10217 case Intrinsic::amdgcn_s_get_barrier_state:
10218 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10223   if (isa<ConstantSDNode>(Op->getOperand(2))) {
10224     uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
10225     if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10226       BarID = (BarID >> 4) & 0x3F;
10227     Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10232     Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10233     if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10253     return lowerImage(Op, ImageDimIntr, DAG, true);
10261 SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10271   bool IsTFE = VTList.NumVTs == 3;
10274     unsigned NumOpDWords = NumValueDWords + 1;
10279     SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10280                                      OpDWordsVT, OpDWordsMMO, DAG);
10285         NumValueDWords == 1
10295       (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10301                                   WidenedMemVT, WidenedMMO);
10311                                          bool ImageStore) const {
10346     for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
10352     if ((NumElements % 2) == 1) {
10354       unsigned I = Elts.size() / 2;
10370   if (NumElements == 3) {
10391   unsigned IntrinsicID = Op.getConstantOperandVal(1);
10394   switch (IntrinsicID) {
10395   case Intrinsic::amdgcn_exp_compr: {
10399               "intrinsic not supported on subtarget", DL.getDebugLoc()));
10421     unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10425   case Intrinsic::amdgcn_struct_tbuffer_store:
10426   case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10430       VData = handleD16VData(VData, DAG);
10431     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10432     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10450                                    M->getMemoryVT(), M->getMemOperand());
10453   case Intrinsic::amdgcn_raw_tbuffer_store:
10454   case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
10458       VData = handleD16VData(VData, DAG);
10459     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10460     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10478                                    M->getMemoryVT(), M->getMemOperand());
10481   case Intrinsic::amdgcn_raw_buffer_store:
10482   case Intrinsic::amdgcn_raw_ptr_buffer_store:
10483   case Intrinsic::amdgcn_raw_buffer_store_format:
10484   case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
10485     const bool IsFormat =
10486         IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
10487         IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
10494       VData = handleD16VData(VData, DAG);
10504     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10505     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10525       return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
10528                                    M->getMemoryVT(), M->getMemOperand());
10531   case Intrinsic::amdgcn_struct_buffer_store:
10532   case Intrinsic::amdgcn_struct_ptr_buffer_store:
10533   case Intrinsic::amdgcn_struct_buffer_store_format:
10534   case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
10535     const bool IsFormat =
10536         IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
10537         IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
10545       VData = handleD16VData(VData, DAG);
10555     auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10556     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10577       return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
10580                                    M->getMemoryVT(), M->getMemOperand());
10582 case Intrinsic::amdgcn_raw_buffer_load_lds:
10583 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
10584 case Intrinsic::amdgcn_struct_buffer_load_lds:
10585 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
10590 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
10591 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
10592 unsigned OpOffset = HasVIndex ? 1 : 0;
10593     SDValue VOffset = Op.getOperand(5 + OpOffset);
10595     unsigned Size = Op->getConstantOperandVal(4);
10601 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
10602 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
10603 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
10604 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
10607 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
10608 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
10609 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
10610 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
10613 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
10614 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
10615 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
10616 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
10621 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
10622 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
10623 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
10624 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
10629 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
10630 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
10631 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
10632 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
10640 if (HasVIndex && HasVOffset)
10644 else if (HasVIndex)
10646 else if (HasVOffset)
10649     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10654     unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
10666     auto *M = cast<MemSDNode>(Op);
10696   case Intrinsic::amdgcn_load_to_lds:
10697   case Intrinsic::amdgcn_global_load_lds: {
10702     unsigned Size = Op->getConstantOperandVal(4);
10707       Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
10710       Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
10713       Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
10718       Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
10723       Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10739     if (LHS->isDivergent())
10743         RHS.getOperand(0).getValueType() == MVT::i32) {
10746       VOffset = RHS.getOperand(0);
10751     if (!Addr->isDivergent()) {
10766     auto *M = cast<MemSDNode>(Op);
10769     LoadPtrI.Offset = Op->getConstantOperandVal(5);
10789 case Intrinsic::amdgcn_end_cf:
10791 Op->getOperand(2), Chain),
10793 case Intrinsic::amdgcn_s_barrier_init:
10794 case Intrinsic::amdgcn_s_barrier_signal_var: {
10801 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10802 ? AMDGPU::S_BARRIER_INIT_M0
10803 : AMDGPU::S_BARRIER_SIGNAL_M0;
10818 constexpr unsigned ShAmt = 16;
10830 case Intrinsic::amdgcn_s_barrier_join: {
10837 if (isa<ConstantSDNode>(BarOp)) {
10838 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
10839 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10842 unsigned BarID = (BarVal >> 4) & 0x3F;
10847 Opc = AMDGPU::S_BARRIER_JOIN_M0;
10863 case Intrinsic::amdgcn_s_prefetch_data: {
10866 return Op.getOperand(0);
10869 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10871         Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
10878                                    Op->getVTList(), Ops, M->getMemoryVT(),
10879                                    M->getMemOperand());
10884       return lowerImage(Op, ImageDimIntr, DAG, true);
10894 Addr->getFlags().hasNoUnsignedWrap()) ||
10909std::pair<SDValue, SDValue>
10916 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10924     C1 = cast<ConstantSDNode>(N0.getOperand(1));
10939     unsigned Overflow = ImmOffset & ~MaxImm;
10940     ImmOffset -= Overflow;
10941     if ((int32_t)Overflow < 0) {
10942       Overflow += ImmOffset;
10947       auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10951 SDValue Ops[] = {N0, OverflowVal};
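// --- Illustrative sketch (editorial addition, not part of the LLVM source). ---
// When a constant buffer offset exceeds the immediate field, the overflow is
// peeled off into the register operand, as in the Overflow computation above.
// Assuming MaxImm is an all-ones low-bit value such as 0xfff:
static void splitBufferImm(uint32_t Combined, uint32_t MaxImm,
                           uint32_t &ImmOffset, uint32_t &Overflow) {
  ImmOffset = Combined;
  Overflow = ImmOffset & ~MaxImm; // bits that do not fit the immediate field
  ImmOffset -= Overflow;          // what remains is encodable directly
}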
10966 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10968                                         Align Alignment) const {
10971   if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10974     if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10985     int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10987         TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11004 SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11007     return MaybePointer;
11021   SDValue NumRecords = Op->getOperand(3);
11024   auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11027   std::optional<uint32_t> ConstStride = std::nullopt;
11028   if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
11029     ConstStride = ConstNode->getZExtValue();
11032   if (!ConstStride || *ConstStride != 0) {
11035     ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
11046 NewHighHalf, NumRecords, Flags);
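// --- Illustrative sketch (editorial addition, not part of the LLVM source). ---
// A sketch of the packing implied by `*ConstStride << 16` above, assuming the
// stride field of the descriptor word starts at bit 16 of the word that also
// carries the pointer's high half:
static uint32_t makeRsrcWord1(uint32_t PointerHi, uint32_t Stride) {
  return PointerHi | (Stride << 16); // stride occupies the upper half-word
}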
11056                                              bool IsTFE) const {
11066   SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11094   if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11098   Ops[1] = BufferStoreExt;
11103                                  M->getMemOperand());
11128                                              DAGCombinerInfo &DCI) const {
11144   if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11151          "unexpected vector extload");
11164          "unexpected fp extload");
11182   DCI.AddToWorklist(Cvt.getNode());
11187   DCI.AddToWorklist(Cvt.getNode());
11198   if (Info.isEntryFunction())
11199 return Info.getUserSGPRInfo().hasFlatScratchInit();
11207   EVT MemVT = Load->getMemoryVT();
11220   EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11248   assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
11249          "Custom lowering for non-i32 vectors hasn't been implemented.");
11252   unsigned AS = Load->getAddressSpace();
11276       Alignment >= Align(4) && NumElements < 32) {
11290     if (NumElements > 4)
11309     if (NumElements > 2)
11314     if (NumElements > 4)
11326   auto Flags = Load->getMemOperand()->getFlags();
11328                                   Load->getAlign(), Flags, &Fast) &&
11337                                       MemVT, *Load->getMemOperand())) {
11346   EVT VT = Op.getValueType();
11383   EVT VT = Op.getValueType();
11386   bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
11392     if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11395     if (CLHS->isExactlyValue(1.0)) {
11412     if (CLHS->isExactlyValue(-1.0)) {
11421   if (!AllowInaccurateRcp &&
11422       ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
11436   EVT VT = Op.getValueType();
11439   bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
11440   if (!AllowInaccurateDiv)
11461   return DAG.getNode(Opcode, SL, VT, A, B, Flags);
11475   return DAG.getNode(Opcode, SL, VTList,
11484   return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
11498   return DAG.getNode(Opcode, SL, VTList,
11504   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11505     return FastLowered;
11508   EVT VT = Op.getValueType();
11515   if (VT == MVT::bf16) {
11538   unsigned FMADOpCode =
11545   SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
11547   Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
11548   Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
11571   const APFloat K0Val(0x1p+96f);
11574   const APFloat K1Val(0x1p-32f);
11601   assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
11602   uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
11603 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
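// --- Illustrative sketch (editorial addition, not part of the LLVM source). ---
// The denorm-mode immediate packs two 2-bit fields: the low two bits select
// the f32 denormal mode and the next two bits the f64/f16 mode, which is why
// the double-precision default is shifted left by two above:
static uint32_t packDenormMode(uint32_t SPDenormMode, uint32_t DPDenormMode) {
  return (SPDenormMode & 0x3) | ((DPDenormMode & 0x3) << 2);
}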
11608   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11609     return FastLowered;
11616   Flags.setNoFPExcept(true);
11637   using namespace AMDGPU::Hwreg;
11638   const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
11646   const bool HasDynamicDenormals =
11652   if (!PreservesDenormals) {
11660     if (HasDynamicDenormals) {
11664       SavedDenormMode = SDValue(GetReg, 0);
11672       const SDValue EnableDenormValue =
11679       const SDValue EnableDenormValue =
11681       EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
11682                                         {EnableDenormValue, BitField, Glue});
11692                              ApproxRcp, One, NegDivScale0, Flags);
11695                              ApproxRcp, Fma0, Flags);
11701                              NumeratorScaled, Mul, Flags);
11707                              NumeratorScaled, Fma3, Flags);
11709   if (!PreservesDenormals) {
11721     assert(HasDynamicDenormals == (bool)SavedDenormMode);
11722     const SDValue DisableDenormValue =
11723         HasDynamicDenormals
11728             AMDGPU::S_SETREG_B32, SL, MVT::Other,
11739                              {Fma4, Fma1, Fma3, Scale}, Flags);
11745   if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
11746     return FastLowered;
11814   EVT VT = Op.getValueType();
11816   if (VT == MVT::f32)
11817     return LowerFDIV32(Op, DAG);
11819   if (VT == MVT::f64)
11820     return LowerFDIV64(Op, DAG);
11822   if (VT == MVT::f16 || VT == MVT::bf16)
11823     return LowerFDIV16(Op, DAG);
11832   EVT ResultExpVT = Op->getValueType(1);
11833   EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11863   if (VT == MVT::i1) {
11867                         Store->getBasePtr(), MVT::i1, Store->getMemOperand());
11871          Store->getValue().getValueType().getScalarType() == MVT::i32);
11873   unsigned AS = Store->getAddressSpace();
11892     if (NumElements > 4)
11899                                  VT, *Store->getMemOperand()))
11909     if (NumElements > 2)
11913     if (NumElements > 4 ||
11922   auto Flags = Store->getMemOperand()->getFlags();
11957   MVT VT = Op.getValueType().getSimpleVT();
12128   EVT VT = Op.getValueType();
12145   switch (Op.getOpcode()) {
12172   EVT VT = Op.getValueType();
12180                                  Op->getVTList(), Ops, VT,
12189 SITargetLowering::performUCharToFloatCombine(SDNode *N,
12190                                              DAGCombinerInfo &DCI) const {
12191   EVT VT = N->getValueType(0);
12193 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12200 EVT SrcVT = Src.getValueType();
12206 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12209     DCI.AddToWorklist(Cvt.getNode());
12212   if (ScalarVT != MVT::f32) {
12224                                              DAGCombinerInfo &DCI) const {
12225   SDValue MagnitudeOp = N->getOperand(0);
12226   SDValue SignOp = N->getOperand(1);
12254   for (unsigned I = 0; I != NumElts; ++I) {
12278   if (NewElts.size() == 1)
12300   for (unsigned I = 0; I != NumElts; ++I) {
12335 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12337                                                DAGCombinerInfo &DCI) const {
12367   AM.HasBaseReg = true;
12368   AM.BaseOffs = Offset.getSExtValue();
12373   EVT VT = N->getValueType(0);
12379   Flags.setNoUnsignedWrap(
12380       N->getFlags().hasNoUnsignedWrap() &&
12390   switch (N->getOpcode()) {
12401                                                 DAGCombinerInfo &DCI) const {
12409   SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
12410                                         N->getMemoryVT(), DCI);
12414   NewOps[PtrIdx] = NewPtr;
12423   return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
12424          (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
12433 SDValue SITargetLowering::splitBinaryBitConstantOp(
12459   if (V.getValueType() != MVT::i1)
12461   switch (V.getOpcode()) {
12478     return V.getResNo() == 1;
12480     unsigned IntrinsicID = V.getConstantOperandVal(0);
12481     switch (IntrinsicID) {
12482     case Intrinsic::amdgcn_is_shared:
12483     case Intrinsic::amdgcn_is_private:
12500   if (!(C & 0x000000ff))
12501     ZeroByteMask |= 0x000000ff;
12502   if (!(C & 0x0000ff00))
12503     ZeroByteMask |= 0x0000ff00;
12504   if (!(C & 0x00ff0000))
12505     ZeroByteMask |= 0x00ff0000;
12506   if (!(C & 0xff000000))
12507     ZeroByteMask |= 0xff000000;
12508   uint32_t NonZeroByteMask = ~ZeroByteMask;
12509   if ((NonZeroByteMask & C) != NonZeroByteMask)
12522   assert(V.getValueSizeInBits() == 32);
12524   if (V.getNumOperands() != 2)
12533   switch (V.getOpcode()) {
12538     return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
12543     return (0x03020100 & ~ConstMask) | ConstMask;
12550     return uint32_t((0x030201000c0c0c0cull << C) >> 32);
12556     return uint32_t(0x0c0c0c0c03020100ull >> C);
12563                                            DAGCombinerInfo &DCI) const {
12564   if (DCI.isBeforeLegalize())
12568   EVT VT = N->getValueType(0);
12573   if (VT == MVT::i64 && CRHS) {
12579   if (CRHS && VT == MVT::i32) {
12588     if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
12589       unsigned Shift = CShift->getZExtValue();
12591       unsigned Offset = NB + Shift;
12592       if ((Offset & (Bits - 1)) == 0) {
12610         isa<ConstantSDNode>(LHS.getOperand(2))) {
12616       Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
12631     if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
12636       if (X != LHS.getOperand(1))
12641           dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
12674        (RHS.getOperand(0) == LHS.getOperand(0) &&
12675         LHS.getOperand(0) == LHS.getOperand(1))) {
12677       unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
12678                                           : Mask->getZExtValue() & OrdMask;
12699       N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12702     if (LHSMask != ~0u && RHSMask != ~0u) {
12705       if (LHSMask > RHSMask) {
12712       uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12713       uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12716       if (!(LHSUsedLanes & RHSUsedLanes) &&
12719           !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12726       for (unsigned I = 0; I < 32; I += 8) {
12728         if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
12729           Mask &= (0x0c << I) & 0xffffffff;
12787 static const std::optional<ByteProvider<SDValue>>
12789                  unsigned Depth = 0) {
12792     return std::nullopt;
12794   if (Op.getValueSizeInBits() < 8)
12795     return std::nullopt;
12797   if (Op.getValueType().isVector())
12800   switch (Op->getOpcode()) {
12811     auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12812 NarrowVT = VTSign->getVT();
12815 return std::nullopt;
12818 if (SrcIndex >= NarrowByteWidth)
12819 return std::nullopt;
12825     auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12827       return std::nullopt;
12829     uint64_t BitShift = ShiftOp->getZExtValue();
12831     if (BitShift % 8 != 0)
12832       return std::nullopt;
12834     SrcIndex += BitShift / 8;
12852 static const std::optional<ByteProvider<SDValue>>
12854                        unsigned StartingIndex = 0) {
12858 return std::nullopt;
12860   unsigned BitWidth = Op.getScalarValueSizeInBits();
12862     return std::nullopt;
12864     return std::nullopt;
12866   bool IsVec = Op.getValueType().isVector();
12867   switch (Op.getOpcode()) {
12870       return std::nullopt;
12875       return std::nullopt;
12879       return std::nullopt;
12882     if (!LHS->isConstantZero() && !RHS->isConstantZero())
12883       return std::nullopt;
12884     if (!LHS || LHS->isConstantZero())
12886     if (!RHS || RHS->isConstantZero())
12888     return std::nullopt;
12893       return std::nullopt;
12895     auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12897 return std::nullopt;
12899 uint32_t BitMask = BitMaskOp->getZExtValue();
12901 uint32_t IndexMask = 0xFF << (Index * 8);
12903 if ((IndexMask & BitMask) != IndexMask) {
12906 if (IndexMask & BitMask)
12907 return std::nullopt;
12916 return std::nullopt;
12919     auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12920     if (!ShiftOp || Op.getValueType().isVector())
12921       return std::nullopt;
12923     uint64_t BitsProvided = Op.getValueSizeInBits();
12924     if (BitsProvided % 8 != 0)
12925       return std::nullopt;
12927     uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
12929       return std::nullopt;
12931     uint64_t ConcatSizeInBytes = BitsProvided / 4;
12932     uint64_t ByteShift = BitShift / 8;
12934     uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12935     uint64_t BytesProvided = BitsProvided / 8;
12936     SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
12937     NewIndex %= BytesProvided;
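// --- Illustrative sketch (editorial addition, not part of the LLVM source). ---
// For a funnel shift by a multiple of eight bits, result byte `Index` comes
// from the two operands concatenated and rotated by `ByteShift` bytes; the
// arithmetic above reduces that to (which operand, which byte within it):
static void fshrByteSource(unsigned Index, unsigned ByteShift,
                           unsigned BytesPerOp, unsigned &OpIdx,
                           unsigned &ByteIdx) {
  unsigned NewIndex = (Index + ByteShift) % (2 * BytesPerOp);
  OpIdx = NewIndex >= BytesPerOp ? 0 : 1; // high or low half of the funnel
  ByteIdx = NewIndex % BytesPerOp;
}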
12944 return std::nullopt;
12946 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12948 return std::nullopt;
12950 uint64_t BitShift = ShiftOp->getZExtValue();
12952 return std::nullopt;
12954 auto BitsProvided =
Op.getScalarValueSizeInBits();
12955 if (BitsProvided % 8 != 0)
12956 return std::nullopt;
12958 uint64_t BytesProvided = BitsProvided / 8;
12959 uint64_t ByteShift = BitShift / 8;
12964 return BytesProvided - ByteShift > Index
12972 return std::nullopt;
12974 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12976 return std::nullopt;
12978 uint64_t BitShift = ShiftOp->getZExtValue();
12979 if (BitShift % 8 != 0)
12980 return std::nullopt;
12981 uint64_t ByteShift = BitShift / 8;
12987 return Index < ByteShift
12990 Depth + 1, StartingIndex);
12999 return std::nullopt;
13006 auto *VTSign = cast<VTSDNode>(
Op->getOperand(1));
13007 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13009 if (NarrowBitWidth % 8 != 0)
13010 return std::nullopt;
13011 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13013 if (Index >= NarrowByteWidth)
13015 ? std::optional<ByteProvider<SDValue>>(
13023 return std::nullopt;
13027 if (NarrowByteWidth >= Index) {
13032 return std::nullopt;
13039 return std::nullopt;
13043 auto *L = cast<LoadSDNode>(
Op.getNode());
13045 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13046 if (NarrowBitWidth % 8 != 0)
13047 return std::nullopt;
13048 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13053 if (Index >= NarrowByteWidth) {
13055 ? std::optional<ByteProvider<SDValue>>(
13060 if (NarrowByteWidth > Index) {
13064 return std::nullopt;
13069 return std::nullopt;
13072 Depth + 1, StartingIndex);
13076     auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13078       return std::nullopt;
13079     auto VecIdx = IdxOp->getZExtValue();
13080     auto ScalarSize = Op.getScalarValueSizeInBits();
13081 if (ScalarSize < 32)
13082 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13084 StartingIndex, Index);
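// --- Illustrative sketch (editorial addition, not part of the LLVM source). ---
// When tracing a byte through EXTRACT_VECTOR_ELT of a sub-dword vector, the
// byte index is rebased onto the 32-bit container holding the element, as in
// the ScalarSize == 8 / 16 adjustment above:
static unsigned rebaseByteIndex(unsigned VecIdx, unsigned ByteInElt,
                                unsigned ScalarSizeBits) {
  return ScalarSizeBits == 8 ? VecIdx            // one byte per element
                             : VecIdx * 2 + ByteInElt; // 16-bit elements
}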
13089 return std::nullopt;
13091     auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13093       return std::nullopt;
13096         (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13097     if (IdxMask > 0x07 && IdxMask != 0x0c)
13098       return std::nullopt;
13100     auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13101     auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13103     return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13109 return std::nullopt;
13124 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
13128 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13131 auto MemVT = L->getMemoryVT();
13134 return L->getMemoryVT().getSizeInBits() == 16;
13144 int Low8 = Mask & 0xff;
13145 int Hi8 = (Mask & 0xff00) >> 8;
13147 assert(Low8 < 8 && Hi8 < 8);
13149 bool IsConsecutive = (Hi8 - Low8 == 1);
13154 bool Is16Aligned = !(Low8 % 2);
13156 return IsConsecutive && Is16Aligned;
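The pair test above (13144-13156) is pure mask arithmetic, so it can be exercised standalone. A minimal sketch, assuming the usual byte-select encoding where each mask byte indexes one of eight source bytes; the function name is illustrative, not the in-tree helper:

#include <cassert>
#include <cstdio>

// Classify two adjacent byte selects of a perm-style mask (one byte index
// 0-7 per lane) as a single contiguous, 16-bit-aligned access.
static bool pairIs16BitAccess(int Mask) {
  int Low8 = Mask & 0xff;          // byte selected into the low lane
  int Hi8 = (Mask & 0xff00) >> 8;  // byte selected into the high lane
  assert(Low8 < 8 && Hi8 < 8);
  bool IsConsecutive = (Hi8 - Low8 == 1); // adjacent source bytes
  bool Is16Aligned = !(Low8 % 2);         // starting on an even byte
  return IsConsecutive && Is16Aligned;
}

int main() {
  printf("%d %d %d\n",
         pairIs16BitAccess(0x0504),  // bytes 4,5 -> aligned 16-bit half
         pairIs16BitAccess(0x0100),  // bytes 0,1 -> aligned 16-bit half
         pairIs16BitAccess(0x0201)); // bytes 1,2 -> not aligned
}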
13164 int Low16 = PermMask & 0xffff;
13165 int Hi16 = (PermMask & 0xffff0000) >> 16;
13175 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13177 if (!OtherOpIs16Bit)
13185 unsigned DWordOffset) {
13188 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
13190 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13195 if (Src.getValueType().isVector()) {
13196 auto ScalarTySize = Src.getScalarValueSizeInBits();
13197 auto ScalarTy = Src.getValueType().getScalarType();
13198 if (ScalarTySize == 32) {
13202 if (ScalarTySize > 32) {
13205 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13206 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13213 assert(ScalarTySize < 32);
13214 auto NumElements = TypeSize / ScalarTySize;
13215 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13216 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13217 auto NumElementsIn32 = 32 / ScalarTySize;
13218 auto NumAvailElements = DWordOffset < Trunc32Elements
13220 : NumElements - NormalizedTrunc;
13233 auto ShiftVal = 32 * DWordOffset;
13241 [[maybe_unused]] EVT VT = N->getValueType(0);
13246 for (int i = 0; i < 4; i++) {
13248 std::optional<ByteProvider<SDValue>> P =
13251 if (!P || P->isConstantZero())
13256 if (PermNodes.size() != 4)
13259 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13260 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13262 for (size_t i = 0; i < PermNodes.size(); i++) {
13263 auto PermOp = PermNodes[i];
13266 int SrcByteAdjust = 4;
13270 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13271 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13273 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13274 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13278 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13279 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13282 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13284 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13287 SDValue Op = *PermNodes[FirstSrc.first].Src;
13289 assert(Op.getValueSizeInBits() == 32);
13293 int Low16 = PermMask & 0xffff;
13294 int Hi16 = (PermMask & 0xffff0000) >> 16;
13296 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13297 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13300 if (WellFormedLow && WellFormedHi)
13304 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13313 assert(Op.getValueType().isByteSized() &&
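The "well formed" masks checked above (0x0504/0x0100 and 0x0706/0x0302) read as whole 16-bit halves once the byte-select convention is spelled out. A standalone sketch of the select semantics this matching assumes (selector bytes 0-3 come from the second source, 4-7 from the first, 0x0c produces zero); it is a reading aid, not the hardware definition:

#include <cstdint>
#include <cstdio>

// Model of the byte-select behaviour the perm matching assumes: each
// selector byte 0-7 picks a byte of the 64-bit value {Src0:Src1}
// (0-3 from Src1, 4-7 from Src0), and selector 0x0c yields 0.
static uint32_t permB32(uint32_t Sel, uint32_t Src0, uint32_t Src1) {
  uint64_t Combined = ((uint64_t)Src0 << 32) | Src1;
  uint32_t Result = 0;
  for (int Byte = 0; Byte < 4; ++Byte) {
    uint32_t S = (Sel >> (8 * Byte)) & 0xff;
    uint8_t Picked = S == 0x0c ? 0 : (Combined >> (8 * S)) & 0xff;
    Result |= (uint32_t)Picked << (8 * Byte);
  }
  return Result;
}

int main() {
  // Mask 0x07060302 keeps the high half of Src0 over the high half of Src1,
  // which is why 0x0706/0x0302 count as "well formed" 16-bit halves.
  printf("0x%08x\n", permB32(0x07060302, 0xAABBCCDD, 0x11223344));
}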
13331 DAGCombinerInfo &DCI) const {
13336 EVT VT = N->getValueType(0);
13337 if (VT == MVT::i1) {
13342 if (Src != RHS.getOperand(0))
13347 if (!CLHS || !CRHS)
13351 static const uint32_t MaxMask = 0x3ff;
13366 isa<ConstantSDNode>(LHS.getOperand(2))) {
13371 Sel |= LHS.getConstantOperandVal(2);
13380 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13384 auto usesCombinedOperand = [](SDNode *OrUse) {
13387 !OrUse->getValueType(0).isVector())
13391 for (auto *VUser : OrUse->users()) {
13392 if (!VUser->getValueType(0).isVector())
13399 if (VUser->getOpcode() == VectorwiseOp)
13405 if (!any_of(N->users(), usesCombinedOperand))
13411 if (LHSMask != ~0u && RHSMask != ~0u) {
13414 if (LHSMask > RHSMask) {
13421 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13422 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13425 if (!(LHSUsedLanes & RHSUsedLanes) &&
13428 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13430 LHSMask &= ~RHSUsedLanes;
13431 RHSMask &= ~LHSUsedLanes;
13433 LHSMask |= LHSUsedLanes & 0x04040404;
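The mask surgery above is easier to follow in isolation. A sketch of the merge, assuming both inputs mark undefined lanes with the 0x0c "constant zero" selector; the SDWA special case that the in-tree combine also rejects is omitted here:

#include <cstdint>
#include <cstdio>

// Merge the masks of two perm-like values feeding an OR: lanes one side
// does not define carry 0x0c, so the other side's selector wins after the
// cross-kill; kept LHS selectors are biased by 4 because the merged
// V_PERM_B32 reads the LHS source from the high half of {Src0:Src1}.
static uint32_t mergeOrPermMasks(uint32_t LHSMask, uint32_t RHSMask) {
  uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
  uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
  if (LHSUsedLanes & RHSUsedLanes)
    return 0; // overlapping lanes: the combine bails out in this case
  LHSMask &= ~RHSUsedLanes;             // drop LHS's zero lanes covered by RHS
  RHSMask &= ~LHSUsedLanes;             // and vice versa
  LHSMask |= LHSUsedLanes & 0x04040404; // redirect kept LHS lanes to Src0
  return LHSMask | RHSMask;
}

int main() {
  // LHS defines the low two lanes, RHS the high two.
  printf("0x%08x\n", mergeOrPermMasks(0x0c0c0100, 0x03020c0c)); // 0x03020504
}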
13443 if (LHSMask == ~0u || RHSMask == ~0u) {
13449 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
13464 if (SrcVT == MVT::i32) {
13469 DCI.AddToWorklist(LowOr.getNode());
13470 DCI.AddToWorklist(HiBits.getNode());
13478 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
13481 N->getOperand(0), CRHS))
13489 DAGCombinerInfo &DCI) const {
13490 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
13499 EVT VT = N->getValueType(0);
13500 if (CRHS && VT == MVT::i64) {
13522 LHS->getOperand(0), FNegLHS, FNegRHS);
13531 DAGCombinerInfo &DCI) const {
13536 EVT VT = N->getValueType(0);
13537 if (VT != MVT::i32)
13541 if (Src.getValueType() != MVT::i16)
13548SITargetLowering::performSignExtendInRegCombine(SDNode *N,
13549 DAGCombinerInfo &DCI) const {
13551 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
13556 VTSign->getVT() == MVT::i8) ||
13558 VTSign->getVT() == MVT::i16))) {
13560 "s_buffer_load_{u8, i8} are supported "
13561 "in GFX12 (or newer) architectures.");
13562 EVT VT = Src.getValueType();
13567 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
13573 auto *M = cast<MemSDNode>(Src);
13574 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
13575 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
13580 VTSign->getVT() == MVT::i8) ||
13582 VTSign->getVT() == MVT::i16)) &&
13584 auto *M = cast<MemSDNode>(Src);
13585 SDValue Ops[] = {Src.getOperand(0),
13591 Src.getOperand(6), Src.getOperand(7)};
13594 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
13598 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
13599 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
13600 return DCI.DAG.getMergeValues(
13607 DAGCombinerInfo &DCI)
const {
13615 if (
N->getOperand(0).isUndef())
13622 DAGCombinerInfo &DCI)
const {
13623 EVT VT =
N->getValueType(0);
13648 unsigned MaxDepth)
const {
13649 unsigned Opcode =
Op.getOpcode();
13653 if (
auto *CFP = dyn_cast<ConstantFPSDNode>(
Op)) {
13654 const auto &
F = CFP->getValueAPF();
13655 if (
F.isNaN() &&
F.isSignaling())
13657 if (!
F.isDenormal())
13720 if (
Op.getValueType() == MVT::i32) {
13725 if (
auto *
RHS = dyn_cast<ConstantSDNode>(
Op.getOperand(1))) {
13726 if (
RHS->getZExtValue() == 0xffff0000) {
13736 return Op.getValueType().getScalarType() != MVT::f16;
13806 if (
Op.getValueType() == MVT::i16) {
13817 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
13819 switch (IntrinsicID) {
13820 case Intrinsic::amdgcn_cvt_pkrtz:
13821 case Intrinsic::amdgcn_cubeid:
13822 case Intrinsic::amdgcn_frexp_mant:
13823 case Intrinsic::amdgcn_fdot2:
13824 case Intrinsic::amdgcn_rcp:
13825 case Intrinsic::amdgcn_rsq:
13826 case Intrinsic::amdgcn_rsq_clamp:
13827 case Intrinsic::amdgcn_rcp_legacy:
13828 case Intrinsic::amdgcn_rsq_legacy:
13829 case Intrinsic::amdgcn_trig_preop:
13830 case Intrinsic::amdgcn_tanh:
13831 case Intrinsic::amdgcn_log:
13832 case Intrinsic::amdgcn_exp2:
13833 case Intrinsic::amdgcn_sqrt:
13851 unsigned MaxDepth)
const {
13854 unsigned Opcode =
MI->getOpcode();
13856 if (Opcode == AMDGPU::G_FCANONICALIZE)
13859 std::optional<FPValueAndVReg> FCR;
13862 if (FCR->Value.isSignaling())
13864 if (!FCR->Value.isDenormal())
13875 case AMDGPU::G_FADD:
13876 case AMDGPU::G_FSUB:
13877 case AMDGPU::G_FMUL:
13878 case AMDGPU::G_FCEIL:
13879 case AMDGPU::G_FFLOOR:
13880 case AMDGPU::G_FRINT:
13881 case AMDGPU::G_FNEARBYINT:
13882 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13883 case AMDGPU::G_INTRINSIC_TRUNC:
13884 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13885 case AMDGPU::G_FMA:
13886 case AMDGPU::G_FMAD:
13887 case AMDGPU::G_FSQRT:
13888 case AMDGPU::G_FDIV:
13889 case AMDGPU::G_FREM:
13890 case AMDGPU::G_FPOW:
13891 case AMDGPU::G_FPEXT:
13892 case AMDGPU::G_FLOG:
13893 case AMDGPU::G_FLOG2:
13894 case AMDGPU::G_FLOG10:
13895 case AMDGPU::G_FPTRUNC:
13896 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13897 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13898 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13899 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13900 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13902 case AMDGPU::G_FNEG:
13903 case AMDGPU::G_FABS:
13904 case AMDGPU::G_FCOPYSIGN:
13906 case AMDGPU::G_FMINNUM:
13907 case AMDGPU::G_FMAXNUM:
13908 case AMDGPU::G_FMINNUM_IEEE:
13909 case AMDGPU::G_FMAXNUM_IEEE:
13910 case AMDGPU::G_FMINIMUM:
13911 case AMDGPU::G_FMAXIMUM:
13912 case AMDGPU::G_FMINIMUMNUM:
13913 case AMDGPU::G_FMAXIMUMNUM: {
13921 case AMDGPU::G_BUILD_VECTOR:
13926 case AMDGPU::G_INTRINSIC:
13927 case AMDGPU::G_INTRINSIC_CONVERGENT:
13929 case Intrinsic::amdgcn_fmul_legacy:
13930 case Intrinsic::amdgcn_fmad_ftz:
13931 case Intrinsic::amdgcn_sqrt:
13932 case Intrinsic::amdgcn_fmed3:
13933 case Intrinsic::amdgcn_sin:
13934 case Intrinsic::amdgcn_cos:
13935 case Intrinsic::amdgcn_log:
13936 case Intrinsic::amdgcn_exp2:
13937 case Intrinsic::amdgcn_log_clamp:
13938 case Intrinsic::amdgcn_rcp:
13939 case Intrinsic::amdgcn_rcp_legacy:
13940 case Intrinsic::amdgcn_rsq:
13941 case Intrinsic::amdgcn_rsq_clamp:
13942 case Intrinsic::amdgcn_rsq_legacy:
13943 case Intrinsic::amdgcn_div_scale:
13944 case Intrinsic::amdgcn_div_fmas:
13945 case Intrinsic::amdgcn_div_fixup:
13946 case Intrinsic::amdgcn_fract:
13947 case Intrinsic::amdgcn_cvt_pkrtz:
13948 case Intrinsic::amdgcn_cubeid:
13949 case Intrinsic::amdgcn_cubema:
13950 case Intrinsic::amdgcn_cubesc:
13951 case Intrinsic::amdgcn_cubetc:
13952 case Intrinsic::amdgcn_frexp_mant:
13953 case Intrinsic::amdgcn_fdot2:
13954 case Intrinsic::amdgcn_trig_preop:
13955 case Intrinsic::amdgcn_tanh:
13974 if (
C.isDenormal()) {
13988 if (
C.isSignaling()) {
14007 return Op.isUndef() || isa<ConstantFPSDNode>(
Op);
14011SITargetLowering::performFCanonicalizeCombine(
SDNode *
N,
14012 DAGCombinerInfo &DCI)
const {
14015 EVT VT =
N->getValueType(0);
14024 EVT VT =
N->getValueType(0);
14025 return getCanonicalConstantFP(DAG,
SDLoc(
N), VT, CFP->getValueAPF());
14041 EVT EltVT =
Lo.getValueType();
14044 for (
unsigned I = 0;
I != 2; ++
I) {
14048 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14049 }
else if (
Op.isUndef()) {
14061 if (isa<ConstantFPSDNode>(NewElts[1]))
14062 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14068 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14122 if (!MinK || !MaxK)
14135 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14136 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
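The med3 rewrite rests on a simple scalar identity: clamping with min and max of two constants is the median of the three values whenever the constants are ordered. A small self-checking sketch:

#include <algorithm>
#include <cassert>
#include <cstdint>

// med3(a, b, c) returns the middle of the three values.
static int32_t med3(int32_t A, int32_t B, int32_t C) {
  return std::max(std::min(A, B), std::min(std::max(A, B), C));
}

int main() {
  // The clamp pattern min(max(x, MinK), MaxK) equals med3(x, MinK, MaxK)
  // whenever MinK <= MaxK, which is what the combine above relies on.
  const int32_t MinK = -3, MaxK = 5;
  for (int32_t X = -10; X <= 10; ++X)
    assert(std::min(std::max(X, MinK), MaxK) == med3(X, MinK, MaxK));
}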
14201 if (
Info->getMode().DX10Clamp) {
14210 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->
hasMed3_16())) {
14244 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.
hasMin3Max3_16()) ||
14255 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.
hasMin3Max3_16());
14264 DAGCombinerInfo &DCI)
const {
14267 EVT VT =
N->getValueType(0);
14268 unsigned Opc =
N->getOpcode();
14297 if (
SDValue Med3 = performIntMed3ImmCombine(
14302 if (
SDValue Med3 = performIntMed3ImmCombine(
14308 if (
SDValue Med3 = performIntMed3ImmCombine(
14313 if (
SDValue Med3 = performIntMed3ImmCombine(
14328 (VT == MVT::f32 || VT == MVT::f64 ||
14334 if (
SDValue Res = performFPMed3ImmCombine(DAG,
SDLoc(
N), Op0, Op1))
14356 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
14357 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
14366 DAGCombinerInfo &DCI)
const {
14367 EVT VT =
N->getValueType(0);
14390 if (
Info->getMode().DX10Clamp) {
14393 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
14396 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
14399 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
14410 DAGCombinerInfo &DCI)
const {
14414 return DCI.DAG.getUNDEF(
N->getValueType(0));
14422 bool IsDivergentIdx,
14427 unsigned VecSize = EltSize * NumElem;
14430 if (VecSize <= 64 && EltSize < 32)
14439 if (IsDivergentIdx)
14443 unsigned NumInsts = NumElem +
14444 ((EltSize + 31) / 32) * NumElem ;
14449 return NumInsts <= 16;
14454 return NumInsts <= 15;
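The two thresholds above sit on top of a rough instruction-count estimate for expanding a divergent dynamic vector index into a compare/select chain. A sketch of that estimate (the function name is illustrative):

#include <cstdio>

// Rough cost the heuristic assigns to expanding a divergent dynamic vector
// index: one compare per element plus one 32-bit select per element per
// dword of element width.
static unsigned dynIndexExpansionCost(unsigned EltSize, unsigned NumElem) {
  return NumElem + ((EltSize + 31) / 32) * NumElem;
}

int main() {
  printf("v4i32: %u insts\n", dynIndexExpansionCost(32, 4)); // 8
  printf("v8f16: %u insts\n", dynIndexExpansionCost(16, 8)); // 16
  printf("v4f64: %u insts\n", dynIndexExpansionCost(64, 4)); // 12
}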
14461 if (isa<ConstantSDNode>(
Idx))
14475SITargetLowering::performExtractVectorEltCombine(
SDNode *
N,
14476 DAGCombinerInfo &DCI)
const {
14482 EVT ResVT = N->getValueType(0);
14501 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
14529 DCI.AddToWorklist(Elt0.getNode());
14530 DCI.AddToWorklist(Elt1.getNode());
14552 if (!DCI.isBeforeLegalize())
14558 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
14559 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
14560 VecSize > 32 && VecSize % 32 == 0 && Idx) {
14563 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
14564 unsigned EltIdx = BitIndex / 32;
14565 unsigned LeftoverBitIdx = BitIndex % 32;
14569 DCI.AddToWorklist(Cast.getNode());
14573 DCI.AddToWorklist(Elt.getNode());
14576 DCI.AddToWorklist(Srl.getNode());
14580 DCI.AddToWorklist(Trunc.getNode());
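The rewrite above turns a sub-dword extract from a loaded vector into a 32-bit word index plus a shift and truncate. A scalar model of the index arithmetic, assuming element widths below 32 bits:

#include <cstdint>
#include <cstdio>

// Pick element Idx of a vector of EltBits-wide lanes stored in an array of
// 32-bit words: index the containing word, then shift the leftover bit
// offset down and mask (EltBits must be < 32 here).
static uint32_t extractSubDwordElt(const uint32_t *Words, unsigned EltBits,
                                   unsigned Idx) {
  unsigned BitIndex = Idx * EltBits;
  unsigned EltIdx = BitIndex / 32;         // which 32-bit word
  unsigned LeftoverBitIdx = BitIndex % 32; // offset inside that word
  return (Words[EltIdx] >> LeftoverBitIdx) & ((1u << EltBits) - 1);
}

int main() {
  uint32_t Words[2] = {0x44332211, 0x88776655}; // a v8i8 spread over two dwords
  printf("elt5 = 0x%02x\n", extractSubDwordElt(Words, 8, 5)); // 0x66
}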
14582 if (VecEltVT == ResVT) {
14594SITargetLowering::performInsertVectorEltCombine(
SDNode *
N,
14595 DAGCombinerInfo &DCI)
const {
14609 EVT IdxVT =
Idx.getValueType();
14626 Src.getOperand(0).getValueType() == MVT::f16) {
14627 return Src.getOperand(0);
14630 if (
auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
14631 APFloat Val = CFP->getValueAPF();
14632 bool LosesInfo =
true;
14642 DAGCombinerInfo &DCI)
const {
14644 "combine only useful on gfx8");
14646 SDValue TruncSrc =
N->getOperand(0);
14647 EVT VT =
N->getValueType(0);
14648 if (VT != MVT::f16)
14686unsigned SITargetLowering::getFusedOpcode(
const SelectionDAG &DAG,
14688 const SDNode *N1)
const {
14693 if (((VT == MVT::f32 &&
14695 (VT == MVT::f16 && Subtarget->
hasMadF16() &&
14715 EVT VT =
N->getValueType(0);
14716 if (VT != MVT::i32 && VT != MVT::i64)
14722 unsigned Opc =
N->getOpcode();
14777 if (!Const ||
Hi_32(Const->getZExtValue()) !=
uint32_t(-1))
14796 DAGCombinerInfo &DCI)
const {
14800 EVT VT =
N->getValueType(0);
14810 if (!
N->isDivergent() && Subtarget->
hasSMulHi())
14814 if (NumBits <= 32 || NumBits > 64)
14826 unsigned NumUsers = 0;
14830 if (!
User->isAnyAdd())
14854 bool MulSignedLo =
false;
14855 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
14864 if (VT != MVT::i64) {
14887 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
14889 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
14890 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
14892 if (!MulLHSUnsigned32) {
14899 if (!MulRHSUnsigned32) {
14910 if (VT != MVT::i64)
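The mad64_32 path depends on one arithmetic fact: if both multiplicands of a 64-bit multiply are known to fit in 32 bits, the multiply-add can be formed from a single 32x32->64 operation. A small self-checking sketch of that equivalence (the high-half correction terms added when only one side is known 32-bit are not modelled here):

#include <cassert>
#include <cstdint>

// Scalar model of mad_u64_u32: a 32x32->64 multiply plus a 64-bit addend.
static uint64_t madU64U32(uint32_t A, uint32_t B, uint64_t C) {
  return (uint64_t)A * B + C;
}

int main() {
  uint64_t A = 0x12345678, B = 0x9ABCDEF0, C = 0x0123456789ABCDEFull;
  // Reference: the full 64-bit multiply-add with zero-extended operands.
  assert(A * B + C == madU64U32((uint32_t)A, (uint32_t)B, C));
}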
14916SITargetLowering::foldAddSub64WithZeroLowBitsTo32(
SDNode *
N,
14917 DAGCombinerInfo &DCI)
const {
14919 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14942 unsigned Opcode = N->getOpcode();
14946 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
14957static std::optional<ByteProvider<SDValue>>
14960 if (!Byte0 || Byte0->isConstantZero()) {
14961 return std::nullopt;
14964 if (Byte1 && !Byte1->isConstantZero()) {
14965 return std::nullopt;
14971 unsigned FirstCs = First & 0x0c0c0c0c;
14972 unsigned SecondCs = Second & 0x0c0c0c0c;
14973 unsigned FirstNoCs = First & ~0x0c0c0c0c;
14974 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14976 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14977 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14978 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14979 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14981 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15005 for (
int BPI = 0; BPI < 2; BPI++) {
15008 BPP = {Src1, Src0};
15010 unsigned ZeroMask = 0x0c0c0c0c;
15011 unsigned FMask = 0xFF << (8 * (3 - Step));
15013 unsigned FirstMask =
15014 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15015 unsigned SecondMask =
15016 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15020 int FirstGroup = -1;
15021 for (
int I = 0;
I < 2;
I++) {
15023 auto MatchesFirst = [&BPP](
DotSrc &IterElt) {
15024 return IterElt.SrcOp == *BPP.first.Src &&
15025 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15029 if (Match != Srcs.
end()) {
15030 Match->PermMask =
addPermMasks(FirstMask, Match->PermMask);
15035 if (FirstGroup != -1) {
15037 auto MatchesSecond = [&BPP](
DotSrc &IterElt) {
15038 return IterElt.SrcOp == *BPP.second.Src &&
15039 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15042 if (Match != Srcs.
end()) {
15043 Match->PermMask =
addPermMasks(SecondMask, Match->PermMask);
15045 Srcs.
push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15053 unsigned ZeroMask = 0x0c0c0c0c;
15054 unsigned FMask = 0xFF << (8 * (3 - Step));
15058 ((Src0.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15062 ((Src1.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15071 if (Srcs.
size() == 1) {
15072 auto *Elt = Srcs.
begin();
15076 if (Elt->PermMask == 0x3020100)
15083 auto *FirstElt = Srcs.
begin();
15084 auto *SecondElt = std::next(FirstElt);
15091 auto FirstMask = FirstElt->PermMask;
15092 auto SecondMask = SecondElt->PermMask;
15094 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15095 unsigned FirstPlusFour = FirstMask | 0x04040404;
15098 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15110 FirstElt = std::next(SecondElt);
15111 if (FirstElt == Srcs.
end())
15114 SecondElt = std::next(FirstElt);
15117 if (SecondElt == Srcs.
end()) {
15123 DAG.
getConstant(FirstElt->PermMask, SL, MVT::i32)));
15129 return Perms.
size() == 2
15135 for (
auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15136 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15137 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15138 EntryMask += ZeroMask;
15143 auto Opcode = Op.getOpcode();
15149static std::optional<bool>
15160 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15163 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15165 assert(!(S0IsUnsigned && S0IsSigned));
15166 assert(!(S1IsUnsigned && S1IsSigned));
15174 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15180 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15181 return std::nullopt;
15193 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15194 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15199 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15205 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15206 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15207 return std::nullopt;
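The signedness check above is essentially a small decision table over "known signed", "known unsigned", and "unknown" operands. A sketch of that table with illustrative names; the rationale comments are interpretation, not quoted from the source:

#include <cstdio>
#include <optional>

// Decide whether a dot4 candidate should use the signed or unsigned form,
// or be rejected. Each operand is known non-negative, known sign-extended
// negative, or unknown; mixing known-signed with known-unsigned defeats
// both v_dot4 forms.
static std::optional<bool> pickDot4Signedness(bool S0Signed, bool S0Unsigned,
                                              bool S1Signed, bool S1Unsigned) {
  if ((S0Unsigned && S1Unsigned) || (S0Signed && S1Signed))
    return S0Signed;             // both agree
  if ((S0Unsigned && S1Signed) || (S0Signed && S1Unsigned))
    return std::nullopt;         // contradictory
  if ((S0Signed && !(S1Signed || S1Unsigned)) ||
      (S1Signed && !(S0Signed || S0Unsigned)))
    return true;                 // known-signed with unknown: signed dot
  if (!(S0Signed || S0Unsigned) && !(S1Signed || S1Unsigned))
    return false;                // nothing known: default to unsigned
  return std::nullopt;           // known-unsigned with unknown: give up
}

int main() {
  auto R = pickDot4Signedness(false, true, false, false);
  printf("%s\n", R ? (*R ? "signed" : "unsigned") : "mixed/unknown");
}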
15213 DAGCombinerInfo &DCI) const {
15215 EVT VT = N->getValueType(0);
15222 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
15227 if (
SDValue V = reassociateScalarOps(
N, DAG)) {
15231 if (VT == MVT::i64) {
15232 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
15239 std::optional<bool> IsSigned;
15245 int ChainLength = 0;
15246 for (
int I = 0;
I < 4;
I++) {
15247 auto MulIdx =
isMul(LHS) ? 0 :
isMul(RHS) ? 1 : -1;
15250 auto Src0 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15253 auto Src1 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15258 TempNode->getOperand(MulIdx), *Src0, *Src1,
15259 TempNode->getOperand(MulIdx)->getOperand(0),
15260 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15264 IsSigned = *IterIsSigned;
15265 if (*IterIsSigned != *IsSigned)
15268 auto AddIdx = 1 - MulIdx;
15271 if (
I == 2 &&
isMul(TempNode->getOperand(AddIdx))) {
15272 Src2s.
push_back(TempNode->getOperand(AddIdx));
15282 TempNode->getOperand(AddIdx), *Src0, *Src1,
15283 TempNode->getOperand(AddIdx)->getOperand(0),
15284 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15288 if (*IterIsSigned != *IsSigned)
15292 ChainLength =
I + 2;
15296 TempNode = TempNode->getOperand(AddIdx);
15298 ChainLength =
I + 1;
15299 if (TempNode->getNumOperands() < 2)
15301 LHS = TempNode->getOperand(0);
15302 RHS = TempNode->getOperand(1);
15305 if (ChainLength < 2)
15311 if (ChainLength < 4) {
15321 bool UseOriginalSrc =
false;
15322 if (ChainLength == 4 && Src0s.
size() == 1 && Src1s.
size() == 1 &&
15323 Src0s.
begin()->PermMask == Src1s.
begin()->PermMask &&
15324 Src0s.
begin()->SrcOp.getValueSizeInBits() >= 32 &&
15325 Src1s.
begin()->SrcOp.getValueSizeInBits() >= 32) {
15327 auto Src0Mask = Src0s.
begin()->PermMask;
15328 SrcBytes.
push_back(Src0Mask & 0xFF000000);
15329 bool UniqueEntries =
true;
15330 for (
auto I = 1;
I < 4;
I++) {
15331 auto NextByte = Src0Mask & (0xFF << ((3 -
I) * 8));
15334 UniqueEntries =
false;
15340 if (UniqueEntries) {
15341 UseOriginalSrc =
true;
15343 auto *FirstElt = Src0s.begin();
15347 auto *SecondElt = Src1s.begin();
15349 SecondElt->DWordOffset);
15358 if (!UseOriginalSrc) {
15365 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
15368 : Intrinsic::amdgcn_udot4,
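As a reading aid for the chain matching above, this is the scalar computation the selected amdgcn_udot4 performs: an accumulator plus four byte-wise products of two packed 32-bit sources (sketch of the unsigned form only):

#include <cstdint>
#include <cstdio>

// Scalar reference for v_dot4_u32_u8: acc + sum of byte-wise products.
static uint32_t udot4(uint32_t A, uint32_t B, uint32_t Acc) {
  uint32_t Sum = Acc;
  for (int I = 0; I < 4; ++I)
    Sum += ((A >> (8 * I)) & 0xff) * ((B >> (8 * I)) & 0xff);
  return Sum;
}

int main() {
  printf("0x%08x\n", udot4(0x01020304, 0x05060708, 100)); // 0x000000aa
}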
15378 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
15383 unsigned Opc =
LHS.getOpcode();
15395 auto Cond =
RHS.getOperand(0);
15417 DAGCombinerInfo &DCI)
const {
15420 EVT VT =
N->getValueType(0);
15442 DCI.AddToWorklist(Inner.
getNode());
15450 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
15459 if (VT == MVT::i64) {
15460 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
15469 dyn_cast<GlobalAddressSDNode>(GAValue)) {
15476 DCI.AddToWorklist(Inner.
getNode());
15507 if (ZIsConstant != YIsConstant) {
15511 DCI.AddToWorklist(Inner.
getNode());
15519 assert(!YIsConstant && !ZIsConstant);
15521 if (!
X->isDivergent() &&
Y->isDivergent() !=
Z->isDivergent()) {
15530 if (
Y->isDivergent())
15533 DCI.AddToWorklist(UniformInner.
getNode());
15541 DAGCombinerInfo &DCI)
const {
15543 EVT VT =
N->getValueType(0);
15545 if (VT == MVT::i64) {
15546 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
15550 if (VT != MVT::i32)
15559 unsigned Opc =
RHS.getOpcode();
15566 auto Cond =
RHS.getOperand(0);
15589SITargetLowering::performAddCarrySubCarryCombine(
SDNode *
N,
15590 DAGCombinerInfo &DCI)
const {
15592 if (
N->getValueType(0) != MVT::i32)
15603 unsigned LHSOpc =
LHS.getOpcode();
15604 unsigned Opc =
N->getOpcode();
15614 DAGCombinerInfo &DCI)
const {
15619 EVT VT =
N->getValueType(0);
15631 if (A == LHS.getOperand(1)) {
15632 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
15633 if (FusedOp != 0) {
15635 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
15643 if (A == RHS.getOperand(1)) {
15644 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
15645 if (FusedOp != 0) {
15647 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
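The fold above rewrites (a + a) + b into a fused multiply-add by 2.0 when a fused opcode is available and contraction is allowed; since doubling is exact, the two forms agree. A tiny self-checking sketch:

#include <cassert>
#include <cmath>

int main() {
  // (a + a) + b versus fma(a, 2.0, b): multiplying by 2.0 is exact, so the
  // single-rounding fused form produces the same value here.
  double A = 1.25, B = -3.5;
  assert((A + A) + B == std::fma(A, 2.0, B));
}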
15656 DAGCombinerInfo &DCI)
const {
15662 EVT VT =
N->getValueType(0);
15675 if (
A ==
LHS.getOperand(1)) {
15676 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
15677 if (FusedOp != 0) {
15681 return DAG.
getNode(FusedOp, SL, VT,
A, Two, NegRHS);
15690 if (
A ==
RHS.getOperand(1)) {
15691 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
15692 if (FusedOp != 0) {
15694 return DAG.
getNode(FusedOp, SL, VT,
A, NegTwo, LHS);
15703 DAGCombinerInfo &DCI)
const {
15706 EVT VT =
N->getValueType(0);
15707 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->
has16BitInsts())
15720 bool IsNegative =
false;
15721 if (CLHS->isExactlyValue(1.0) ||
15722 (IsNegative = CLHS->isExactlyValue(-1.0))) {
15738 DAGCombinerInfo &DCI)
const {
15740 EVT VT =
N->getValueType(0);
15744 if (!
N->isDivergent() &&
getSubtarget()->hasSALUFloatInsts() &&
15745 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
15760 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
15775 if (ScalarVT == MVT::f32 &&
15781 if (TrueNodeExpVal == INT_MIN)
15784 if (FalseNodeExpVal == INT_MIN)
15804 DAGCombinerInfo &DCI)
const {
15806 EVT VT =
N->getValueType(0);
15827 (
N->getFlags().hasAllowContract() &&
15828 FMA->getFlags().hasAllowContract())) {
15862 if (Vec1 == Vec2 || Vec3 == Vec4)
15868 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
15877 DAGCombinerInfo &DCI)
const {
15883 EVT VT =
LHS.getValueType();
15884 ISD::CondCode CC = cast<CondCodeSDNode>(
N->getOperand(2))->get();
15886 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15888 CRHS = dyn_cast<ConstantSDNode>(LHS);
15912 return LHS.getOperand(0);
15918 isa<ConstantSDNode>(
LHS.getOperand(1)) &&
15919 isa<ConstantSDNode>(
LHS.getOperand(2)) &&
15920 LHS.getConstantOperandVal(1) !=
LHS.getConstantOperandVal(2) &&
15927 const APInt &CT =
LHS.getConstantOperandAPInt(1);
15928 const APInt &CF =
LHS.getConstantOperandAPInt(2);
15936 return LHS.getOperand(0);
15940 if (VT != MVT::f32 && VT != MVT::f64 &&
15956 const unsigned IsInfMask =
15958 const unsigned IsFiniteMask =
15972SITargetLowering::performCvtF32UByteNCombine(
SDNode *
N,
15973 DAGCombinerInfo &DCI)
const {
15991 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
15995 unsigned ShiftOffset = 8 * Offset;
15997 ShiftOffset -= C->getZExtValue();
15999 ShiftOffset += C->getZExtValue();
16001 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16003 MVT::f32, Shifted);
16014 DCI.AddToWorklist(N);
16021 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
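The shift folding above moves a byte-aligned constant shift into the byte index of the conversion. A scalar sketch of cvt_f32_ubyteN that makes the equivalence easy to check:

#include <cassert>
#include <cstdint>

// cvt_f32_ubyteN converts byte N of a 32-bit value to float; the combine
// above turns cvt_f32_ubyte0 (srl x, 16) into cvt_f32_ubyte2 x.
static float cvtF32UByte(uint32_t X, unsigned N) {
  return (float)((X >> (8 * N)) & 0xff);
}

int main() {
  uint32_t X = 0xDEADBEEF;
  assert(cvtF32UByte(X >> 16, 0) == cvtF32UByte(X, 2)); // both read 0xAD
}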
16027 DAGCombinerInfo &DCI)
const {
16037 return DCI.DAG.getConstantFP(Zero,
SDLoc(
N),
N->getValueType(0));
16040 APFloat One(
F.getSemantics(),
"1.0");
16042 return DCI.DAG.getConstantFP(One,
SDLoc(
N),
N->getValueType(0));
16048 DAGCombinerInfo &DCI)
const {
16069 bool isFloatingPoint =
LHS.getValueType().isFloatingPoint();
16070 bool isInteger =
LHS.getValueType().isInteger();
16073 if (!isFloatingPoint && !isInteger)
16078 if (!isEquality && !isNonEquality)
16082 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
16083 (isInteger && isa<ConstantSDNode>(RHS))) {
16086 }
else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
16087 (isInteger && isa<ConstantSDNode>(LHS))) {
16095 if (isFloatingPoint) {
16096 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
16101 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
16108 if (!(isEquality && TrueVal == ConstVal) &&
16109 !(isNonEquality && FalseVal == ConstVal))
16112 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
16114 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
16116 SelectLHS, SelectRHS);
16121 switch (
N->getOpcode()) {
16137 if (
auto Res = promoteUniformOpToI32(
SDValue(
N, 0), DCI))
16147 switch (
N->getOpcode()) {
16149 return performAddCombine(
N, DCI);
16151 return performPtrAddCombine(
N, DCI);
16153 return performSubCombine(
N, DCI);
16156 return performAddCarrySubCarryCombine(
N, DCI);
16158 return performFAddCombine(
N, DCI);
16160 return performFSubCombine(
N, DCI);
16162 return performFDivCombine(
N, DCI);
16164 return performFMulCombine(
N, DCI);
16166 return performSetCCCombine(
N, DCI);
16168 if (
auto Res = performSelectCombine(
N, DCI))
16185 return performMinMaxCombine(
N, DCI);
16187 return performFMACombine(
N, DCI);
16189 return performAndCombine(
N, DCI);
16191 return performOrCombine(
N, DCI);
16194 if (
N->getValueType(0) == MVT::i32 &&
N->isDivergent() &&
16195 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16201 return performXorCombine(
N, DCI);
16203 return performZeroExtendCombine(
N, DCI);
16205 return performSignExtendInRegCombine(
N, DCI);
16207 return performClassCombine(
N, DCI);
16209 return performFCanonicalizeCombine(
N, DCI);
16211 return performRcpCombine(
N, DCI);
16226 return performUCharToFloatCombine(
N, DCI);
16228 return performFCopySignCombine(
N, DCI);
16233 return performCvtF32UByteNCombine(
N, DCI);
16235 return performFMed3Combine(
N, DCI);
16237 return performCvtPkRTZCombine(
N, DCI);
16239 return performClampCombine(
N, DCI);
16242 EVT VT =
N->getValueType(0);
16245 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16248 EVT EltVT = Src.getValueType();
16249 if (EltVT != MVT::i16)
16259 return performExtractVectorEltCombine(
N, DCI);
16261 return performInsertVectorEltCombine(
N, DCI);
16263 return performFPRoundCombine(
N, DCI);
16265 if (
SDValue Widened = widenLoad(cast<LoadSDNode>(
N), DCI))
16271 if (
MemSDNode *MemNode = dyn_cast<MemSDNode>(
N))
16272 return performMemSDNodeCombine(MemNode, DCI);
16303 unsigned Opcode =
Node->getMachineOpcode();
16306 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16307 if (D16Idx >= 0 &&
Node->getConstantOperandVal(D16Idx))
16312 unsigned DmaskIdx =
16313 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16314 unsigned OldDmask =
Node->getConstantOperandVal(DmaskIdx);
16315 unsigned NewDmask = 0;
16316 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16317 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16318 bool UsesTFC = (int(TFEIdx) >= 0 &&
Node->getConstantOperandVal(TFEIdx)) ||
16319 (
int(LWEIdx) >= 0 &&
Node->getConstantOperandVal(LWEIdx));
16320 unsigned TFCLane = 0;
16321 bool HasChain =
Node->getNumValues() > 1;
16323 if (OldDmask == 0) {
16331 TFCLane = OldBitsSet;
16338 if (
Use.getResNo() != 0)
16344 if (!
User->isMachineOpcode() ||
16345 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
16357 if (UsesTFC && Lane == TFCLane) {
16362 for (
unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
16364 Dmask &= ~(1 << Comp);
16372 NewDmask |= 1 << Comp;
16377 bool NoChannels = !NewDmask;
16384 if (OldBitsSet == 1)
16390 if (NewDmask == OldDmask)
16399 unsigned NewChannels = BitsSet + UsesTFC;
16403 assert(NewOpcode != -1 &&
16404 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
16405 "failed to find equivalent MIMG op");
16413 MVT SVT =
Node->getValueType(0).getVectorElementType().getSimpleVT();
16415 MVT ResultVT = NewChannels == 1
16418 : NewChannels == 5 ? 8
16432 if (NewChannels == 1) {
16442 for (
unsigned i = 0,
Idx = AMDGPU::sub0; i < 5; ++i) {
16447 if (i || !NoChannels)
16452 if (NewUser !=
User) {
16462 Idx = AMDGPU::sub1;
16465 Idx = AMDGPU::sub2;
16468 Idx = AMDGPU::sub3;
16471 Idx = AMDGPU::sub4;
16482 Op =
Op.getOperand(0);
16484 return isa<FrameIndexSDNode>(
Op);
16494 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
16495 SDValue SrcVal = Node->getOperand(2);
16503 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
16505 SDNode *Glued = Node->getGluedNode();
16507 Node->getOperand(0), SL, VReg, SrcVal,
16513 return ToResultReg.
getNode();
16518 for (
unsigned i = 0; i < Node->getNumOperands(); ++i) {
16526 Node->getOperand(i).getValueType(),
16527 Node->getOperand(i)),
16539 unsigned Opcode = Node->getMachineOpcode();
16541 if (
TII->isImage(Opcode) && !
TII->get(Opcode).mayStore() &&
16542 !
TII->isGather4(Opcode) &&
16544 return adjustWritemask(Node, DAG);
16547 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
16553 case AMDGPU::V_DIV_SCALE_F32_e64:
16554 case AMDGPU::V_DIV_SCALE_F64_e64: {
16558 SDValue Src0 = Node->getOperand(1);
16559 SDValue Src1 = Node->getOperand(3);
16560 SDValue Src2 = Node->getOperand(5);
16564 (Src0 == Src1 || Src0 == Src2))
16620 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::vdata);
16621 unsigned InitIdx = 0;
16623 if (
TII->isImage(
MI)) {
16631 unsigned TFEVal = TFE ? TFE->
getImm() : 0;
16632 unsigned LWEVal = LWE ? LWE->
getImm() : 0;
16633 unsigned D16Val = D16 ? D16->getImm() : 0;
16635 if (!TFEVal && !LWEVal)
16646 assert(MO_Dmask &&
"Expected dmask operand in instruction");
16648 unsigned dmask = MO_Dmask->
getImm();
16655 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
16661 TRI.getRegSizeInBits(*
TII->getOpRegClass(
MI, DstIdx)) / 32;
16662 if (DstSize < InitIdx)
16665 InitIdx =
TRI.getRegSizeInBits(*
TII->getOpRegClass(
MI, DstIdx)) / 32;
16673 Register PrevDst =
MRI.cloneVirtualRegister(
MI.getOperand(DstIdx).getReg());
16674 unsigned NewDst = 0;
16683 for (; SizeLeft; SizeLeft--, CurrIdx++) {
16684 NewDst =
MRI.createVirtualRegister(
TII->getOpRegClass(
MI, DstIdx));
16704 MI.tieOperands(DstIdx,
MI.getNumOperands() - 1);
16717 if (
TII->isVOP3(
MI.getOpcode())) {
16719 TII->legalizeOperandsVOP3(
MRI,
MI);
16724 if (!
MI.getDesc().operands().empty()) {
16725 unsigned Opc =
MI.getOpcode();
16726 bool HasAGPRs =
Info->mayNeedAGPRs();
16728 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src2);
16730 {AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src0),
16731 AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src1), Src2Idx}) {
16734 if ((
I == Src2Idx) && (HasAGPRs))
16737 if (!
Op.isReg() || !
Op.getReg().isVirtual())
16739 auto *RC =
TRI->getRegClassForReg(
MRI,
Op.getReg());
16740 if (!
TRI->hasAGPRs(RC))
16742 auto *Src =
MRI.getUniqueVRegDef(
Op.getReg());
16743 if (!Src || !Src->isCopy() ||
16744 !
TRI->isSGPRReg(
MRI, Src->getOperand(1).getReg()))
16746 auto *NewRC =
TRI->getEquivalentVGPRClass(RC);
16750 MRI.setRegClass(
Op.getReg(), NewRC);
16753 if (
TII->isMAI(
MI)) {
16758 int Src0Idx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
16759 AMDGPU::OpName::scale_src0);
16760 if (Src0Idx != -1) {
16761 int Src1Idx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
16762 AMDGPU::OpName::scale_src1);
16763 if (
TII->usesConstantBus(
MRI,
MI, Src0Idx) &&
16764 TII->usesConstantBus(
MRI,
MI, Src1Idx))
16765 TII->legalizeOpWithMove(
MI, Src1Idx);
16773 if (
auto *Src2 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src2)) {
16774 if (Src2->isReg() && Src2->getReg().isVirtual()) {
16775 auto *RC =
TRI->getRegClassForReg(
MRI, Src2->getReg());
16776 if (
TRI->isVectorSuperClass(RC)) {
16777 auto *NewRC =
TRI->getEquivalentAGPRClass(RC);
16778 MRI.setRegClass(Src2->getReg(), NewRC);
16779 if (Src2->isTied())
16780 MRI.setRegClass(
MI.getOperand(0).getReg(), NewRC);
16789 if (
TII->isImage(
MI))
16790 TII->enforceOperandRCAlignment(
MI, AMDGPU::OpName::vaddr);
16864std::pair<unsigned, const TargetRegisterClass *>
16871 if (Constraint.
size() == 1) {
16875 if (VT == MVT::Other)
16878 switch (Constraint[0]) {
16885 RC = &AMDGPU::SReg_32RegClass;
16888 RC = &AMDGPU::SGPR_64RegClass;
16893 return std::pair(0U,
nullptr);
16901 : &AMDGPU::VGPR_32RegClass;
16906 return std::pair(0U,
nullptr);
16915 RC = &AMDGPU::AGPR_32RegClass;
16920 return std::pair(0U,
nullptr);
16929 RC = &AMDGPU::AV_32RegClass;
16932 RC =
TRI->getVectorSuperClassForBitWidth(
BitWidth);
16934 return std::pair(0U,
nullptr);
16943 return std::pair(0U, RC);
16946 if (Kind !=
'\0') {
16948 RC = &AMDGPU::VGPR_32RegClass;
16949 }
else if (Kind ==
's') {
16950 RC = &AMDGPU::SGPR_32RegClass;
16951 }
else if (Kind ==
'a') {
16952 RC = &AMDGPU::AGPR_32RegClass;
16958 return std::pair(0U,
nullptr);
16964 return std::pair(0U,
nullptr);
16968 RC =
TRI->getVGPRClassForBitWidth(Width);
16970 RC =
TRI->getSGPRClassForBitWidth(Width);
16972 RC =
TRI->getAGPRClassForBitWidth(Width);
16974 Reg =
TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
16979 return std::pair(0U,
nullptr);
16981 return std::pair(Reg, RC);
16987 return std::pair(0U,
nullptr);
16988 if (Idx < RC->getNumRegs())
16995 Ret.second =
TRI->getPhysRegBaseClass(Ret.first);
17001 if (Constraint.
size() == 1) {
17002 switch (Constraint[0]) {
17012 }
else if (Constraint ==
"DA" || Constraint ==
"DB") {
17020 if (Constraint.
size() == 1) {
17021 switch (Constraint[0]) {
17029 }
else if (Constraint.
size() == 2) {
17030 if (Constraint ==
"VA")
17041 Val = Val & maskTrailingOnes<uint64_t>(
Size);
17048 std::vector<SDValue> &Ops,
17063 unsigned Size =
Op.getScalarValueSizeInBits();
17071 Val =
C->getSExtValue();
17075 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
17081 if (
Op.getOperand(0).isUndef() ||
Op.getOperand(1).isUndef())
17084 Val =
C->getSExtValue();
17088 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
17098 if (Constraint.
size() == 1) {
17099 switch (Constraint[0]) {
17103 return isInt<16>(Val);
17107 return isInt<32>(Val);
17114 }
else if (Constraint.
size() == 2) {
17115 if (Constraint ==
"DA") {
17116 int64_t HiBits =
static_cast<int32_t
>(Val >> 32);
17117 int64_t LoBits =
static_cast<int32_t
>(Val);
17121 if (Constraint ==
"DB") {
17129 unsigned MaxSize)
const {
17130 unsigned Size = std::min<unsigned>(
Op.getScalarValueSizeInBits(), MaxSize);
17133 MVT VT =
Op.getSimpleValueType();
17158 switch (UnalignedClassID) {
17159 case AMDGPU::VReg_64RegClassID:
17160 return AMDGPU::VReg_64_Align2RegClassID;
17161 case AMDGPU::VReg_96RegClassID:
17162 return AMDGPU::VReg_96_Align2RegClassID;
17163 case AMDGPU::VReg_128RegClassID:
17164 return AMDGPU::VReg_128_Align2RegClassID;
17165 case AMDGPU::VReg_160RegClassID:
17166 return AMDGPU::VReg_160_Align2RegClassID;
17167 case AMDGPU::VReg_192RegClassID:
17168 return AMDGPU::VReg_192_Align2RegClassID;
17169 case AMDGPU::VReg_224RegClassID:
17170 return AMDGPU::VReg_224_Align2RegClassID;
17171 case AMDGPU::VReg_256RegClassID:
17172 return AMDGPU::VReg_256_Align2RegClassID;
17173 case AMDGPU::VReg_288RegClassID:
17174 return AMDGPU::VReg_288_Align2RegClassID;
17175 case AMDGPU::VReg_320RegClassID:
17176 return AMDGPU::VReg_320_Align2RegClassID;
17177 case AMDGPU::VReg_352RegClassID:
17178 return AMDGPU::VReg_352_Align2RegClassID;
17179 case AMDGPU::VReg_384RegClassID:
17180 return AMDGPU::VReg_384_Align2RegClassID;
17181 case AMDGPU::VReg_512RegClassID:
17182 return AMDGPU::VReg_512_Align2RegClassID;
17183 case AMDGPU::VReg_1024RegClassID:
17184 return AMDGPU::VReg_1024_Align2RegClassID;
17185 case AMDGPU::AReg_64RegClassID:
17186 return AMDGPU::AReg_64_Align2RegClassID;
17187 case AMDGPU::AReg_96RegClassID:
17188 return AMDGPU::AReg_96_Align2RegClassID;
17189 case AMDGPU::AReg_128RegClassID:
17190 return AMDGPU::AReg_128_Align2RegClassID;
17191 case AMDGPU::AReg_160RegClassID:
17192 return AMDGPU::AReg_160_Align2RegClassID;
17193 case AMDGPU::AReg_192RegClassID:
17194 return AMDGPU::AReg_192_Align2RegClassID;
17195 case AMDGPU::AReg_256RegClassID:
17196 return AMDGPU::AReg_256_Align2RegClassID;
17197 case AMDGPU::AReg_512RegClassID:
17198 return AMDGPU::AReg_512_Align2RegClassID;
17199 case AMDGPU::AReg_1024RegClassID:
17200 return AMDGPU::AReg_1024_Align2RegClassID;
17216 if (
Info->isEntryFunction()) {
17223 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17225 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17226 :
TRI->getAlignedHighSGPRForRC(MF, 2,
17227 &AMDGPU::SGPR_64RegClass);
17228 Info->setSGPRForEXECCopy(SReg);
17231 Info->getStackPtrOffsetReg()));
17232 if (
Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17233 MRI.replaceRegWith(AMDGPU::SP_REG,
Info->getStackPtrOffsetReg());
17237 if (
Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17238 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG,
Info->getScratchRSrcReg());
17240 if (
Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17241 MRI.replaceRegWith(AMDGPU::FP_REG,
Info->getFrameOffsetReg());
17243 Info->limitOccupancy(MF);
17245 if (ST.isWave32() && !MF.
empty()) {
17246 for (
auto &
MBB : MF) {
17247 for (
auto &
MI :
MBB) {
17248 TII->fixImplicitOperands(
MI);
17258 if (ST.needsAlignedVGPRs()) {
17259 for (
unsigned I = 0, E =
MRI.getNumVirtRegs();
I != E; ++
I) {
17265 if (NewClassID != -1)
17266 MRI.setRegClass(Reg,
TRI->getRegClass(NewClassID));
17275 const APInt &DemandedElts,
17277 unsigned Depth)
const {
17279 unsigned Opc =
Op.getOpcode();
17282 unsigned IID =
Op.getConstantOperandVal(0);
17284 case Intrinsic::amdgcn_mbcnt_lo:
17285 case Intrinsic::amdgcn_mbcnt_hi: {
17291 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17301 Op, Known, DemandedElts, DAG,
Depth);
17317 unsigned MaxValue =
17324 unsigned BFEWidth,
bool SExt,
unsigned Depth) {
17328 unsigned Src1Cst = 0;
17329 if (Src1.isImm()) {
17330 Src1Cst = Src1.getImm();
17331 } else if (Src1.isReg()) {
17335 Src1Cst = Cst->Value.getZExtValue();
17343 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
17344 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
17346 if (Width >= BFEWidth)
17355 Known = Known.sext(BFEWidth);
17357 Known = Known.zext(BFEWidth);
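The decoding above treats src1 of S_BFE as a packed offset/width pair: the low 5 bits (6 for the 64-bit form) give the bit offset and bits 16-21 the field width. A scalar sketch of the unsigned 32-bit form; the width >= 32 case is handled only loosely here, since the known-bits code above simply gives up on it:

#include <cstdint>
#include <cstdio>

// Scalar model of S_BFE_U32 with src1 = (width << 16) | offset.
static uint32_t sBfeU32(uint32_t Src0, uint32_t Src1) {
  unsigned Offset = Src1 & 0x1f;        // 32-bit form: 5 offset bits
  unsigned Width = (Src1 >> 16) & 0x3f; // 6 width bits
  if (Width == 0)
    return 0;
  if (Width >= 32)
    return Src0 >> Offset;              // treat as "rest of the value"
  return (Src0 >> Offset) & ((1u << Width) - 1);
}

int main() {
  // Extract an 8-bit field starting at bit 4: offset=4, width=8.
  printf("0x%x\n", sBfeU32(0xDEADBEEF, (8u << 16) | 4)); // 0xee
}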
17363 unsigned Depth)
const {
17366 switch (
MI->getOpcode()) {
17367 case AMDGPU::S_BFE_I32:
17370 case AMDGPU::S_BFE_U32:
17373 case AMDGPU::S_BFE_I64:
17376 case AMDGPU::S_BFE_U64:
17379 case AMDGPU::G_INTRINSIC:
17380 case AMDGPU::G_INTRINSIC_CONVERGENT: {
17383 case Intrinsic::amdgcn_workitem_id_x:
17386 case Intrinsic::amdgcn_workitem_id_y:
17389 case Intrinsic::amdgcn_workitem_id_z:
17392 case Intrinsic::amdgcn_mbcnt_lo:
17393 case Intrinsic::amdgcn_mbcnt_hi: {
17405 case Intrinsic::amdgcn_groupstaticsize: {
17416 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
17419 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
17422 case AMDGPU::G_AMDGPU_SMED3:
17423 case AMDGPU::G_AMDGPU_UMED3: {
17424 auto [Dst, Src0, Src1, Src2] =
MI->getFirst4Regs();
17451 unsigned Depth)
const {
17453 if (
auto *GI = dyn_cast<GIntrinsic>(
MI)) {
17460 if (
MaybeAlign RetAlign = Attrs.getRetAlignment())
17487 if (Header->getAlignment() != PrefAlign)
17488 return Header->getAlignment();
17490 unsigned LoopSize = 0;
17498 LoopSize +=
TII->getInstSizeInBytes(
MI);
17499 if (LoopSize > 192)
17504 if (LoopSize <= 64)
17507 if (LoopSize <= 128)
17508 return CacheLineAlign;
17514 auto I = Exit->getFirstNonDebugInstr();
17515 if (
I != Exit->end() &&
I->getOpcode() == AMDGPU::S_INST_PREFETCH)
17516 return CacheLineAlign;
17525 if (PreTerm == Pre->
begin() ||
17526 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
17530 auto ExitHead = Exit->getFirstNonDebugInstr();
17531 if (ExitHead == Exit->end() ||
17532 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
17537 return CacheLineAlign;
17545 N =
N->getOperand(0).getNode();
17555 switch (
N->getOpcode()) {
17563 if (Reg.isPhysical() ||
MRI.isLiveIn(Reg))
17564 return !
TRI->isSGPRReg(
MRI, Reg);
17570 return !
TRI->isSGPRReg(
MRI, Reg);
17574 unsigned AS = L->getAddressSpace();
17605 if (
auto *
A = dyn_cast<AtomicSDNode>(
N)) {
17607 return A->readMem() &&
A->writeMem();
17640 const APInt &DemandedElts,
17643 unsigned Depth)
const {
17648 if (
Info->getMode().DX10Clamp)
17660 if (RMW->
hasMetadata(
"amdgpu.ignore.denormal.mode"))
17680 <<
"Hardware instruction generated for atomic "
17682 <<
" operation at memory scope " << MemScope;
17686 if (
auto *VT = dyn_cast<FixedVectorType>(Ty)) {
17687 Type *EltTy = VT->getElementType();
17688 return VT->getNumElements() == 2 &&
17707 if (
auto *
IT = dyn_cast<IntegerType>(Ty)) {
17708 unsigned BW =
IT->getBitWidth();
17709 return BW == 32 || BW == 64;
17721 if (
PointerType *PT = dyn_cast<PointerType>(Ty)) {
17723 unsigned BW =
DL.getPointerSizeInBits(PT->getAddressSpace());
17724 return BW == 32 || BW == 64;
17731 return VT->getNumElements() == 2 &&
17732 VT->getElementType()->getPrimitiveSizeInBits() == 16;
17742 bool HasSystemScope) {
17749 if (HasSystemScope) {
17758 return RMW->
hasMetadata(
"amdgpu.no.fine.grained.memory");
17771 const MDNode *MD =
I->getMetadata(LLVMContext::MD_noalias_addrspace);
17789 DL.getTypeSizeInBits(RMW->
getType()) == 64 &&
17802 bool HasSystemScope =
17844 if (!HasSystemScope &&
17857 if (RMW->
hasMetadata(
"amdgpu.no.fine.grained.memory"))
17865 ConstVal && ConstVal->isNullValue())
18085 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18086 return Subtarget->
isWave64() ? &AMDGPU::SReg_64RegClass
18087 : &AMDGPU::SReg_32RegClass;
18088 if (!
TRI->isSGPRClass(RC) && !isDivergent)
18089 return TRI->getEquivalentSGPRClass(RC);
18090 if (
TRI->isSGPRClass(RC) && isDivergent)
18091 return TRI->getEquivalentVGPRClass(RC);
18103 unsigned WaveSize) {
18108 if (!
IT ||
IT->getBitWidth() != WaveSize)
18111 if (!isa<Instruction>(V))
18113 if (!Visited.
insert(V).second)
18115 bool Result =
false;
18116 for (
const auto *U : V->users()) {
18117 if (
const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
18118 if (V == U->getOperand(1)) {
18119 switch (Intrinsic->getIntrinsicID()) {
18123 case Intrinsic::amdgcn_if_break:
18124 case Intrinsic::amdgcn_if:
18125 case Intrinsic::amdgcn_else:
18130 if (V == U->getOperand(0)) {
18131 switch (Intrinsic->getIntrinsicID()) {
18135 case Intrinsic::amdgcn_end_cf:
18136 case Intrinsic::amdgcn_loop:
18142 Result =
hasCFUser(U, Visited, WaveSize);
18151 const Value *V)
const {
18152 if (
const CallInst *CI = dyn_cast<CallInst>(V)) {
18153 if (CI->isInlineAsm()) {
18162 for (
auto &TC : TargetConstraints) {
18204 return MRI.hasOneNonDBGUse(N0);
18211 if (
I.getMetadata(
"amdgpu.noclobber"))
18213 if (
I.getMetadata(
"amdgpu.last.use"))
18223 if (!Def->isMachineOpcode())
18233 if (
II.isCompare() &&
II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18234 PhysReg = AMDGPU::SCC;
18236 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18291 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
18302 Alignment = RMW->getAlign();
18315 bool FullFlatEmulation =
18319 RMW->getType()->isDoubleTy()));
18322 bool ReturnValueIsUsed = !AI->use_empty();
18331 if (FullFlatEmulation) {
18342 std::prev(BB->end())->eraseFromParent();
18345 Value *LoadedShared = nullptr;
18346 if (FullFlatEmulation) {
18348 {Addr}, nullptr, "is.shared");
18349 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
18357 LoadedShared = Clone;
18364 {Addr}, nullptr, "is.private");
18372 Value *LoadedPrivate;
18375 RMW->getType(), CastToPrivate, RMW->getAlign(),
"loaded.private");
18378 LoadedPrivate, RMW->getValOperand());
18382 auto [ResultLoad, Equal] =
18397 if (FullFlatEmulation) {
18407 if (!FullFlatEmulation) {
18412 MDNode *RangeNotPrivate =
18415 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
18423 if (ReturnValueIsUsed) {
18426 if (FullFlatEmulation)
18441 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
18442 ConstVal && ConstVal->isNullValue()) {
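For orientation, the expansion above builds a three-way dispatch on the flat pointer's actual address space. A scalar model of that structure, with illustrative names; the real code emits basic blocks, clones the atomic for the LDS path, and merges the loaded values with a phi when the result is used:

#include <cstdint>
#include <cstdio>

// Scalar model of the flat atomicrmw-add expansion: dispatch on the
// pointer's runtime address space, mirroring the amdgcn.is.shared /
// amdgcn.is.private branches and the global fallback.
enum class AddrSpace { Shared, Private, Global };

static uint32_t expandFlatAtomicAdd(AddrSpace AS, uint32_t *Ptr, uint32_t Val) {
  uint32_t Loaded;                       // plays the role of the merge phi
  if (AS == AddrSpace::Shared) {         // amdgcn.is.shared branch
    Loaded = *Ptr; *Ptr = Loaded + Val;  // clone of the atomic on an LDS ptr
  } else if (AS == AddrSpace::Private) { // amdgcn.is.private branch
    Loaded = *Ptr; *Ptr = Loaded + Val;  // plain load/add/store on scratch
  } else {                               // global path, known not private
    Loaded = *Ptr; *Ptr = Loaded + Val;  // global atomic, !noalias.addrspace
  }
  return Loaded;                         // feeds users of the atomicrmw result
}

int main() {
  uint32_t X = 5;
  printf("old=%u new=%u\n", expandFlatAtomicAdd(AddrSpace::Private, &X, 7), X);
}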
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
Function Alias Analysis Results
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
Contains matchers for matching SSA Machine Instructions.
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
Contains matchers for matching SelectionDAG nodes and values.
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static cl::opt< bool > UseSelectionDAGPTRADD("amdgpu-use-sdag-ptradd", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " "SelectionDAG ISel"), cl::init(false))
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static uint32_t getIdentityValueForWaveReduction(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
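A minimal standalone sketch of the masking this helper is presumed to perform (the behavior is inferred from the declaration only, and clearUnusedBitsDemo is a made-up stand-in, not the backend's implementation): keep only the low Size bits of a 64-bit immediate.

    #include <cassert>
    #include <cstdint>

    // Hypothetical stand-in for clearUnusedBits(Val, Size): mask Val down to its
    // low Size bits; a Size of 64 leaves the value untouched.
    static uint64_t clearUnusedBitsDemo(uint64_t Val, unsigned Size) {
      return Size < 64 ? Val & ((uint64_t(1) << Size) - 1) : Val;
    }

    int main() {
      assert(clearUnusedBitsDemo(0xFFFF'FFFF'FFFF'FFFFull, 16) == 0xFFFFull);
      assert(clearUnusedBitsDemo(0x1234'5678'9ABC'DEF0ull, 32) == 0x9ABC'DEF0ull);
      return 0;
    }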
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
bool hasBF16PackedInsts() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
bool hasCvtPkF16F32Inst() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool hasBF16ConversionInsts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
bool hasBF16TransInsts() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns whether Op is known never to be any NaN; if SNaN is true, whether Op is known never to be a signaling NaN.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
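split64BitValue hands back a 64-bit Op as two 32-bit SDValues; the plain-integer equivalent of that split looks like the following sketch (an illustration of the arithmetic only, not the DAG-level code):

    #include <cassert>
    #include <cstdint>
    #include <utility>

    // Plain-integer illustration of splitting a 64-bit value into its (Lo, Hi)
    // 32-bit halves, mirroring what the DAG helper returns as two SDValues.
    static std::pair<uint32_t, uint32_t> split64(uint64_t V) {
      return {static_cast<uint32_t>(V), static_cast<uint32_t>(V >> 32)};
    }

    int main() {
      auto [Lo, Hi] = split64(0x0123'4567'89AB'CDEFull);
      assert(Lo == 0x89AB'CDEFu && Hi == 0x0123'4567u);
      return 0;
    }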
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
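The APFloat factories above can be exercised directly; a small sketch, assuming an LLVM build environment with llvm/ADT/APFloat.h on the include path:

    #include "llvm/ADT/APFloat.h"
    #include <cassert>

    using namespace llvm;

    int main() {
      // Positive infinity and negative zero in IEEE single precision.
      APFloat Inf = APFloat::getInf(APFloat::IEEEsingle());
      APFloat NegZero = APFloat::getZero(APFloat::IEEEsingle(), /*Negative=*/true);
      assert(Inf.isInfinity() && !Inf.isNegative());
      assert(NegZero.isNegative());

      // bitcastToAPInt exposes the raw encoding, e.g. 0x7f800000 for +inf.
      assert(Inf.bitcastToAPInt().getZExtValue() == 0x7f800000u);
      return 0;
    }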
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
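A short sketch tying together the APInt bit-setting factories and the signed/unsigned comparisons listed above (again assuming llvm/ADT/APInt.h is available):

    #include "llvm/ADT/APInt.h"
    #include <cassert>

    using namespace llvm;

    int main() {
      // getHighBitsSet(8, 3) -> 0b1110'0000; getBitsSet(8, 2, 5) -> 0b0001'1100.
      APInt Hi = APInt::getHighBitsSet(8, 3);
      APInt Mid = APInt::getBitsSet(8, 2, 5);
      assert(Hi.getZExtValue() == 0xE0 && Mid.getZExtValue() == 0x1C);
      assert(Mid.countr_zero() == 2);

      // setBitsFrom(6) sets bits [6, 8) on top of an existing value.
      APInt V(8, 0x01);
      V.setBitsFrom(6);
      assert(V.getZExtValue() == 0xC1);

      // 0xFF is -1 under signed comparison but the largest value unsigned.
      APInt AllOnes(8, 0xFF), One(8, 1);
      assert(AllOnes.uge(One));  // 255 >= 1 unsigned
      assert(!AllOnes.sge(One)); // -1 >= 1 is false signed
      return 0;
    }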
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
An instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
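As a rough illustration of the BinOp enumeration and the atomicrmw accessors above, here is a sketch that creates an atomicrmw fmax through IRBuilder::CreateAtomicRMW and reads the operation and ordering back; the module, function, and helper names are made up for the example.

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"
    #include <cassert>

    using namespace llvm;

    // Hypothetical helper: emit "atomicrmw fmax ptr %P, float %V seq_cst" at the
    // builder's insertion point, then query it through the accessors listed above.
    static AtomicRMWInst *emitAtomicFMax(IRBuilder<> &B, Value *Ptr, Value *Val) {
      AtomicRMWInst *RMW =
          B.CreateAtomicRMW(AtomicRMWInst::FMax, Ptr, Val, MaybeAlign(4),
                            AtomicOrdering::SequentiallyConsistent);
      assert(RMW->getOperation() == AtomicRMWInst::FMax);
      assert(RMW->getOrdering() == AtomicOrdering::SequentiallyConsistent);
      return RMW;
    }

    int main() {
      LLVMContext Ctx;
      Module M("atomicrmw-demo", Ctx);
      Function *F = Function::Create(
          FunctionType::get(Type::getVoidTy(Ctx), /*isVarArg=*/false),
          Function::ExternalLinkage, "demo", M);
      IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));

      Value *Ptr = B.CreateAlloca(B.getFloatTy());
      emitAtomicFMax(B, Ptr, ConstantFP::get(B.getFloatTy(), 1.0));
      B.CreateRetVoid();
      return 0;
    }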
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
bool isFPPredicate() const
bool isIntPredicate() const
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLoweringInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMemoryAtomicFaddF32DenormalSupport() const
bool hasD16Images() const
bool hasMinimum3Maximum3F32() const
bool useVGPRIndexMode() const
bool hasAtomicDsPkAdd16Insts() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool hasAtomicFMinFMaxF64FlatInsts() const
bool hasDot7Insts() const
bool hasApertureRegs() const
bool hasFlatInstOffsets() const
bool hasAtomicFMinFMaxF32FlatInsts() const
bool hasIEEEMinimumMaximumInsts() const
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasRelaxedBufferOOBMode() const
bool hasBCNT(unsigned Size) const
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
bool hasMultiDwordFlatScratchAddressing() const
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
const SIInstrInfo * getInstrInfo() const override
bool hasDot1Insts() const
bool hasAtomicFaddRtnInsts() const
bool hasSafeSmemPrefetch() const
Align getStackAlignment() const
bool hasScalarSubwordLoads() const
bool enableFlatScratch() const
bool hasMin3Max3PKF16() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
bool hasFmaMixBF16Insts() const
bool hasVMemToLDSLoad() const
bool supportsGetDoorbellID() const
bool hasFlatAtomicFaddF32Inst() const
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasBVHDualAndBVH8Insts() const
bool hasMinimum3Maximum3PKF16() const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
bool hasGloballyAddressableScratch() const
bool has64BitLiterals() const
TrapHandlerAbi getTrapHandlerAbi() const
const SIFrameLowering * getFrameLowering() const override
bool hasMinimum3Maximum3F16() const
bool hasAtomicFMinFMaxF32GlobalInsts() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasGFX10_AEncoding() const
unsigned getMaxNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
bool hasEmulatedSystemScopeAtomics() const
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
bool getScalarizeGlobalBehavior() const
bool hasScalarSMulU64() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasIntMinMax64() const
bool hasShaderCyclesHiLoRegisters() const
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
bool hasVmemPrefInsts() const
bool hasAtomicFMinFMaxF64GlobalInsts() const
bool hasUnalignedScratchAccessEnabled() const
bool hasAtomicFlatPkAdd16Insts() const
bool hasUnalignedBufferAccessEnabled() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasImageGather4D16Bug() const
bool hasDot10Insts() const
bool supportsMinMaxDenormModes() const
bool hasAtomicFaddInsts() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
bool hasAtomicBufferPkAddBF16Inst() const
bool hasAtomicFaddNoRtnInsts() const
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDot8Insts() const
bool hasVectorMulU64() const
bool hasDS96AndDS128() const
bool useFlatForGlobal() const
Generation getGeneration() const
bool hasAtomicBufferGlobalPkAddF16Insts() const
bool hasScalarAddSub64() const
bool hasUnpackedD16VMem() const
bool hasAtomicGlobalPkAddBF16Inst() const
bool hasFmaMixInsts() const
bool hasPackedTID() const
bool hasAddNoCarry() const
bool hasGWSAutoReplay() const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
BasicBlock::iterator GetInsertPoint() const
BasicBlock * GetInsertBlock() const
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
LLVMContext & getContext() const
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
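The IRBuilder entries above (SetInsertPoint, CreatePHI, CreateCondBr, CreateBr) combine naturally when stitching control flow together; a sketch that builds a do/while-style countdown loop, with made-up module and function names, assuming an LLVM build environment:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/Verifier.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    int main() {
      LLVMContext Ctx;
      Module M("irbuilder-demo", Ctx);
      IRBuilder<> B(Ctx);

      Function *F = Function::Create(
          FunctionType::get(B.getVoidTy(), {B.getInt32Ty()}, /*isVarArg=*/false),
          Function::ExternalLinkage, "countdown", M);
      BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F);
      BasicBlock *Loop = BasicBlock::Create(Ctx, "loop", F);
      BasicBlock *Exit = BasicBlock::Create(Ctx, "exit", F);

      B.SetInsertPoint(Entry);
      B.CreateBr(Loop);

      // i starts at the argument, decrements once per iteration, and the loop
      // exits when the decremented value reaches zero.
      B.SetInsertPoint(Loop);
      PHINode *IV = B.CreatePHI(B.getInt32Ty(), /*NumReservedValues=*/2, "i");
      IV->addIncoming(F->getArg(0), Entry);
      Value *Next = B.CreateSub(IV, B.getInt32(1), "i.next");
      IV->addIncoming(Next, Loop);
      B.CreateCondBr(B.CreateICmpEQ(Next, B.getInt32(0), "done"), Exit, Loop);

      B.SetInsertPoint(Exit);
      B.CreateRetVoid();

      return verifyFunction(*F, &errs()) ? 1 : 0;
    }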
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
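A quick sketch of the LLT helpers listed above; the header path has moved between LLVM releases (llvm/CodeGenTypes/LowLevelType.h in recent trees), so the include line is an assumption:

    #include "llvm/CodeGenTypes/LowLevelType.h" // path varies across LLVM versions
    #include <cassert>

    using namespace llvm;

    int main() {
      // A 32-bit scalar and a 64-bit pointer into address space 1.
      LLT S32 = LLT::scalar(32);
      LLT P1 = LLT::pointer(1, 64);
      assert(S32.isScalar() && S32.getScalarSizeInBits() == 32);
      assert(P1.getSizeInBits() == 64);

      // changeElementSize keeps the shape but swaps the element width.
      assert(S32.changeElementSize(16).getSizeInBits() == 16);
      return 0;
    }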
This is an important class for using LLVM in a threaded context.
LLVM_ABI std::optional< StringRef > getSyncScopeName(SyncScope::ID Id) const
getSyncScopeName - Returns the name of a SyncScope::ID registered with LLVMContext,...
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
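The MVT helpers above compose the same way; a small sketch (the MachineValueType.h include path also differs across LLVM versions):

    #include "llvm/CodeGenTypes/MachineValueType.h" // path varies across versions
    #include <cassert>

    using namespace llvm;

    int main() {
      // Build v4i32 from its element type and count, then take it apart again.
      MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);
      assert(V4I32.isVector() && V4I32.getVectorNumElements() == 4);
      assert(V4I32.getScalarType() == MVT::i32);
      assert(V4I32.getSizeInBits() == 128);
      assert(V4I32.getStoreSize() == 16); // bytes written by a store of v4i32

      // getIntegerVT picks the integer MVT of a given bit width.
      assert(MVT::getIntegerVT(64) == MVT::i64);
      return 0;
    }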
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
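The MachineMemOperand flags above are or'able bitmask values that feed MachineFunction::getMachineMemOperand; a fragment-style sketch (describeInvariantLoad is a hypothetical helper, and a real caller would pass a meaningful MachinePointerInfo):

    #include "llvm/CodeGen/MachineFunction.h"
    #include "llvm/CodeGen/MachineMemOperand.h"
    #include "llvm/CodeGenTypes/LowLevelType.h"
    #include "llvm/Support/Alignment.h"

    using namespace llvm;

    // Describe a 4-byte, naturally aligned, invariant load from dereferenceable
    // memory by or'ing the MachineMemOperand flags listed above.
    static MachineMemOperand *
    describeInvariantLoad(MachineFunction &MF, const MachinePointerInfo &PtrInfo) {
      auto Flags = MachineMemOperand::MOLoad |
                   MachineMemOperand::MODereferenceable |
                   MachineMemOperand::MOInvariant;
      return MF.getMachineMemOperand(PtrInfo, Flags, LLT::scalar(32), Align(4));
    }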
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
A Module instance is used to store all the information related to an LLVM module.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
bool isInlineConstant(const APInt &Imm) const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined with another operation to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0-terminated array of rounding control registers that can be attached to strict FP calls.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns whether Op is known never to be any NaN; if SNaN is true, whether Op is known never to be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
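A minimal sketch of the usual splitting pattern, assuming a unary opcode Opc that is already legal on the half-width vector type; the helper name is illustrative. The operand is split, the operation is applied to each half, and the halves are rejoined with CONCAT_VECTORS.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue splitUnaryVectorOp(SDValue Op, unsigned Opc, SelectionDAG &DAG) {
  SDLoc DL(Op);
  // Split operand 0 of Op into its low and high subvectors.
  auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
  Lo = DAG.getNode(Opc, DL, Lo.getValueType(), Lo);
  Hi = DAG.getNode(Opc, DL, Hi.getValueType(), Hi);
  // Reassemble the full-width result.
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, Op.getValueType(), Lo, Hi);
}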
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
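A minimal sketch tying getLoad and getStore together; copyI32 is an illustrative name, not an existing helper. A 32-bit value is loaded and stored back out, with the load's output chain threaded into the store so the two accesses stay ordered.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue copyI32(SDValue Chain, SDValue SrcPtr, SDValue DstPtr,
                       const SDLoc &DL, SelectionDAG &DAG) {
  SDValue Val = DAG.getLoad(MVT::i32, DL, Chain, SrcPtr,
                            MachinePointerInfo(), Align(4));
  // Value #1 of a load is its output chain; use it to order the store.
  return DAG.getStore(Val.getValue(1), DL, Val, DstPtr,
                      MachinePointerInfo(), Align(4));
}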
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
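A minimal sketch of building a small expression tree with getNode and the constant helpers above; shlAddOne is an illustrative name. It computes (X << 2) + 1 in X's own integer type.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue shlAddOne(SDValue X, const SDLoc &DL, SelectionDAG &DAG) {
  EVT VT = X.getValueType();
  // getShiftAmountConstant picks the target's preferred shift-amount type.
  SDValue ShAmt = DAG.getShiftAmountConstant(2, VT, DL);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X, ShAmt);
  return DAG.getNode(ISD::ADD, DL, VT, Shl, DAG.getConstant(1, DL, VT));
}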
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
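A minimal sketch of the two queries side by side, assuming V has type i64; fitsInLow32 is an illustrative name. It proves that a 64-bit value has no bits set above bit 31, once via computeKnownBits and once via MaskedValueIsZero.

#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;

static bool fitsInLow32(SDValue V, SelectionDAG &DAG) {
  // Variant 1: inspect the known bits directly.
  KnownBits Known = DAG.computeKnownBits(V);
  if (Known.countMinLeadingZeros() >= 32)
    return true;
  // Variant 2: ask whether the high half is known to be zero.
  return DAG.MaskedValueIsZero(V, APInt::getHighBitsSet(64, 32));
}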
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
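A minimal sketch contrasting the two pointer-arithmetic helpers above; fieldAddr is an illustrative name. Both add a fixed byte offset to a base pointer, but getObjectPtrOffset also attaches the no-wrap flags appropriate for addressing within a single object.

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/TypeSize.h"
using namespace llvm;

static SDValue fieldAddr(SDValue Base, const SDLoc &DL, SelectionDAG &DAG) {
  // Plain pointer + 8 bytes.
  SDValue Raw = DAG.getMemBasePlusOffset(Base, TypeSize::getFixed(8), DL);
  (void)Raw;
  // Same offset, flagged as addressing within one object.
  return DAG.getObjectPtrOffset(DL, Base, TypeSize::getFixed(8));
}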
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.

This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
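A minimal sketch using the containers listed above; uniqueUsers is an illustrative name. It collects the distinct users of a Value in first-seen order, relying on SmallPtrSet::insert to report whether an element was new.

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Value.h"
using namespace llvm;

static SmallVector<User *, 8> uniqueUsers(Value &V) {
  SmallVector<User *, 8> Order; // preserves first-seen order
  SmallPtrSet<User *, 8> Seen;  // rejects duplicates
  for (User *U : V.users())
    if (Seen.insert(U).second) // insert() reports whether U was new
      Order.push_back(U);
  return Order;
}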
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
StringRef - Represent a constant reference to a string, i.e.
constexpr size_t size() const
size - Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
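A minimal sketch of StringSwitch; the strings and enum here are purely illustrative. It maps a feature-like string to an enum value, falling back to a default when nothing matches.

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

enum class WaveSize { Unknown, Wave32, Wave64 };

static WaveSize parseWaveSize(StringRef S) {
  return StringSwitch<WaveSize>(S)
      .Case("wavefrontsize32", WaveSize::Wave32)
      .Case("wavefrontsize64", WaveSize::Wave64)
      .Default(WaveSize::Unknown);
}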
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
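A minimal sketch of how the setup hooks above are typically combined in a target constructor; MyTargetLowering is a hypothetical class, its parameters are illustrative, and the chosen actions do not describe any real target's configuration.

#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

// Hypothetical target lowering class, for illustration only.
class MyTargetLowering : public TargetLowering {
public:
  MyTargetLowering(const TargetMachine &TM, const TargetRegisterInfo *TRI,
                   const TargetRegisterClass *GPR32RC)
      : TargetLowering(TM) {
    // i32 values live in the (hypothetical) 32-bit GPR class.
    addRegisterClass(MVT::i32, GPR32RC);

    // No native i64 multiply: let legalization expand it.
    setOperationAction(ISD::MUL, MVT::i64, Expand);
    // f32 division gets target-specific code in LowerOperation().
    setOperationAction(ISD::FDIV, MVT::f32, Custom);
    // Truncating f64 -> f32 stores are not supported.
    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
    // v2i16 AND is performed as an i32 AND.
    setOperationAction(ISD::AND, MVT::v2i16, Promote);
    AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);

    // Derive register properties once all classes are registered.
    computeRegisterProperties(TRI);
  }
};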
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
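A minimal sketch of a common custom-lowering pattern built from the two hooks above; lowerStoreIfUnaligned is an illustrative name. If the target reports that the store's actual alignment is not supported, fall back to the generic unaligned expansion; otherwise return an empty SDValue so the original store is kept.

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

static SDValue lowerStoreIfUnaligned(SDValue Op, SelectionDAG &DAG) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
  unsigned IsFast = 0;
  if (!TLI.allowsMemoryAccessForAlignment(
          *DAG.getContext(), DAG.getDataLayout(), ST->getMemoryVT(),
          ST->getAddressSpace(), ST->getAlign(),
          ST->getMemOperand()->getFlags(), &IsFast))
    return TLI.expandUnalignedStore(ST, DAG);
  return SDValue(); // Alignment is acceptable: keep the original store.
}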
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op, knowing that only the DemandedBits bits of its result are used downstream, and try to simplify it accordingly.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the command line.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isIntegerTy() const
True if this is an instance of IntegerType.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr bool isZero() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same as [SU]ADDO above, but for subtraction.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same as [SU]ADDO above, but for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
@ System
Synchronized with respect to all concurrently executing threads.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int popcount(T Value) noexcept
Count the number of set bits in a value.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Mul
Product of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
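A few concrete evaluations of the arithmetic helpers listed above, as a sanity check; the wrapper function itself is illustrative.

#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;

static void mathHelperExamples() {
  assert(isPowerOf2_32(64));           // 64 == 2^6
  assert(Log2_32(64) == 6);            // floor(log2(64))
  assert(divideCeil(10, 4) == 3);      // ceil(10 / 4)
  assert(PowerOf2Ceil(100) == 128);    // next power of two >= 100
  assert(alignTo(10, Align(8)) == 16); // round 10 up to a multiple of 8
  assert(Hi_32(0x1234567800000000ULL) == 0x12345678u);
  assert(Lo_32(0x00000000DEADBEEFULL) == 0xDEADBEEFu);
}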
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
constexpr unsigned BitWidth
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
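A minimal sketch exercising the EVT queries above on a concrete vector type; purely illustrative.

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>
using namespace llvm;

static void evtExample(LLVMContext &Ctx) {
  EVT V4F32 = EVT::getVectorVT(Ctx, MVT::f32, 4);
  assert(V4F32.isVector() && V4F32.isFloatingPoint());
  assert(V4F32.getVectorNumElements() == 4);
  assert(V4F32.getSizeInBits() == 128);
  assert(V4F32.getScalarType() == MVT::f32);
  // The equivalently sized integer vector: v4i32.
  EVT V4I32 = V4F32.changeTypeToInteger();
  assert(V4I32.getVectorElementType() == MVT::i32);
}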
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
void resetAll()
Resets the known state of all bits.
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
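A minimal sketch of propagating KnownBits through an addition; the values are chosen for illustration. An even constant plus a value whose low bit is known clear yields a sum whose low bit is also known clear.

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>
using namespace llvm;

static KnownBits knownBitsAddExample() {
  KnownBits LHS = KnownBits::makeConstant(APInt(32, 8)); // exactly 8
  KnownBits RHS(32);
  RHS.Zero.setBit(0); // low bit known to be 0
  KnownBits Sum = KnownBits::add(LHS, RHS);
  assert(Sum.Zero[0] && "low bit of the sum is known zero");
  return Sum;
}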
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const