#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
  const LLT Ty = Query.Types[TypeIdx];

  EltSize > 1 && EltSize < 32 &&

  const LLT Ty = Query.Types[TypeIdx];

  const LLT Ty = Query.Types[TypeIdx];

  const LLT Ty = Query.Types[TypeIdx];
  return std::pair(TypeIdx,

  const LLT Ty = Query.Types[TypeIdx];
  unsigned Pieces = (Size + 63) / 64;

  const LLT Ty = Query.Types[TypeIdx];
  const int NextMul32 = (Size + 31) / 32;
  const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;

  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  return std::make_pair(TypeIdx, LLT::scalar(MemSize));

  const LLT Ty = Query.Types[TypeIdx];
  assert(EltSize == 32 || EltSize == 64);
  for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
  return std::pair(TypeIdx,

  const LLT Ty = Query.Types[TypeIdx];

  const LLT Ty = Query.Types[TypeIdx];

  const LLT QueryTy = Query.Types[TypeIdx];

  const LLT QueryTy = Query.Types[TypeIdx];

  const LLT QueryTy = Query.Types[TypeIdx];

  return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&

  return EltSize == 16 || EltSize % 32 == 0;

  return EltSize == 32 || EltSize == 64 ||
         EltSize == 128 || EltSize == 256;

  LLT Ty = Query.Types[TypeIdx];

  const LLT QueryTy = Query.Types[TypeIdx];

  (ST.useRealTrue16Insts() && Ty == S16) ||

  const LLT Ty = Query.Types[TypeIdx];
  Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
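// The helper fragments above compute the legalizer's type mutations: they read
// Query.Types[TypeIdx] and the memory type in MMODescrs[0], round scalar sizes
// up to multiples of 32 or 64 bits, grow vectors to the next 32-bit boundary,
// and match a loaded value's register type to the access's memory size.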
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

                                    bool IsLoad, bool IsAtomic) {
    return ST.enableFlatScratch() ? 128 : 32;
    return ST.useDS128() ? 128 : 64;
    return IsLoad ? 512 : 128;
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;

  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
  unsigned AS = Query.Types[1].getAddressSpace();

  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);

                        AtomicOrdering::NotAtomic))

  if (!ST.hasDwordx3LoadStores())

  if (AlignBits < MemSize) {
                                                 Align(AlignBits / 8)))
  return EltSize != 32 && EltSize != 64;

  if (Size != MemSizeInBits)

                               uint64_t AlignInBits, unsigned AddrSpace,

  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())

  if (AlignInBits < RoundedSize)

          RoundedSize, AddrSpace, Align(AlignInBits / 8),

  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)

                          Query.Types[1].getAddressSpace(), Opcode);
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
    std::array<Register, 4> VectorElems;
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    B.buildMergeValues(MO, VectorElems);

  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);

    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
    for (unsigned I = 0; I < NumParts; ++I)

    return B.buildBuildVector(VectorTy, PointerParts).getReg(0);

  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
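// The lambdas above repack pointer values for register-bank purposes: a
// pointer whose size is a multiple of 32 bits is rebuilt from (or decomposed
// into) 32-bit vector elements, and otherwise it is moved through a same-sized
// scalar integer via bitcast plus inttoptr/ptrtoint.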
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {

  const LLT BufferStridedPtr =

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr

  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

  const std::initializer_list<LLT> FPTypesBase = {

  const std::initializer_list<LLT> FPTypes16 = {

  const std::initializer_list<LLT> FPTypesPK16 = {
      .clampMaxNumElementsStrict(0, S16, 2)

      .clampMaxNumElementsStrict(0, S16, 2)

      .clampMaxNumElementsStrict(0, S16, 2)

      .clampMaxNumElementsStrict(0, S16, 2)

      .minScalarOrElt(0, S16)

      .widenScalarToNextMultipleOf(0, 32)

      .widenScalarToNextMultipleOf(0, 32)

      .widenScalarToNextMultipleOf(0, 32);

      .minScalarOrElt(0, S32)

      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})

      .clampMaxNumElements(0, S8, 2)

      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})

      LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })

      .clampScalar(0, S16, S64);

      {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})

      {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE,

      .legalFor(FPTypesPK16)

      .clampScalar(0, S16, S64);

      .clampScalar(1, S32, S32)

      .clampScalar(0, S32, S64);

      .clampScalar(0, S32, S64);

      .clampScalar(0, S32, S64)
      .clampScalar(1, S32, S32)

      .clampScalar(1, S32, S32)

      .clampMaxNumElements(0, S16, 2);

  FPTruncActions.scalarize(0).lower();

    FMad.customFor({S32, S16});
    FMad.customFor({S32});
    FMad.customFor({S16});

  FRem.minScalar(0, S32)

      .clampMaxNumElements(0, S16, 2)

      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(1, 32);
  getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
      .clampScalar(0, S16, S64)

  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)

  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})

  getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
      .clampScalar(0, S16, S64)

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S16, S64)

    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)

    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder(G_PTR_ADD)
      .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
      .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
      .scalarSameSizeAs(1, 0)
  getActionDefinitionsBuilder(G_ICMP)
      .legalForCartesianProduct(
          {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .legalForCartesianProduct(
          {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});

      .widenScalarToNextPow2(1)
      .clampScalar(1, S32, S64)

  getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
      {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);

  if (ST.hasSALUFloatInsts())
    FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});

      .widenScalarToNextPow2(1)
      .clampScalar(1, S32, S64)
  auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)

  getActionDefinitionsBuilder(G_FPOWI)
      .clampScalar(0, MinScalarFPTy, S32)

  auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
  Log2Ops.customFor({S32});
  if (ST.has16BitInsts())
    Log2Ops.legalFor({S16});
  else
    Log2Ops.customFor({S16});
  Log2Ops.scalarize(0)

  getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
  LogOps.customFor({S32, S16});
  LogOps.clampScalar(0, MinScalarFPTy, S32)
  getActionDefinitionsBuilder(G_CTPOP)
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(1, 32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32);

  if (ST.has16BitInsts())
    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypes16)
        .widenScalarToNextPow2(1)

    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypesBase)
        .lowerFor({S1, S16})
        .widenScalarToNextPow2(1)
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32)

  getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32);
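// The bit-count operations above always produce a 32-bit result; the source
// operand is clamped to at most 64 bits and widened to the next power of two
// so it maps onto the 32/64-bit hardware count instructions.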
  getActionDefinitionsBuilder(G_BITREVERSE)
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
        .clampMaxNumElementsStrict(0, S16, 2)
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder(G_ABS)
          .clampMaxNumElements(0, S16, 2)
          .widenScalarToNextPow2(0)

      if (ST.hasIntMinMax64()) {
        getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
            .clampMaxNumElements(0, S16, 2)
            .widenScalarToNextPow2(0)

        getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
            .clampMaxNumElements(0, S16, 2)
            .widenScalarToNextPow2(0)

      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
          .widenScalarToNextPow2(0)

    getActionDefinitionsBuilder(G_BSWAP)
        .widenScalarToNextPow2(0)

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .widenScalarToNextPow2(0)
  getActionDefinitionsBuilder(G_INTTOPTR)
      .legalForCartesianProduct(AddrSpaces64, {S64})
      .legalForCartesianProduct(AddrSpaces32, {S32})

  getActionDefinitionsBuilder(G_PTRTOINT)
      .legalForCartesianProduct(AddrSpaces64, {S64})
      .legalForCartesianProduct(AddrSpaces32, {S32})

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    unsigned NumRegs = (MemSize + 31) / 32;

    if (!ST.hasDwordx3LoadStores())

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},
                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});

    Actions.unsupportedIf(
        typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));

    Actions.customIf(typeIs(1, Constant32Ptr));

          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (DstSize > MemSize)
          if (MemSize > MaxSize)

          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (MemSize > MaxSize) {
            if (MaxSize % EltSize == 0) {
            unsigned NumPieces = MemSize / MaxSize;
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::pair(0, EltTy);
            return std::pair(0, EltTy);
          return std::pair(0, EltTy);
        .widenScalarToNextPow2(0)
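// Loads and stores wider than the per-address-space limit are split by the two
// mutation lambdas above: scalar accesses are narrowed to the access's memory
// size, while vector accesses are broken into a legal number of pieces or fall
// back to their element type.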
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});

  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
  auto &Atomics = getActionDefinitionsBuilder(
      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
       G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
       G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
       G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
                 {S64, GlobalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAddF32()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasLdsAtomicAddF64())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasAtomicDsPkAdd16Insts())
      Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});

  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});

  if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {

  if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
      ST.hasAtomicBufferGlobalPkAddF16Insts())
    Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
  if (ST.hasAtomicGlobalPkAddBF16Inst())
    Atomic.legalFor({{V2BF16, GlobalPtr}});
  if (ST.hasAtomicFlatPkAdd16Insts())
    Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
  auto &AtomicFMinFMax =
      getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
          .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});

  if (ST.hasAtomicFMinFMaxF32GlobalInsts())
    AtomicFMinFMax.legalFor({{F32, GlobalPtr}, {F32, BufferFatPtr}});
  if (ST.hasAtomicFMinFMaxF64GlobalInsts())
    AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
  if (ST.hasAtomicFMinFMaxF32FlatInsts())
    AtomicFMinFMax.legalFor({F32, FlatPtr});
  if (ST.hasAtomicFMinFMaxF64FlatInsts())
    AtomicFMinFMax.legalFor({F64, FlatPtr});
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
      .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                  {S32, FlatPtr}, {S64, FlatPtr}})
      .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
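// Only the LDS and region forms of G_ATOMIC_CMPXCHG are directly selectable;
// the global and flat forms are custom-lowered (see the cmpxchg lowering
// further below, which packs the new and compare values into a vector operand
// of G_AMDGPU_ATOMIC_CMPXCHG).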
  getActionDefinitionsBuilder(G_SELECT)
                   LocalPtr, FlatPtr, PrivatePtr,
      .clampScalar(0, S16, S64)
      .clampMaxNumElements(0, S32, 2)
      .clampMaxNumElements(0, LocalPtr, 2)
      .clampMaxNumElements(0, PrivatePtr, 2)
      .widenScalarToNextPow2(0)

  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
          .clampMaxNumElements(0, S16, 2);
      Shifts.legalFor({{S16, S16}});

    Shifts.widenScalarIf(
          const LLT AmountTy = Query.Types[1];
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})

    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})

  Shifts.scalarize(0);
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const bool isLegalVecType =
          return (EltSize == 32 || EltSize == 64) &&

          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned TargetEltSize =
              DstEltSize % 64 == 0 ? 64 : 32;
          return std::pair(VecTypeIdx,
        .clampScalar(EltTypeIdx, S32, S64)
        .clampScalar(VecTypeIdx, S32, S64)
        .clampScalar(IdxTypeIdx, S32, S32)
        .clampMaxNumElements(VecTypeIdx, S32, 32)

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    getActionDefinitionsBuilder(Op)
          const LLT BigTy = Query.Types[BigTyIdx];

          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];

          const LLT BigTy = Query.Types[BigTyIdx];

          const LLT LitTy = Query.Types[LitTyIdx];
        .widenScalarToNextPow2(BigTyIdx, 32);
  getActionDefinitionsBuilder(G_BUILD_VECTOR)

  if (ST.hasScalarPackInsts()) {
        .minScalarOrElt(0, S16)

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)

    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
      .clampMaxNumElements(0, S32, 32)
      .clampMaxNumElements(1, S16, 2)
      .clampMaxNumElements(0, S16, 64);

  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];

    getActionDefinitionsBuilder(Op)
          const LLT BigTy = Query.Types[BigTyIdx];
        .widenScalarToNextPow2(LitTyIdx, 16)
        .clampScalar(LitTyIdx, S32, S512)
        .widenScalarToNextPow2(LitTyIdx, 32)
          return notValidElt(Query, LitTyIdx);
          return notValidElt(Query, BigTyIdx);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
            const LLT Ty = Query.Types[LitTyIdx];

    Builder.widenScalarIf(
          const LLT Ty = Query.Types[BigTyIdx];

          const LLT &Ty = Query.Types[BigTyIdx];
          if (NewSizeInBits >= 256) {
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
                        .legalFor({{S32}, {S64}})
                        .clampScalar(0, S32, S64);

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
        .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
    SextInReg.lowerFor({{S32}, {S64}});

      .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})

  getActionDefinitionsBuilder(G_FSHR)
      .clampMaxNumElementsStrict(0, S16, 2)

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
        .clampMaxNumElementsStrict(0, S16, 2)

    getActionDefinitionsBuilder(G_FSHL)

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)

  getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
  getActionDefinitionsBuilder(G_FENCE)

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
      .clampScalar(1, S32, S32)
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder(
       G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
       G_READ_REGISTER, G_WRITE_REGISTER,

  if (ST.hasIEEEMinimumMaximumInsts()) {
    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
        .legalFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)

    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})

  getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})

  getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();

  getActionDefinitionsBuilder(
      {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
       G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
       G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
       G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})

  getLegacyLegalizerInfo().computeTables();
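// computeTables() finalizes the legality tables for anything still registered
// through the legacy LegalizerInfo interface; it runs once all of the rules
// above have been added.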
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINIMUMNUM:
  case TargetOpcode::G_FMAXIMUMNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_GLOBAL_VALUE:
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_STORE:
  case TargetOpcode::G_FMAD:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FFREXP:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_STACKSAVE:
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_TRAP:
  case TargetOpcode::G_DEBUGTRAP:
  if (ST.hasApertureRegs()) {
            ? AMDGPU::SRC_SHARED_BASE
            : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !ST.hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");
    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);

  Register LoadAddr = MRI.createGenericVirtualRegister(

        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
    Register KernargPtrReg = MRI.createGenericVirtualRegister(

    B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  Register QueuePtr = MRI.createGenericVirtualRegister(

  B.buildObjectPtrOffset(
      B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
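// The aperture (the high 32 bits of a flat address into the LDS or scratch
// segment) is read directly from SRC_SHARED_BASE/SRC_PRIVATE_BASE when
// aperture registers exist; otherwise it is loaded from the implicit kernel
// argument block or from a fixed offset in the queue pointer.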
  switch (Def->getOpcode()) {
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
  case AMDGPU::G_CONSTANT: {
    const ConstantInt *CI = Def->getOperand(1).getCImm();
    return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
  assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
         (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
                                     Intrinsic::amdgcn_addrspacecast_nonnull));

  Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
                                     : MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));

    auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
        Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
            B.buildInstr(AMDGPU::S_MOV_B32, {S32},
                         {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
        MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
        return B.buildIntToPtr(Dst, Sub).getReg(0);

      return B.buildExtract(Dst, Src, 0).getReg(0);

      castFlatToLocalOrPrivate(Dst);
      MI.eraseFromParent();

    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);

    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();

    auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
      Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
          ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
          ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
          Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
              B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
              B.buildInstr(AMDGPU::S_MOV_B64, {S64},
                           {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
          MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
          return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);

      return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);

      castLocalOrPrivateToFlat(Dst);
      MI.eraseFromParent();

    Register BuildPtr = castLocalOrPrivateToFlat(DstTy);

    auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
    auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
                              SegmentNull.getReg(0));

    B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

    MI.eraseFromParent();

    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();

    auto PtrLo = B.buildPtrToInt(S32, Src);
    auto HighAddr = B.buildConstant(S32, AddrHiVal);
    B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
    MI.eraseFromParent();

  MI.eraseFromParent();
  LLT Ty = MRI.getType(Src);

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);

  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
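// f64 ceil is built from the trunc lowering: compute trunc(x) and add a
// correction term selected between 0.0 and 1.0 depending on the compares
// combined above.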
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  const unsigned FractBits = 52;

  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);

    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();

  auto One = B.buildConstant(S32, 1);

    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
                  .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);

    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));

  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildFLdexp(Dst, FVal, Scale);
  MI.eraseFromParent();
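// 64-bit integer to FP conversion: for an f64 result the two 32-bit halves are
// converted separately and recombined with ldexp(hi, 32) + lo; for the f32
// result path the source is first normalized with a shift and the exponent is
// restored with a final ldexp.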
  const LLT SrcLT = MRI.getType(Src);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);

    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);

    K0 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(0x3df0000000000000)));
    K1 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(0xc1f0000000000000)));

    K0 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(0x2f800000)));
    K1 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(0xcf800000)));

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

    Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});

    B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),

    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
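// FP to 64-bit integer: the magnitude is split using K0/K1 (bit patterns for
// 2^-32 and -2^32), giving a high part from floor(x * 2^-32) and a low part
// from fma(floor, -2^32, x); the two fptoui results are merged, with a
// sign-correcting xor/sub for the signed case.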
  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM ||
      MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM)
  LLT VecTy = MRI.getType(Vec);

    auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
    auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
    B.buildIntToPtr(Dst, IntElt);

    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));

  MI.eraseFromParent();
  LLT VecTy = MRI.getType(Vec);

    auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
    auto IntIns = B.buildPtrToInt(IntTy, Ins);
    auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
    B.buildIntToPtr(Dst, IntVecDest);
    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  if (IdxVal < NumElts) {
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
    B.buildUnmerge(SrcRegs, Vec);

    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMergeLikeInstr(Dst, SrcRegs);

  MI.eraseFromParent();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
                  .addUse(MulVal.getReg(0))

    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

      Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;

  MI.eraseFromParent();
                                                       unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");

      B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

    B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);

    B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);

  if (!B.getMRI()->getRegClassOrNull(PCReg))
    B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

    B.buildExtract(DstReg, PCReg, 0);
    if (!MRI.getRegClassOrNull(DstReg))
      MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)

  Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
                        : MRI.createGenericVirtualRegister(S32);

  if (!MRI.getRegClassOrNull(AddrLo))
    MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)

  if (RequiresHighHalf) {
           "Must provide a 64-bit pointer type!");

    MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_MOV_B32)

    if (!MRI.getRegClassOrNull(AddrDst))
      MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);

    B.buildMergeValues(AddrDst, {AddrLo, AddrHi});

    if (AddrDst != DstReg)
      B.buildCast(DstReg, AddrDst);
  } else if (AddrLo != DstReg) {
    B.buildCast(DstReg, AddrLo);
  LLT Ty = MRI.getType(DstReg);

      GV->getName() != "llvm.amdgcn.module.lds" &&
          Fn, "local memory global used by non-kernel function",

      B.buildUndef(DstReg);
      MI.eraseFromParent();

    if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {

      auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
      B.buildIntToPtr(DstReg, Sz);
      MI.eraseFromParent();

        *cast<GlobalVariable>(GV)));
    MI.eraseFromParent();

    MI.eraseFromParent();

    MI.eraseFromParent();

    MI.eraseFromParent();

  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);

    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  LLT PtrTy = MRI.getType(PtrReg);

    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    MI.getOperand(1).setReg(Cast.getReg(0));

  if (MI.getOpcode() != AMDGPU::G_LOAD)

  LLT ValTy = MRI.getType(ValReg);

  if (WideMemSize == ValSize) {
    MI.setMemRefs(MF, {WideMMO});

  if (ValSize > WideMemSize)

    WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
    B.buildTrunc(ValReg, WideLoad).getReg(0);

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildExtract(ValReg, WideLoad, 0);

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildDeleteTrailingVectorElements(ValReg, WideLoad);

  MI.eraseFromParent();
  Register DataReg = MI.getOperand(0).getReg();
  LLT DataTy = MRI.getType(DataReg);

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);

  Register PackedVal = B.buildBuildVector(VecTy, {NewVal, CmpVal}).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
      .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  case TargetOpcode::G_INTRINSIC: {
    case Intrinsic::amdgcn_frexp_mant:
  case TargetOpcode::G_FFREXP: {
  case TargetOpcode::G_FPEXT: {
std::pair<Register, Register>
                                       unsigned Flags) const {
  auto SmallestNormal = B.buildFConstant(
  auto IsLtSmallestNormal =

  auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
  auto One = B.buildFConstant(F32, 1.0);
      B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
  auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);

  return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();

    B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
    MI.eraseFromParent();

  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                  .addUse(ScaledInput)

  auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
  auto Zero = B.buildFConstant(Ty, 0.0);
      B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
  B.buildFSub(Dst, Log2, ResultOffset, Flags);

  MI.eraseFromParent();
  auto FMul = B.buildFMul(Ty, X, Y, Flags);
  return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

  unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(X);

    auto PromoteSrc = B.buildFPExt(F32, X);
      B.buildFPTrunc(Dst, LogVal);

    MI.eraseFromParent();

    B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);

    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
    auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);

    R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
    auto NegR = B.buildFNeg(Ty, R, Flags);
    auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
    auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
    R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);

    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
    auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto YH = B.buildAnd(Ty, Y, MaskConst);
    auto YT = B.buildFSub(Ty, Y, YH, Flags);
    auto YTCT = B.buildFMul(Ty, YT, CT, Flags);

        getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
    R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);

  const bool IsFiniteOnly =

  if (!IsFiniteOnly) {
    auto Fabs = B.buildFAbs(Ty, Y);
    R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);

    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
    auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
    B.buildFSub(Dst, R, Shift, Flags);

    B.buildCopy(Dst, R);

  MI.eraseFromParent();
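// G_FLOG/G_FLOG10 are lowered through the hardware log2: the result is
// multiplied by log(2) or log10(2) using either an extended-precision C/CC
// sequence or an fma-based split of the constant into high and tail parts,
// with selects to pass through non-finite inputs and to undo the denormal
// input scaling.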
                                         unsigned Flags) const {
  const double Log2BaseInverted =

  LLT Ty = B.getMRI()->getType(Dst);

    auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
    auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
    auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);

      B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);

      auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
      B.buildFAdd(Dst, Mul, ResultOffset, Flags);

          ? B.buildFLog2(Ty, Src, Flags)
          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})

  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();
    MI.eraseFromParent();

  auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
                               RangeCheckConst, Flags);

  auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
  auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(AddInput.getReg(0))

  auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
  auto One = B.buildFConstant(Ty, 1.0);
  auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
  B.buildFMul(Dst, Exp2, ResultScale, Flags);
  MI.eraseFromParent();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
          .addUse(Mul.getReg(0))

      B.buildFExp2(Dst, Mul.getReg(0), Flags);

  auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
  auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
  auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);

  auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(ExpInput.getReg(0))

  auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
  auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
  B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
  const unsigned Flags = MI.getFlags();
  LLT Ty = MRI.getType(Dst);

  const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;

    MI.eraseFromParent();

    auto Ext = B.buildFPExt(F32, X, Flags);
    B.buildFPTrunc(Dst, Lowered, Flags);
    MI.eraseFromParent();

    MI.eraseFromParent();

  const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;

    const float cc_exp = 0x1.4ae0bep-26f;
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
    PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
    auto NegPH = B.buildFNeg(Ty, PH, Flags);
    auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);

    auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
    PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);

    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f;

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto XH = B.buildAnd(Ty, X, MaskConst);
    auto XL = B.buildFSub(Ty, X, XH, Flags);

    auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
    PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);

    auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
    auto XLCL = B.buildFMul(Ty, XL, CL, Flags);

        getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
    PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);

  auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);

  auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
  auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(A.getReg(0))
  auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);

  auto UnderflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);

  R = B.buildSelect(Ty, Underflow, Zero, R);

  auto OverflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);

  R = B.buildSelect(Ty, Overflow, Inf, R, Flags);

  B.buildCopy(Dst, R);
  MI.eraseFromParent();
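// G_FEXP/G_FEXP10 lowering: x*log2(e) (or x*log2(10)) is computed in split
// precision as PH + PL, exponentiated with the hardware exp2, rescaled with
// ldexp, and finally clamped to zero on underflow and to infinity on overflow
// using the check constants above.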
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Log = B.buildFLog2(F32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Log.getReg(0))
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == F16) {
    auto Log = B.buildFLog2(F16, Src0, Flags);
    auto Ext0 = B.buildFPExt(F32, Log, Flags);
    auto Ext1 = B.buildFPExt(F32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Ext0.getReg(0))
                   .addUse(Ext1.getReg(0))
    B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);

  MI.eraseFromParent();
    ModSrc = SrcFNeg->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();

  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
         "this should not have been custom lowered");

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})

      B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));

    B.buildFMinNumIEEE(Min, Fract, Const, Flags);

    B.buildFMinNum(Min, Fract, Const, Flags);

    CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);

  auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
    Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
    Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);

  auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
                                        bool UsePartialMad64_32,
                                        bool SeparateOddAlignedProducts) const {

  auto getZero32 = [&]() -> Register {
      Zero32 = B.buildConstant(S32, 0).getReg(0);
  auto getZero64 = [&]() -> Register {
      Zero64 = B.buildConstant(S64, 0).getReg(0);

  for (unsigned i = 0; i < Src0.size(); ++i) {

        if (CarryIn.empty())

        bool HaveCarryOut = true;
        if (CarryIn.size() == 1) {
            LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);

            CarryAccum = getZero32();
            CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
          for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
                B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])

            LocalAccum = getZero32();
            HaveCarryOut = false;

            B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
        LocalAccum = Add.getReg(0);

  auto buildMadChain =
        assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
               (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

        if (LocalAccum.size() == 1 &&
            (!UsePartialMad64_32 || !CarryIn.empty())) {
            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
            auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
              LocalAccum[0] = Mul.getReg(0);
              if (CarryIn.empty()) {
                LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
                    B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
          } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));

        if (j0 <= DstIndex) {
          bool HaveSmallAccum = false;
          if (LocalAccum[0]) {
            if (LocalAccum.size() == 1) {
              Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            } else if (LocalAccum[1]) {
              Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
              HaveSmallAccum = false;
              Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            assert(LocalAccum.size() == 1 || !LocalAccum[1]);
            HaveSmallAccum = true;

            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
            auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                                    {Src0[j0], Src1[j1], Tmp});
            Tmp = Mad.getReg(0);
            if (!HaveSmallAccum)
              CarryOut.push_back(Mad.getReg(1));
            HaveSmallAccum = false;
          } while (j0 <= DstIndex);

          auto Unmerge = B.buildUnmerge(S32, Tmp);
          LocalAccum[0] = Unmerge.getReg(0);
          if (LocalAccum.size() > 1)
            LocalAccum[1] = Unmerge.getReg(1);

  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
    Carry OddCarryIn = std::move(OddCarry);
    Carry EvenCarryIn = std::move(EvenCarry);

    if (2 * i < Accum.size()) {
      auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
      EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);

      if (!SeparateOddAlignedProducts) {
        auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
        bool IsHighest = 2 * i >= Accum.size();
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

          Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
          Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
          Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
        Accum[2 * i - 1] = Lo->getOperand(0).getReg();

          auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                                 Lo->getOperand(1).getReg());
          Accum[2 * i] = Hi.getReg(0);
          SeparateOddCarry = Hi.getReg(1);

      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
        EvenCarryIn.push_back(CarryOut);

      if (2 * i < Accum.size()) {
        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
          OddCarry.push_back(CarryOut);
  assert(MI.getOpcode() == TargetOpcode::G_MUL);

  LLT Ty = MRI.getType(DstReg);

  unsigned NumParts = Size / 32;

  for (unsigned i = 0; i < NumParts; ++i) {

  B.buildUnmerge(Src0Parts, Src0);
  B.buildUnmerge(Src1Parts, Src1);
  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
                SeparateOddAlignedProducts);

  B.buildMergeLikeInstr(DstReg, AccumRegs);
  MI.eraseFromParent();
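// Wide G_MUL is decomposed into 32-bit parts: the operands are unmerged,
// buildMultiply accumulates the partial products with 32-bit mul/add/uadde
// chains or G_AMDGPU_MAD_U64_U32, and the 32-bit results are merged back into
// the destination register.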
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
  MI.eraseFromParent();

  LLT SrcTy = MRI.getType(Src);

  auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
  auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
  auto Shift = B.buildShl(S32, Extend, ShiftAmt);
  auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
  B.buildTrunc(Dst, Ctlz);
  MI.eraseFromParent();
  if (MI.getOpcode() != TargetOpcode::G_XOR)

  return ConstVal == -1;

  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))

    if (!MRI.hasOneNonDBGUse(NegatedCond))

    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);

  if (Next == Parent->end()) {
    UncondBrTarget = &*NextMBB;

    if (Next->getOpcode() != AMDGPU::G_BR)
                                      *ArgRC, B.getDebugLoc(), ArgTy);

    const unsigned Mask = Arg->getMask();
    const unsigned Shift = llvm::countr_zero<unsigned>(Mask);

      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));

    B.buildCopy(DstReg, LiveIn);
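// Masked preloaded arguments are materialized by copying the live-in register
// and extracting the field with a right shift by the mask's trailing zero
// count followed by an AND with the shifted mask; unmasked arguments are a
// plain copy.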
    Arg = &WorkGroupIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;

    Arg = &WorkGroupIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;

    Arg = &WorkGroupIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;

    B.buildConstant(DstReg, 0);

    B.buildUndef(DstReg);

  if (!Arg->isRegister() || !Arg->getRegister().isValid())

  MI.eraseFromParent();

  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();

    B.buildUndef(DstReg);
    MI.eraseFromParent();

  if (Arg->isMasked()) {

  MI.eraseFromParent();
  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);

  return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);

                                     Align Alignment) const {
         "unexpected kernarg parameter type");

  B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
  MI.eraseFromParent();
  LLT DstTy = MRI.getType(Dst);

  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  auto One = B.buildConstant(S32, 1);
  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
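// 32-bit unsigned div/rem via reciprocal: 1/Y is estimated in float
// (RCP_IFLAG scaled by 0x4f7ffffe), refined with one Newton-Raphson-style step
// using NegY and umulh, and the quotient/remainder are then corrected with
// compare+select (add 1 to Q, subtract Y from R).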
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(
      B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 = B.buildFMul(
      S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));

  auto Mul2 = B.buildFMul(
      S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  auto Mad2 = B.buildFMAD(
      S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
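// Reciprocal estimate for the 64-bit division path: the value is converted to
// float in two 32-bit halves, combined using the 2^32 scale constant, inverted
// with RCP_IFLAG, and split back into low/high 32-bit words using the ~2^64,
// 2^-32 and -2^32 bit-pattern constants.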
  auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
  auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  auto C1 = B.buildSExt(S32, CmpHi);

  auto C2 = B.buildSExt(S32, CmpLo);

  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C6 = B.buildSelect(

  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});

  auto Sel1 = B.buildSelect(

  auto Sel2 = B.buildSelect(
4836 switch (
MI.getOpcode()) {
4839 case AMDGPU::G_UDIV: {
4840 DstDivReg =
MI.getOperand(0).getReg();
4843 case AMDGPU::G_UREM: {
4844 DstRemReg =
MI.getOperand(0).getReg();
4847 case AMDGPU::G_UDIVREM: {
4848 DstDivReg =
MI.getOperand(0).getReg();
4849 DstRemReg =
MI.getOperand(1).getReg();
4856 const unsigned FirstSrcOpIdx =
MI.getNumExplicitDefs();
4857 Register Num =
MI.getOperand(FirstSrcOpIdx).getReg();
4858 Register Den =
MI.getOperand(FirstSrcOpIdx + 1).getReg();
4859 LLT Ty =
MRI.getType(
MI.getOperand(0).getReg());
4868 MI.eraseFromParent();
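// Signed division/remainder is lowered onto the unsigned expansion: take the
// absolute value of both operands (add the sign bit broadcast and xor with
// it), divide unsigned into temporary registers, then restore the sign of the
// quotient (sign of LHS ^ sign of RHS) and of the remainder (sign of LHS)
// with a final xor/sub pair.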
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();

  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {

  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);

  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);

  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);

    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);

    auto Sign = LHSign.getReg(0);
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);

  MI.eraseFromParent();
  LLT ResTy = MRI.getType(Res);

  if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))

    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)

      MI.eraseFromParent();

    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
          .addUse(FNeg.getReg(0))

      MI.eraseFromParent();

  if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})

  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();
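// Fast, inaccurate f64 division: take the hardware reciprocal of Y, apply two
// FMA-based Newton-Raphson refinements of R ~= 1/Y, multiply by X, and do one
// final residual correction. This path is only used when an inaccurate rcp is
// allowed by the instruction flags.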
  LLT ResTy = MRI.getType(Res);

  if (!AllowInaccurateRcp)

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})

  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
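// f16 division is carried out in f32: extend both operands, take rcp of the
// denominator, run a few FMAD/FMA correction steps on the quotient (the two
// variants below presumably correspond to subtargets with and without f32
// MAD/MAC instructions), mask the error term down to its sign/exponent bits,
// and truncate back to f16 through the div_fixup intrinsic.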
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);
  auto NegRHSExt = B.buildFNeg(S32, RHSExt);
  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(RHSExt.getReg(0))

  auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);

    Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
    Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
    Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);

    Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
    Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
    Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);

  auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
  Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
  Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
  auto RDst = B.buildFPTrunc(S16, Quot, Flags);
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RDst.getReg(0))

  MI.eraseFromParent();
  unsigned SPDenormMode =

  if (ST.hasDenormModeInst()) {

    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
        .addImm(NewDenormModeValue);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
        .addImm(SPDenormMode)
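// Full-precision f32 division: scale numerator and denominator with
// div_scale, refine the reciprocal of the scaled denominator with a chain of
// FMAs (Fma0..Fma4), and combine the pieces with div_fmas / div_fixup.
// Denormal flushing is temporarily disabled around the FMA chain when the
// mode does not already preserve denormals, using S_DENORM_MODE or S_SETREG,
// with the previous state saved and restored when it is only known
// dynamically.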
  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                       .addUse(DenominatorScaled.getReg(0))

  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  const bool HasDynamicDenormals =

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      B.buildInstr(AMDGPU::S_GETREG_B32)
          .addDef(SavedSPDenormMode)

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      assert(SavedSPDenormMode);
      B.buildInstr(AMDGPU::S_SETREG_B32)
          .addReg(SavedSPDenormMode)

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(Fmas.getReg(0))

  MI.eraseFromParent();
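// Full-precision f64 division follows the same div_scale / FMA-refinement /
// div_fmas / div_fixup pattern as the f32 path. In the unmerge/compare block
// below the scale condition is presumably being reconstructed by hand for
// subtargets where the div_scale condition output cannot be used directly:
// the high halves of the operands are compared against the scaled values and
// the two compares are xor-ed together.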
  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
                 .addUse(DivScale0.getReg(0))

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(CmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(CmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);

    Scale = DivScale1.getReg(1);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(Mul.getReg(0))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
      .addUse(Fmas.getReg(0))

  MI.eraseFromParent();
  LLT Ty = MRI.getType(Res0);

  auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})

  auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})

    auto Fabs = B.buildFAbs(Ty, Val);

    auto Zero = B.buildConstant(InstrExpTy, 0);
    Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
    Mant = B.buildSelect(Ty, IsFinite, Mant, Val);

  B.buildCopy(Res0, Mant);
  B.buildSExtOrTrunc(Res1, Exp);

  MI.eraseFromParent();
  auto Abs = B.buildFAbs(S32, RHS, Flags);

  auto C0 = B.buildFConstant(S32, 0x1p+96f);
  auto C1 = B.buildFConstant(S32, 0x1p-32f);
  auto C2 = B.buildFConstant(S32, 1.0f);

  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(Mul0.getReg(0))

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  unsigned Flags = MI.getFlags();

  auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
                  .addUse(Ext.getReg(0))

  B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
  MI.eraseFromParent();
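// f32 sqrt lowering: inputs that may be too small are pre-scaled by 2^32 and
// the result is rescaled by 2^-16 at the end. One path refines the hardware
// sqrt result by probing the adjacent representable values (SqrtSNextDown /
// SqrtSNextUp) with FMA residuals; the other derives the result from rsq with
// a Goldschmidt-style half/estimate iteration. Zero and infinity inputs fold
// to the scaled input through the final select.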
  const unsigned Flags = MI.getFlags();

    MI.eraseFromParent();

  auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);

  auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
  auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
  auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);

        .addUse(SqrtX.getReg(0))

    auto NegOne = B.buildConstant(I32, -1);
    auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);

    auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
    auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

    auto PosOne = B.buildConstant(I32, 1);
    auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);

    auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
    auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

    auto Zero = B.buildFConstant(F32, 0.0f);

    SqrtS =
        B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);

    SqrtS =
        B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);

    auto SqrtR =
        B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
    B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);

    auto Half = B.buildFConstant(F32, 0.5f);
    auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
    auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
    auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
    SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
    SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
    auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
    auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
    SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);

  auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);

  auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);

  SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);

  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);

  MI.eraseFromParent();
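// f64 sqrt is built from rsq: very small inputs are scaled up via ldexp with
// an exponent of 256, a fixed sequence of FMA refinement steps
// (SqrtH*/SqrtS*/SqrtD*) converges on the root, and the result is scaled back
// down with an ldexp exponent of -128. Zero and infinity inputs again fold to
// the scaled input through the final select.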
  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");

  unsigned Flags = MI.getFlags();

  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);

  auto ZeroInt = B.buildConstant(S32, 0);

  auto ScaleUpFactor = B.buildConstant(S32, 256);
  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
  auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);

  auto SqrtY =
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));

  auto Half = B.buildFConstant(F64, 0.5);
  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);

  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);

  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);

  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);

  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);

  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);

  auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);

  auto ScaleDownFactor = B.buildConstant(S32, -128);
  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);

  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);

  MI.eraseFromParent();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})

  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags)
                          : B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);

    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
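// Lane-crossing intrinsics (readlane, writelane, permlane*, set.inactive,
// DPP moves/updates, ...) operate on 32-bit values (64-bit for some DPP
// cases), so arbitrary types are legalized by extending small scalars to s32
// or splitting wide values into 32-bit pieces, emitting one lane op per piece
// via createLaneOp, and re-merging/bitcasting into the original destination
// type.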
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;

    auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);

    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
      return LaneOp.getReg(0);
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
      return LaneOp.addUse(Src1).getReg(0);
    case Intrinsic::amdgcn_writelane:
      return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {

      int64_t Src4 = MI.getOperand(6).getImm();
      int64_t Src5 = MI.getOperand(7).getImm();
      return LaneOp.addUse(Src1)

    case Intrinsic::amdgcn_mov_dpp8:
      return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
    case Intrinsic::amdgcn_update_dpp:
      return LaneOp.addUse(Src1)
          .addImm(MI.getOperand(4).getImm())
          .addImm(MI.getOperand(5).getImm())
          .addImm(MI.getOperand(6).getImm())
          .addImm(MI.getOperand(7).getImm())

  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = MI.getOperand(3).getReg();
    if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
      Src2 = MI.getOperand(4).getReg();

  LLT Ty = MRI.getType(DstReg);

  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&

  if (Size == SplitSize) {

    Src0 = B.buildAnyExt(S32, Src0).getReg(0);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)

    if (IID == Intrinsic::amdgcn_writelane)

    Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
    B.buildTrunc(DstReg, LaneOpDst);
    MI.eraseFromParent();

  if (Size % SplitSize != 0)

  bool NeedsBitcast = false;

  if (EltSize == SplitSize) {
    PartialResTy = EltTy;
  } else if (EltSize == 16 || EltSize == 32) {
    unsigned NElem = SplitSize / EltSize;

    NeedsBitcast = true;

  unsigned NumParts = Size / SplitSize;

  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
    Src1Parts = B.buildUnmerge(PartialResTy, Src1);

  if (IID == Intrinsic::amdgcn_writelane)
    Src2Parts = B.buildUnmerge(PartialResTy, Src2);

  for (unsigned i = 0; i < NumParts; ++i) {
    Src0 = Src0Parts.getReg(i);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
      Src1 = Src1Parts.getReg(i);

    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = Src2Parts.getReg(i);

    PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));

    B.buildBitcast(DstReg, B.buildMergeLikeInstr(

    B.buildMergeLikeInstr(DstReg, PartialRes);

  MI.eraseFromParent();
  LLT DstTy = MRI.getType(DstReg);

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);

  B.buildObjectPtrOffset(DstReg, KernargPtrReg,
                         B.buildConstant(IdxTy, Offset).getReg(0));
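// amdgcn.make.buffer.rsrc: build the 128-bit resource descriptor from a
// 64-bit base pointer by masking the top 16 bits of the pointer's high half
// and or-ing in the 16-bit stride shifted into place (as an immediate when
// the stride is a known constant), then merging {low half, patched high half,
// num records, flags} into the result.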
  Register Pointer = MI.getOperand(2).getReg();

  Register NumRecords = MI.getOperand(4).getReg();

  B.setInsertPt(B.getMBB(), ++B.getInsertPt());

  auto Unmerge = B.buildUnmerge(S32, Pointer);
  Register LowHalf = Unmerge.getReg(0);
  Register HighHalf = Unmerge.getReg(1);

  auto AndMask = B.buildConstant(S32, 0x0000ffff);
  auto Masked = B.buildAnd(S32, HighHalf, AndMask);

  std::optional<ValueAndVReg> StrideConst =

  if (!StrideConst || !StrideConst->Value.isZero()) {

      uint32_t StrideVal = StrideConst->Value.getZExtValue();
      uint32_t ShiftedStrideVal = StrideVal << 16;
      ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);

      auto ExtStride = B.buildAnyExt(S32, Stride);
      auto ShiftConst = B.buildConstant(S32, 16);
      ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);

    NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);

  B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
  MI.eraseFromParent();
  MI.eraseFromParent();

  std::optional<uint32_t> KnownSize =

  if (KnownSize.has_value())
    B.buildConstant(DstReg, *KnownSize);

  MI.eraseFromParent();
                                              unsigned AddrSpace) const {

  auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());

        B.buildInstr(AMDGPU::S_MOV_B32, {S32},
                     {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})

    MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);

    Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);

                B.buildConstant(S32, 1u << 26));

  MI.eraseFromParent();
std::pair<Register, unsigned>

      MRI, OrigOffset, nullptr, CheckNUW);

  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;

    if (Overflow != 0) {

        BaseReg = B.buildConstant(S32, Overflow).getReg(0);

        auto OverflowVal = B.buildConstant(S32, Overflow);
        BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);

    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::pair(BaseReg, ImmOffset);
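// D16 store data handling: depending on the subtarget's packed-D16 support,
// 16-bit store payloads are either widened element-wise to s32 or repacked
// into dword-sized pieces (padding odd element counts with undef) so the
// buffer/image store pseudo sees a register-sized value.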
                                               bool ImageStore) const {

  LLT StoreVT = MRI.getType(Reg);

    auto Unmerge = B.buildUnmerge(S16, Reg);

    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

      Reg = B.buildBitcast(S32, Reg).getReg(0);

      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));

      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)

      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));

      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)

      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
                                               bool IsFormat) const {

  LLT Ty = MRI->getType(VData);

    VData = B.buildBitcast(Ty, VData).getReg(0);

                                             bool IsFormat) const {

  LLT Ty = MRI.getType(VData);

  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);

  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;

    VIndex = MI.getOperand(3).getReg();

    VIndex = B.buildConstant(S32, 0).getReg(0);

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

    Format = MI.getOperand(5 + OpOffset).getImm();

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16
                : AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16
                : AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;

      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;

      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;

      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;

  auto MIB = B.buildInstr(Opc)

  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();
                            unsigned ImmOffset, unsigned Format,

  auto MIB = B.buildInstr(Opc)

  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);
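// Raw/struct/typed buffer loads are rewritten to the corresponding
// G_AMDGPU_BUFFER_LOAD* pseudo. The operands are normalized (a zero VIndex is
// materialized for the raw forms), the opcode is picked from the element size
// and the format/TFE flags, and the result is post-processed: TFE loads carry
// an extra status dword that is unmerged from a wider temporary, sub-dword
// results are truncated from an s32 temporary, and unpacked D16 vectors are
// re-truncated and re-merged element by element.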
                                            bool IsTyped) const {

  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
  bool IsTFE = MI.getNumExplicitDefs() == 2;

    StatusDst = MI.getOperand(1).getReg();

  Register RSrc = MI.getOperand(2 + OpOffset).getReg();

  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;

    VIndex = MI.getOperand(3 + OpOffset).getReg();

    VIndex = B.buildConstant(S32, 0).getReg(0);

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

    Format = MI.getOperand(5 + OpOffset).getImm();

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  LLT Ty = MRI.getType(Dst);

    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);

    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);

  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);

    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16
                : AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {

      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;

      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;

      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;

      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;

      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD;

    unsigned NumLoadDWords = NumValueDWords + 1;

    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);

                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);

      Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
      B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
      B.buildTrunc(Dst, ExtDst);
    } else if (NumValueDWords == 1) {
      B.buildUnmerge({Dst, StatusDst}, LoadDstReg);

      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));

      B.buildUnmerge(LoadElts, LoadDstReg);

      B.buildMergeLikeInstr(Dst, LoadElts);

    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);

                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(Dst, LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {

    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);

                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    auto Unmerge = B.buildUnmerge(S32, LoadDstReg);

    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));

    B.buildMergeLikeInstr(Dst, Repack);

                  AuxiliaryData, MMO, IsTyped, HasVIndex, B);

  MI.eraseFromParent();
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
  const bool IsCmpSwap =
      IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;

    CmpVal = MI.getOperand(3).getReg();

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;

    VIndex = MI.getOperand(4 + OpOffset).getReg();

    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

      .addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();
                                        bool IsA16, bool IsG16) {

  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {

    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      if ((I < Intr->GradientStart) && IsA16 &&
          (B.getMRI()->getType(AddrReg) == S16)) {
        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");

            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})

               "Bias needs to be converted to 16 bit in A16 mode");

        AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);

      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||

          !MI.getOperand(ArgOffset + I + 1).isReg()) {

            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})

                V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})

                                      int DimIdx, int NumVAddrs) {

  for (int I = 0; I != NumVAddrs; ++I) {

    if (SrcOp.isReg()) {

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {

    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));

  for (int I = 1; I != NumVAddrs; ++I) {

      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
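// Image intrinsic legalization: rewrite the intrinsic to a
// G_AMDGPU_INTRIN_IMAGE_LOAD/STORE(_D16/_NORET) pseudo, normalize the dmask,
// pack 16-bit addresses/gradients into V2S16 pieces (respecting the
// subtarget's NSA limits, possibly concatenating the trailing addresses into
// a single vector register), and widen or split the result so the rounded,
// 32-bit-aligned return value matches what the hardware instruction produces,
// including the extra TFE status dword.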
  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;

    VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
    Ty = MRI->getType(VData);

  const bool IsAtomicPacked16Bit =
      (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
       BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);

      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());

      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());

  const bool IsA16 = AddrTy == S16;

  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();

    } else if (DMask != 0) {

    } else if (!IsTFE && !BaseOpcode->Store) {
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();

  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = LoadOpcode;
  if (BaseOpcode->Store)
    NewOpcode = StoreOpcode;

    NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;

  MI.setDesc(B.getTII().get(NewOpcode));

  if (IsTFE && DMask == 0) {

    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);

  if (BaseOpcode->Atomic) {

    LLT Ty = MRI->getType(VData0);

    if (Ty.isVector() && !IsAtomicPacked16Bit)

      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  if (IsA16 && !ST.hasA16()) {

  if (IsA16 || IsG16) {

        (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;

    if (UsePartialNSA) {

      auto Concat = B.buildConcatVectors(
          PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
      PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
      PackedRegs.resize(NSAMaxSize);
    } else if (!UseNSA && PackedRegs.size() > 1) {

      auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
      PackedRegs[0] = Concat.getReg(0);

    const unsigned NumPacked = PackedRegs.size();
    for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {

      if (!SrcOp.isReg()) {

      if (I - Intr->VAddrStart < NumPacked)
        SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);

        SrcOp.setReg(AMDGPU::NoRegister);

        (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;

    if (UsePartialNSA) {

                                ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
                                Intr->NumVAddrs - NSAMaxSize + 1);
    } else if (!UseNSA && Intr->NumVAddrs > 1) {

    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);

  if (NumElts < DMaskLanes)

  if (NumElts > 4 || DMaskLanes > 4)

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =

  unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
  unsigned RoundedSize = 32 * RoundedElts;

    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;

  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))

  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)

    MI.removeOperand(1);

      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {

    ResultRegs[0] = NewResultReg;

    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    ResultRegs.resize(NumDataRegs);

      B.buildTrunc(DstReg, ResultRegs[0]);

      B.buildBitcast(DstReg, ResultRegs[0]);

      Reg = B.buildBitcast(V2S16, Reg).getReg(0);

      Reg = B.buildTrunc(S16, Reg).getReg(0);

  auto padWithUndef = [&](LLT Ty, int NumElts) {

    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)

    LLT ResTy = MRI->getType(ResultRegs[0]);

      padWithUndef(ResTy, NumElts - ResultRegs.size());
      B.buildBuildVector(DstReg, ResultRegs);

    if (ResultRegs.size() == 1) {
      NewResultReg = ResultRegs[0];
    } else if (ResultRegs.size() == 2) {

      NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);

    if (MRI->getType(DstReg).getNumElements() <
        MRI->getType(NewResultReg).getNumElements()) {
      B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);

      B.buildPadVectorWithUndefElements(DstReg, NewResultReg);

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  Register OrigDst = MI.getOperand(0).getReg();

  LLT Ty = B.getMRI()->getType(OrigDst);

    Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
                    : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;

    Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));

    Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;

  B.setInsertPt(B.getMBB(), MI);

  B.setInsertPt(B.getMBB(), MI);

  MI.setDesc(B.getTII().get(Opc));
  MI.removeOperand(1);

  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign = B.getDataLayout().getABITypeAlign(

  MI.addMemOperand(MF, MMO);
  if (Dst != OrigDst) {
    MI.getOperand(0).setReg(Dst);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(OrigDst, Dst);

  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
  MI.removeOperand(0);
  MI.eraseFromParent();

  BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))

  BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))

  MI.eraseFromParent();

  Register SGPR01(AMDGPU::SGPR0_SGPR1);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(

    Register LoadAddr = MRI.createGenericVirtualRegister(

    B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,

    Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
    B.buildCopy(SGPR01, Temp);
    B.buildInstr(AMDGPU::S_TRAP)

    MI.eraseFromParent();

  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)

  MI.eraseFromParent();

  MI.eraseFromParent();

  B.buildInstr(AMDGPU::S_TRAP)

  MI.eraseFromParent();

      Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));

  B.buildInstr(AMDGPU::S_TRAP)

  MI.eraseFromParent();
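// amdgcn.image.bvh.intersect.ray: the node pointer, ray extent, origin,
// direction and inverse direction plus the texture descriptor are flattened
// into the vaddr operand list expected by the IMAGE_BVH*_INTERSECT_RAY
// instructions. The a16 variants pack direction/inverse-direction components
// into V2S16 halves; non-NSA encodings merge all operands into one wide
// register first.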
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();

  Register RayInvDir = MI.getOperand(6).getReg();

        Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));

  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;

  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};

        IsGFX12Plus ? AMDGPU::MIMGEncGfx12
        : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                    : AMDGPU::MIMGEncGfx10NSA,
        NumVDataDwords, NumVAddrDwords);

        IsGFX11 ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
        NumVDataDwords, NumVAddrDwords);

  if (UseNSA && IsGFX11Plus) {

      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      auto Merged = B.buildMergeLikeInstr(
          V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});

    packLanes(RayOrigin);

      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      auto MergedDir = B.buildMergeLikeInstr(

              S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
                                                 UnmergeRayDir.getReg(0)}))

              S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
                                                 UnmergeRayDir.getReg(1)}))

              S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
                                                 UnmergeRayDir.getReg(2)}))

      packLanes(RayInvDir);

      auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);

      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);

    packLanes(RayOrigin);

      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);

      B.buildMergeLikeInstr(R1,
                            {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
      B.buildMergeLikeInstr(
          R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
      B.buildMergeLikeInstr(
          R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});

      packLanes(RayInvDir);

    Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)

                 .addImm(IsA16 ? 1 : 0)

  MI.eraseFromParent();
  Register DstOrigin = MI.getOperand(1).getReg();

  Register NodePtr = MI.getOperand(4).getReg();
  Register RayExtent = MI.getOperand(5).getReg();
  Register InstanceMask = MI.getOperand(6).getReg();
  Register RayOrigin = MI.getOperand(7).getReg();

  Register Offsets = MI.getOperand(9).getReg();
  Register TDescr = MI.getOperand(10).getReg();

        Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));

  bool IsBVH8 = cast<GIntrinsic>(MI).getIntrinsicID() ==
                Intrinsic::amdgcn_image_bvh8_intersect_ray;
  const unsigned NumVDataDwords = 10;
  const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;

      IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
             : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
      AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);

  auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
      V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});

  B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
                      : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)

      .addUse(RayExtentInstanceMaskVec.getReg(0))

  MI.eraseFromParent();
  B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
  MI.eraseFromParent();

  auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
  auto LSB = B.buildConstant(S32, 25);
  auto Width = B.buildConstant(S32, 5);
  B.buildUbfx(DstReg, TTMP8, LSB, Width);
  MI.eraseFromParent();

  if (MRI.getType(Src) != S64)

      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},

      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},

  B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
  MI.eraseFromParent();

  if (MRI.getType(Src) != S64)

  auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));

      .addReg(Unmerge.getReg(0));

      .addReg(Unmerge.getReg(1));
  MI.eraseFromParent();
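// Main intrinsic dispatch: structural intrinsics (amdgcn.if/else/loop) are
// turned into SI_IF/SI_ELSE/SI_LOOP pseudos wired to the branch targets of
// the G_BRCOND they feed; argument and ID queries are lowered to preloaded
// argument registers or kernarg loads; buffer, image and BVH intrinsics are
// forwarded to the dedicated legalizers above; and the SWMMAC variants only
// normalize their index operand to the expected scalar width.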
  auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();

  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {

    bool Negated = false;

      std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)

            .addMBB(UncondBrTarget);

        B.buildInstr(AMDGPU::SI_ELSE)

            .addMBB(UncondBrTarget);

        B.buildBr(*CondBrTarget);

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();

  case Intrinsic::amdgcn_loop: {

    bool Negated = false;

      std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)

          .addMBB(UncondBrTarget);

        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());

  case Intrinsic::amdgcn_addrspacecast_nonnull:

  case Intrinsic::amdgcn_make_buffer_rsrc:

  case Intrinsic::amdgcn_kernarg_segment_ptr:

      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();

  case Intrinsic::amdgcn_implicitarg_ptr:

  case Intrinsic::amdgcn_workitem_id_x:

  case Intrinsic::amdgcn_workitem_id_y:

  case Intrinsic::amdgcn_workitem_id_z:

  case Intrinsic::amdgcn_workgroup_id_x:

  case Intrinsic::amdgcn_workgroup_id_y:

  case Intrinsic::amdgcn_workgroup_id_z:

  case Intrinsic::amdgcn_wave_id:

  case Intrinsic::amdgcn_lds_kernel_id:

  case Intrinsic::amdgcn_dispatch_ptr:

  case Intrinsic::amdgcn_queue_ptr:

  case Intrinsic::amdgcn_implicit_buffer_ptr:

  case Intrinsic::amdgcn_dispatch_id:

  case Intrinsic::r600_read_ngroups_x:

  case Intrinsic::r600_read_ngroups_y:

  case Intrinsic::r600_read_ngroups_z:

  case Intrinsic::r600_read_local_size_x:

  case Intrinsic::r600_read_local_size_y:

  case Intrinsic::r600_read_local_size_z:

  case Intrinsic::amdgcn_fdiv_fast:

  case Intrinsic::amdgcn_is_shared:

  case Intrinsic::amdgcn_is_private:

  case Intrinsic::amdgcn_wavefrontsize: {

    MI.eraseFromParent();

  case Intrinsic::amdgcn_s_buffer_load:

  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:

  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format:

  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store:

  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:

  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:

  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:

  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:

  case Intrinsic::amdgcn_rsq_clamp:

  case Intrinsic::amdgcn_image_bvh_intersect_ray:

  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray:

  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {

    if (MRI.getType(Index) != S64)
      MI.getOperand(5).setReg(B.buildAnyExt(S64, Index).getReg(0));

  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {

    if (MRI.getType(Index) != S32)
      MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));

  case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {

    LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8

    if (MRI.getType(Index) != IdxTy)
      MI.getOperand(7).setReg(B.buildAnyExt(IdxTy, Index).getReg(0));

  case Intrinsic::amdgcn_fmed3: {

    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);

  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_update_dpp:

  case Intrinsic::amdgcn_s_buffer_prefetch_data:

  case Intrinsic::amdgcn_dead: {

    MI.eraseFromParent();
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
constexpr std::initializer_list< LLT > AllVectors
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
static constexpr unsigned FPEnvModeBitField
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx)
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterVectorElementType(LLT EltTy)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllScalarTypes
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllS32Vectors
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty)
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllS64Vectors
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
static constexpr unsigned FPEnvTrapBitField
static constexpr unsigned MaxRegisterSize
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
static bool isRegisterVectorType(LLT Ty)
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the MachineLegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
Analysis containing CSE Info
Returns the sub type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
Interface for Targets to specify which operations they can successfully select and how the others sho...
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
#define FP_DENORM_FLUSH_NONE
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isEntryFunction() const
bool isModuleEntryFunction() const
bool hasMadMacF32Insts() const
bool hasCvtPkF16F32Inst() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasVOP3PInsts() const
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
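The three APFloat factories above pair with a fltSemantics object. A minimal sketch, assuming an LLVM development environment; the chosen semantics and values are purely illustrative:

#include "llvm/ADT/APFloat.h"
#include <cassert>

using namespace llvm;

int main() {
  const fltSemantics &Single = APFloat::IEEEsingle();

  APFloat Inf = APFloat::getInf(Single);                   // +infinity
  APFloat NegLargest = APFloat::getLargest(Single, true);  // most negative finite value
  APFloat SmallestNorm = APFloat::getSmallestNormalized(Single);

  assert(Inf.isInfinity());
  assert(!NegLargest.isInfinity() && NegLargest.isNegative());
  // The smallest normalized single-precision magnitude is 2^-126.
  assert(SmallestNorm.convertToFloat() == 0x1p-126f);
  return 0;
}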
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
@ ICMP_SLT
signed less than
@ FCMP_OLT
0 1 0 0 True if ordered and less than
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
@ ICMP_UGE
unsigned greater or equal
@ ICMP_SGT
signed greater than
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
@ ICMP_ULT
unsigned less than
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
This is the shared class of boolean and integer constants.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
This class represents an Operation in the Expression.
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasArchitectedSGPRs() const
bool hasPrivEnabledTrap2NopBug() const
const SIInstrInfo * getInstrInfo() const override
bool hasScalarSubwordLoads() const
bool supportsGetDoorbellID() const
bool hasBVHDualAndBVH8Insts() const
bool hasGloballyAddressableScratch() const
bool has64BitLiterals() const
TrapHandlerAbi getTrapHandlerAbi() const
bool hasGFX10_AEncoding() const
const SITargetLowering * getTargetLowering() const override
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
unsigned getNSAThreshold(const MachineFunction &MF) const
bool hasScalarSMulU64() const
bool hasNSAEncoding() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasScalarDwordx3Loads() const
bool hasVectorMulU64() const
Generation getGeneration() const
bool hasScalarAddSub64() const
bool hasUnpackedD16VMem() const
bool hasDPALU_DPP() const
bool hasAddNoCarry() const
bool hasPartialNSAEncoding() const
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
static constexpr LLT float64()
Get a 64-bit IEEE double value.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
constexpr bool isPointerVector() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
constexpr ElementCount getElementCount() const
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
static constexpr LLT float16()
Get a 16-bit IEEE half value.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr bool isPointerOrPointerVector() const
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr LLT getScalarType() const
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
static constexpr LLT float32()
Get a 32-bit IEEE float value.
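The LLT helpers above are constexpr value types, so they are easy to exercise in isolation. A minimal sketch, assuming an LLVM development environment (the header path below is the current one; it has moved between releases):

#include "llvm/CodeGenTypes/LowLevelType.h" // older releases: llvm/CodeGen/LowLevelType.h
#include <cassert>

using namespace llvm;

int main() {
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT P1 = LLT::pointer(1, 64); // a 64-bit pointer in address space 1

  assert(S32.isScalar() && V4S32.isVector() && P1.isPointer());
  assert(V4S32.getNumElements() == 4 && V4S32.getElementType() == S32);
  assert(V4S32.getSizeInBits().getFixedValue() == 128);

  // The change* helpers rewrite one property and keep the rest of the shape.
  assert(V4S32.changeElementType(S16) == LLT::fixed_vector(4, 16));
  assert(V4S32.changeElementCount(ElementCount::getFixed(2)) ==
         LLT::fixed_vector(2, 32));
  assert(S16.getScalarType() == S16); // a scalar is its own scalar type
  return 0;
}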
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
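The LegalizeRuleSet methods above are meant to be chained from getActionDefinitionsBuilder, normally inside a target's LegalizerInfo constructor. A hypothetical sketch, assuming an LLVM development environment; the opcodes and rules are illustrative only, not the actual AMDGPU configuration:

#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"

using namespace llvm;

// In a real target this code would run in the LegalizerInfo subclass constructor.
static void buildToyAddRules(LegalizerInfo &LI) {
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  LI.getActionDefinitionsBuilder(TargetOpcode::G_ADD)
      .legalFor({S32, S64, V2S16})     // these types select directly
      .clampMaxNumElements(0, S16, 2)  // no 16-bit vectors wider than <2 x s16>
      .clampScalar(0, S32, S64)        // keep scalars between s32 and s64
      .widenScalarToNextPow2(0);       // and round them up to a power of two

  LI.getActionDefinitionsBuilder({TargetOpcode::G_FMINNUM, TargetOpcode::G_FMAXNUM})
      .legalFor({S32})                 // only f32 selects directly
      .minScalar(0, S32)               // promote narrower scalars first
      .lower();                        // everything else is expanded
}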
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
MutableArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
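ArrayRef and MutableArrayRef are non-owning views, so the slicing helpers above never copy the underlying storage. A small sketch, assuming an LLVM development environment; the data is arbitrary:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

using namespace llvm;

int main() {
  SmallVector<int, 8> Storage = {1, 2, 3, 4, 5};

  ArrayRef<int> View(Storage);       // read-only view
  assert(View.size() == 5);

  MutableArrayRef<int> Mut(Storage); // writable view over the same storage
  Mut.take_front(2)[0] = 10;         // view of {1, 2}; writes through to Storage
  Mut.drop_front(3)[0] = 40;         // view of {4, 5}; writes through to Storage

  assert(Storage[0] == 10 && Storage[3] == 40);
  return 0;
}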
Wrapper class representing virtual and physical registers.
constexpr bool isValid() const
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
The instances of the Type class are immutable: once they are created, they are never changed.
A Use represents the edge between a Value definition and its users.
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
constexpr ScalarTy getFixedValue() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
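In GlobalISel these address spaces appear as the address-space field of pointer LLTs, with the pointer widths noted above. A minimal sketch, assuming a recent LLVM tree where the AMDGPUAS enum is exposed via llvm/Support/AMDGPUAddrSpace.h (the header location has moved between releases):

#include "llvm/CodeGenTypes/LowLevelType.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
#include <cassert>

using namespace llvm;

int main() {
  // Flat, global and constant pointers are 64 bits wide.
  const LLT FlatPtr = LLT::pointer(AMDGPUAS::FLAT_ADDRESS, 64);
  // LDS (local) and scratch (private) pointers are 32 bits wide.
  const LLT LocalPtr = LLT::pointer(AMDGPUAS::LOCAL_ADDRESS, 32);
  // Buffer fat pointers are 160 bits; buffer resources are 128 bits.
  const LLT FatPtr = LLT::pointer(AMDGPUAS::BUFFER_FAT_POINTER, 160);
  const LLT RsrcPtr = LLT::pointer(AMDGPUAS::BUFFER_RESOURCE, 128);

  assert(FlatPtr.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
  assert(LocalPtr.getSizeInBits().getFixedValue() == 32);
  assert(FatPtr.getSizeInBits().getFixedValue() == 160);
  assert(RsrcPtr.getSizeInBits().getFixedValue() == 128);
  return 0;
}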
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX1250(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LLVM_ABI LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
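The LegalityPredicates and LegalizeMutations entries above compose into conditional rules: a predicate decides whether a rule fires for a given LegalityQuery, and a mutation picks the type to move to. A hypothetical sketch, assuming an LLVM development environment; the opcode and rules are illustrative only:

#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"

using namespace llvm;
using namespace LegalityPredicates;
using namespace LegalizeMutations;

static void buildToyXorRules(LegalizerInfo &LI) {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  LI.getActionDefinitionsBuilder(TargetOpcode::G_XOR)
      .legalFor({S32, S64})
      // Scalars narrower than 32 bits are widened straight to s32.
      .widenScalarIf(scalarNarrowerThan(0, 32), changeTo(0, S32))
      // Vectors with sub-32-bit elements are broken up into scalars.
      .fewerElementsIf(all(isVector(0), scalarOrEltNarrowerThan(0, 32)),
                       scalarize(0));
}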
@ Implicit
Not emitted register (e.g. carry, or temporary result).
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
int popcount(T Value) noexcept
Count the number of set bits in a value.
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT, returns its...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
unsigned Log2(Align A)
Returns the log2 of the alignment.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
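These rounding and bit-manipulation helpers do most of the size math in legalization code (rounding memory sizes up to powers of two, splitting types into 32- or 64-bit pieces, converting alignments to log form). A small sketch of what they compute, assuming an LLVM development environment:

#include "llvm/ADT/bit.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

using namespace llvm;

int main() {
  assert(isPowerOf2_32(32) && !isPowerOf2_64(96));
  assert(PowerOf2Ceil(96) == 128);   // >= the input
  assert(NextPowerOf2(64) == 128);   // strictly greater than the input
  assert(divideCeil(96, 64) == 2);   // number of 64-bit pieces needed for 96 bits
  assert(Log2_32_Ceil(96) == 7);
  assert(bit_width(96u) == 7);       // 96 needs 7 bits
  assert(bit_floor(96u) == 64);      // largest power of two <= 96
  assert(popcount(0xF0u) == 4);

  assert(Log2(Align(16)) == 4);
  assert(commonAlignment(Align(16), 8) == Align(8)); // alignment after an offset of 8
  return 0;
}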
std::function< bool(const LegalityQuery &)> LegalityPredicate
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
static constexpr uint64_t encode(Fields... Values)
MIMGBaseOpcode BaseOpcode
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
static LLVM_ABI const fltSemantics & IEEEdouble() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering, quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.