#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
unsigned Bits = Ty.getSizeInBits();

const LLT Ty = Query.Types[TypeIdx];
return Ty.getNumElements() % 2 != 0 &&
       EltSize > 1 && EltSize < 32 &&
       Ty.getSizeInBits() % 32 != 0;

const LLT Ty = Query.Types[TypeIdx];

const LLT Ty = Query.Types[TypeIdx];
return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;

const LLT Ty = Query.Types[TypeIdx];
return std::pair(TypeIdx,

const LLT Ty = Query.Types[TypeIdx];
unsigned Size = Ty.getSizeInBits();
unsigned Pieces = (Size + 63) / 64;
unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;

const LLT Ty = Query.Types[TypeIdx];
const int Size = Ty.getSizeInBits();
const int NextMul32 = (Size + 31) / 32;
const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;

unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
return std::make_pair(TypeIdx, LLT::scalar(MemSize));

const LLT Ty = Query.Types[TypeIdx];
const unsigned EltSize = Ty.getElementType().getSizeInBits();
assert(EltSize == 32 || EltSize == 64);
for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
return std::pair(TypeIdx,

const unsigned NumElems = Ty.getElementCount().getFixedValue();
const unsigned Size = Ty.getSizeInBits();

const LLT Ty = Query.Types[TypeIdx];

const LLT Ty = Query.Types[TypeIdx];
unsigned Size = Ty.getSizeInBits();

const LLT QueryTy = Query.Types[TypeIdx];
const LLT QueryTy = Query.Types[TypeIdx];
const LLT QueryTy = Query.Types[TypeIdx];
return ((ST.useRealTrue16Insts() && Size == 16) ||
        Size % 32 == 0) &&
return EltSize == 16 || EltSize % 32 == 0;

const int EltSize = Ty.getElementType().getSizeInBits();
return EltSize == 32 || EltSize == 64 ||
       (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
       EltSize == 128 || EltSize == 256;

LLT Ty = Query.Types[TypeIdx];
const LLT QueryTy = Query.Types[TypeIdx];
if (Ty.isPointerOrPointerVector())
  Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));

(ST.useRealTrue16Insts() && Ty == S16) ||

const LLT Ty = Query.Types[TypeIdx];
return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
       Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();

unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

bool IsLoad, bool IsAtomic) {
return ST.enableFlatScratch() ? 128 : 32;
return ST.useDS128() ? 128 : 64;
return IsLoad ? 512 : 128;
return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;

const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
unsigned RegSize = Ty.getSizeInBits();
unsigned AS = Query.Types[1].getAddressSpace();

if (Ty.isVector() && MemSize != RegSize)

if (IsLoad && MemSize < Size)
  MemSize = std::max(MemSize, Align);

if (!ST.hasDwordx3LoadStores())

if (AlignBits < MemSize) {
  Align(AlignBits / 8)))

const unsigned Size = Ty.getSizeInBits();
if (Ty.isPointerVector())

unsigned EltSize = Ty.getScalarSizeInBits();
return EltSize != 32 && EltSize != 64;

const unsigned Size = Ty.getSizeInBits();
if (Size != MemSizeInBits)
  return Size <= 32 && Ty.isVector();

return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&

uint64_t AlignInBits, unsigned AddrSpace,

if (SizeInBits == 96 && ST.hasDwordx3LoadStores())

if (AlignInBits < RoundedSize)

RoundedSize, AddrSpace, Align(AlignInBits / 8),

Query.Types[1].getAddressSpace(), Opcode);
const unsigned NumParts = PointerTy.getSizeInBits() / 32;

Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
std::array<Register, 4> VectorElems;
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
for (unsigned I = 0; I < NumParts; ++I)
  B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
B.buildMergeValues(MO, VectorElems);

Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
B.buildIntToPtr(MO, Scalar);

const unsigned NumParts = PointerTy.getSizeInBits() / 32;
auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
for (unsigned I = 0; I < NumParts; ++I)

return B.buildBuildVector(VectorTy, PointerParts).getReg(0);

Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
return B.buildBitcast(VectorTy, Scalar).getReg(0);

auto GetAddrSpacePtr = [&TM](unsigned AS) {

const LLT BufferStridedPtr =
const LLT CodePtr = FlatPtr;

const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr

const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr

const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

const std::initializer_list<LLT> FPTypesBase = {
const std::initializer_list<LLT> FPTypes16 = {
const std::initializer_list<LLT> FPTypesPK16 = {

const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
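// The fragments below come from AMDGPULegalizerInfo's constructor: each
// getActionDefinitionsBuilder(...) chain picks legal/widen/clamp/scalarize
// actions per opcode, keyed on subtarget features such as has16BitInsts()
// and hasVOP3PInsts().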
if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
  if (ST.hasScalarAddSub64()) {
    .clampMaxNumElementsStrict(0, S16, 2)
    .clampMaxNumElementsStrict(0, S16, 2)
  if (ST.hasScalarSMulU64()) {
    .clampMaxNumElementsStrict(0, S16, 2)
    .clampMaxNumElementsStrict(0, S16, 2)
    .minScalarOrElt(0, S16)
} else if (ST.has16BitInsts()) {

.widenScalarToNextMultipleOf(0, 32)

if (ST.hasMad64_32())

if (ST.hasIntClamp()) {

{G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})

if (ST.hasVOP3PInsts()) {
  .clampMaxNumElements(0, S8, 2)

{G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})

LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
.clampScalar(0, S16, S64);

{G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})

if (ST.has16BitInsts()) {
  if (ST.hasVOP3PInsts())
  FPOpActions.legalFor({S16});
  TrigActions.customFor({S16});
  FDIVActions.customFor({S16});

if (ST.hasPackedFP32Ops()) {
  FPOpActions.legalFor({V2S32});
  FPOpActions.clampMaxNumElementsStrict(0, S32, 2);

{G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE,

if (ST.hasVOP3PInsts()) {
  MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
} else if (ST.has16BitInsts()) {
  MinNumMaxNum.customFor(FPTypes16)
  MinNumMaxNum.customFor(FPTypesBase)

if (ST.hasVOP3PInsts())

.legalFor(FPTypesPK16)

if (ST.has16BitInsts()) {

if (ST.hasFractBug()) {

if (ST.hasCvtPkF16F32Inst()) {
  .clampMaxNumElements(0, S16, 2);
FPTruncActions.scalarize(0).lower();
if (ST.has16BitInsts()) {

if (ST.hasMadF16() && ST.hasMadMacF32Insts())
  FMad.customFor({S32, S16});
else if (ST.hasMadMacF32Insts())
  FMad.customFor({S32});
else if (ST.hasMadF16())
  FMad.customFor({S16});

if (ST.has16BitInsts()) {
  FRem.minScalar(0, S32)

.clampMaxNumElements(0, S16, 2)

if (ST.has16BitInsts())
if (ST.has16BitInsts())

.clampScalar(0, S16, S64)

.clampScalar(0, S16, S64)

if (ST.has16BitInsts()) {
  {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
      .clampScalar(0, S16, S64)
  {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
      .clampScalar(0, S32, S64)
  {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
      .clampScalar(0, S32, S64)

.scalarSameSizeAs(1, 0)

{S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
.legalForCartesianProduct(
    {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
if (ST.has16BitInsts()) {
  CmpBuilder.legalFor({{S1, S16}});

{S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);

if (ST.hasSALUFloatInsts())

if (ST.has16BitInsts())
  ExpOps.customFor({{S32}, {S16}});
ExpOps.customFor({S32});
ExpOps.clampScalar(0, MinScalarFPTy, S32)

if (ST.has16BitInsts())

.clampScalar(0, S32, S32)

if (ST.has16BitInsts())

.widenScalarToNextPow2(1)

.lowerFor({S1, S16})
.widenScalarToNextPow2(1)

.clampScalar(0, S32, S32)

.clampScalar(0, S32, S64)

if (ST.has16BitInsts()) {
  .clampMaxNumElementsStrict(0, S16, 2)

if (ST.hasVOP3PInsts()) {
  .clampMaxNumElements(0, S16, 2)

if (ST.hasIntMinMax64()) {
  .clampMaxNumElements(0, S16, 2)
  .clampMaxNumElements(0, S16, 2)

.widenScalarToNextPow2(0)

.legalForCartesianProduct(AddrSpaces32, {S32})
.legalForCartesianProduct(AddrSpaces32, {S32})

const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                  bool IsLoad) -> bool {
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  unsigned NumRegs = (MemSize + 31) / 32;
  if (!ST.hasDwordx3LoadStores())

unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
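// Note: when unaligned buffer access is enabled the required alignment is
// relaxed to 0 bits; otherwise global/constant accesses must be naturally
// aligned to the listed width (32/16/8 bits).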
for (unsigned Op : {G_LOAD, G_STORE}) {
  const bool IsStore = Op == G_STORE;

  Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                    {S64, GlobalPtr, S64, GlobalAlign32},
                                    {S32, GlobalPtr, S8, GlobalAlign8},
                                    {S32, GlobalPtr, S16, GlobalAlign16},
                                    {S32, LocalPtr, S32, 32},
                                    {S64, LocalPtr, S64, 32},
                                    {S32, LocalPtr, S8, 8},
                                    {S32, LocalPtr, S16, 16},
                                    {S32, PrivatePtr, S32, 32},
                                    {S32, PrivatePtr, S8, 8},
                                    {S32, PrivatePtr, S16, 16},
                                    {S32, ConstantPtr, S32, GlobalAlign32},
                                    {S64, ConstantPtr, S64, GlobalAlign32},
                                    {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
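// Each row of the memory-descriptor table reads roughly as
// {result type, pointer type, memory type, minimum alignment in bits}.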
  Actions.unsupportedIf(
      typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));

  Actions.customIf(typeIs(1, Constant32Ptr));

  return !Query.Types[0].isVector() &&
         needToSplitMemOp(Query, Op == G_LOAD);
  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    if (DstSize > MemSize)
    if (MemSize > MaxSize)

  return Query.Types[0].isVector() &&
         needToSplitMemOp(Query, Op == G_LOAD);
  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    if (MemSize > MaxSize) {
      if (MaxSize % EltSize == 0) {

      unsigned NumPieces = MemSize / MaxSize;
      if (NumPieces == 1 || NumPieces >= NumElts ||
          NumElts % NumPieces != 0)
        return std::pair(0, EltTy);

    return std::pair(0, EltTy);
    return std::pair(0, EltTy);

  .widenScalarToNextPow2(0)

  .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                             {S32, GlobalPtr, S16, 2 * 8},
                             {S32, LocalPtr, S8, 8},
                             {S32, LocalPtr, S16, 16},
                             {S32, PrivatePtr, S8, 8},
                             {S32, PrivatePtr, S16, 16},
                             {S32, ConstantPtr, S8, 8},
                             {S32, ConstantPtr, S16, 2 * 8}})
if (ST.hasFlatAddressSpace()) {
  ExtLoads.legalForTypesWithMemDesc(
      {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});

{G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
if (ST.hasFlatAddressSpace()) {
  Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});

if (ST.hasLDSFPAtomicAddF32()) {
  Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
  if (ST.hasLdsAtomicAddF64())
    Atomic.legalFor({{S64, LocalPtr}});
  if (ST.hasAtomicDsPkAdd16Insts())
    Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});

if (ST.hasAtomicFaddInsts())
  Atomic.legalFor({{S32, GlobalPtr}});
if (ST.hasFlatAtomicFaddF32Inst())
  Atomic.legalFor({{S32, FlatPtr}});

if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {

if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
    ST.hasAtomicBufferGlobalPkAddF16Insts())
  Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
if (ST.hasAtomicGlobalPkAddBF16Inst())
  Atomic.legalFor({{V2BF16, GlobalPtr}});
if (ST.hasAtomicFlatPkAdd16Insts())
  Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
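// FP atomic legality is strictly feature-gated: packed f16/bf16 and f64
// atomic adds are only declared legal when the corresponding subtarget
// predicates report support.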
auto &AtomicFMinFMax =
        .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});

if (ST.hasAtomicFMinFMaxF32GlobalInsts())
if (ST.hasAtomicFMinFMaxF64GlobalInsts())
  AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
if (ST.hasAtomicFMinFMaxF32FlatInsts())
if (ST.hasAtomicFMinFMaxF64FlatInsts())

{S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});

LocalPtr, FlatPtr, PrivatePtr,

.clampScalar(0, S16, S64)

if (ST.has16BitInsts()) {
  if (ST.hasVOP3PInsts()) {
    .clampMaxNumElements(0, S16, 2);
    Shifts.legalFor({{S16, S16}});

  Shifts.widenScalarIf(
      const LLT AmountTy = Query.Types[1];
      return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
  Shifts.clampScalar(1, S32, S32);
  Shifts.widenScalarToNextPow2(0, 16);
  Shifts.clampScalar(0, S16, S64);

  Shifts.clampScalar(1, S32, S32);
  Shifts.widenScalarToNextPow2(0, 32);
  Shifts.clampScalar(0, S32, S64);
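// Shift amounts are always clamped to 32 bits; the shifted value is kept at
// 16 bits only when real 16-bit shift instructions exist, otherwise it is
// widened to at least 32 bits.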
for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
  unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
  unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
  unsigned IdxTypeIdx = 2;

  const LLT EltTy = Query.Types[EltTypeIdx];
  const LLT VecTy = Query.Types[VecTypeIdx];
  const LLT IdxTy = Query.Types[IdxTypeIdx];
  const bool isLegalVecType =
  return (EltSize == 32 || EltSize == 64) &&

  const LLT EltTy = Query.Types[EltTypeIdx];
  const LLT VecTy = Query.Types[VecTypeIdx];
  const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
  return std::pair(VecTypeIdx,

  .clampScalar(EltTypeIdx, S32, S64)

  const LLT &EltTy = Query.Types[1].getElementType();
  return Query.Types[0] != EltTy;

for (unsigned Op : {G_EXTRACT, G_INSERT}) {
  unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
  unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

  const LLT BigTy = Query.Types[BigTyIdx];
  const LLT BigTy = Query.Types[BigTyIdx];
  const LLT LitTy = Query.Types[LitTyIdx];
  const LLT BigTy = Query.Types[BigTyIdx];
  const LLT LitTy = Query.Types[LitTyIdx];
if (ST.hasScalarPackInsts()) {
  .minScalarOrElt(0, S16)

BuildVector.customFor({V2S16, S16});
BuildVector.minScalarOrElt(0, S32);

for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
  unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
  unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

  auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {

  const LLT BigTy = Query.Types[BigTyIdx];

  return notValidElt(Query, LitTyIdx);
  return notValidElt(Query, BigTyIdx);

  if (Op == G_MERGE_VALUES) {
    Builder.widenScalarIf(
        const LLT Ty = Query.Types[LitTyIdx];
        return Ty.getSizeInBits() < 32;

  const LLT Ty = Query.Types[BigTyIdx];
  return Ty.getSizeInBits() % 16 != 0;

  const LLT &Ty = Query.Types[BigTyIdx];
  unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
  if (NewSizeInBits >= 256) {
    unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
    if (RoundedTo < NewSizeInBits)
      NewSizeInBits = RoundedTo;
  return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
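// For G_MERGE_VALUES/G_UNMERGE_VALUES the wide type is widened to the next
// power of two, except that once it would reach 256 bits or more it is only
// rounded up to a multiple of 64 so the pieces can be re-split cleanly.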
.clampScalar(0, S32, S64);

if (ST.hasVOP3PInsts()) {
  SextInReg.lowerFor({{V2S16}})
      .clampMaxNumElementsStrict(0, S16, 2);
} else if (ST.has16BitInsts()) {
  SextInReg.lowerFor({{S32}, {S64}, {S16}});
  SextInReg.lowerFor({{S32}, {S64}});

FSHRActionDefs.legalFor({{S32, S32}})
    .clampMaxNumElementsStrict(0, S16, 2);
if (ST.hasVOP3PInsts())
FSHRActionDefs.scalarize(0).lower();

if (ST.hasVOP3PInsts()) {
  .clampMaxNumElementsStrict(0, S16, 2)

.clampScalar(1, S32, S32)

G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
G_READ_REGISTER, G_WRITE_REGISTER,

if (ST.hasIEEEMinimumMaximumInsts()) {
  .legalFor(FPTypesPK16)

G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})

{G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
verify(*ST.getInstrInfo());

switch (MI.getOpcode()) {
case TargetOpcode::G_ADDRSPACE_CAST:
case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
case TargetOpcode::G_FCEIL:
case TargetOpcode::G_FREM:
case TargetOpcode::G_INTRINSIC_TRUNC:
case TargetOpcode::G_SITOFP:
case TargetOpcode::G_UITOFP:
case TargetOpcode::G_FPTOSI:
case TargetOpcode::G_FPTOUI:
case TargetOpcode::G_FMINNUM:
case TargetOpcode::G_FMAXNUM:
case TargetOpcode::G_FMINIMUMNUM:
case TargetOpcode::G_FMAXIMUMNUM:
case TargetOpcode::G_FMINNUM_IEEE:
case TargetOpcode::G_FMAXNUM_IEEE:
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
case TargetOpcode::G_INSERT_VECTOR_ELT:
case TargetOpcode::G_FSIN:
case TargetOpcode::G_FCOS:
case TargetOpcode::G_GLOBAL_VALUE:
case TargetOpcode::G_LOAD:
case TargetOpcode::G_SEXTLOAD:
case TargetOpcode::G_ZEXTLOAD:
case TargetOpcode::G_STORE:
case TargetOpcode::G_FMAD:
case TargetOpcode::G_FDIV:
case TargetOpcode::G_FFREXP:
case TargetOpcode::G_FSQRT:
case TargetOpcode::G_UDIV:
case TargetOpcode::G_UREM:
case TargetOpcode::G_UDIVREM:
case TargetOpcode::G_SDIV:
case TargetOpcode::G_SREM:
case TargetOpcode::G_SDIVREM:
case TargetOpcode::G_ATOMIC_CMPXCHG:
case TargetOpcode::G_FLOG2:
case TargetOpcode::G_FLOG:
case TargetOpcode::G_FLOG10:
case TargetOpcode::G_FEXP2:
case TargetOpcode::G_FEXP:
case TargetOpcode::G_FEXP10:
case TargetOpcode::G_FPOW:
case TargetOpcode::G_FFLOOR:
case TargetOpcode::G_BUILD_VECTOR:
case TargetOpcode::G_BUILD_VECTOR_TRUNC:
case TargetOpcode::G_MUL:
case TargetOpcode::G_CTLZ:
case TargetOpcode::G_CTTZ:
case TargetOpcode::G_CTLZ_ZERO_UNDEF:
case TargetOpcode::G_STACKSAVE:
case TargetOpcode::G_GET_FPENV:
case TargetOpcode::G_SET_FPENV:
case TargetOpcode::G_TRAP:
case TargetOpcode::G_DEBUGTRAP:
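// This switch appears to be legalizeCustom() dispatching on the opcode to
// the dedicated lowering helpers that follow (address-space casts, FP
// rounding, int<->FP conversions, loads/stores, division, transcendentals,
// and so on).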
if (ST.hasApertureRegs()) {
      ? AMDGPU::SRC_SHARED_BASE
      : AMDGPU::SRC_PRIVATE_BASE;
  assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
          !ST.hasGloballyAddressableScratch()) &&
         "Cannot use src_private_base with globally addressable scratch!");
  MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
  B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
  return B.buildUnmerge(S32, Dst).getReg(1);

Register LoadAddr = MRI.createGenericVirtualRegister(

ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

Register KernargPtrReg = MRI.createGenericVirtualRegister(

B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

Register QueuePtr = MRI.createGenericVirtualRegister(

B.buildObjectPtrOffset(
    B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

switch (Def->getOpcode()) {
case AMDGPU::G_FRAME_INDEX:
case AMDGPU::G_GLOBAL_VALUE:
case AMDGPU::G_BLOCK_ADDR:
case AMDGPU::G_CONSTANT: {
  const ConstantInt *CI = Def->getOperand(1).getCImm();
  return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
       Intrinsic::amdgcn_addrspacecast_nonnull));

    : MI.getOperand(1).getReg();
LLT DstTy = MRI.getType(Dst);
LLT SrcTy = MRI.getType(Src);
unsigned SrcAS = SrcTy.getAddressSpace();

if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
  MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));

auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
      ST.hasGloballyAddressableScratch()) {
    Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
        B.buildInstr(AMDGPU::S_MOV_B32, {S32},
                     {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
    MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
    return B.buildIntToPtr(Dst, Sub).getReg(0);

  return B.buildExtract(Dst, Src, 0).getReg(0);

castFlatToLocalOrPrivate(Dst);
MI.eraseFromParent();

unsigned NullVal = TM.getNullPointerValue(DestAS);
auto SegmentNull = B.buildConstant(DstTy, NullVal);
auto FlatNull = B.buildConstant(SrcTy, 0);

auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
MI.eraseFromParent();
auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

      ST.hasGloballyAddressableScratch()) {
    ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
    if (ST.isWave64()) {
      ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
        B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
    Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
        B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
          B.buildInstr(AMDGPU::S_MOV_B64, {S64},
                       {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
      MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
      return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);

  return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);

castLocalOrPrivateToFlat(Dst);
MI.eraseFromParent();

Register BuildPtr = castLocalOrPrivateToFlat(DstTy);

auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
                      SegmentNull.getReg(0));

B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
MI.eraseFromParent();

    SrcTy.getSizeInBits() == 64) {
  B.buildExtract(Dst, Src, 0);
  MI.eraseFromParent();

uint32_t AddrHiVal = Info->get32BitAddressHighBits();
auto PtrLo = B.buildPtrToInt(S32, Src);
auto HighAddr = B.buildConstant(S32, AddrHiVal);
B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
MI.eraseFromParent();
MI.eraseFromParent();

LLT Ty = MRI.getType(Src);
assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

auto C1 = B.buildFConstant(Ty, C1Val);
auto CopySign = B.buildFCopysign(Ty, C1, Src);

auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

auto C2 = B.buildFConstant(Ty, C2Val);
auto Fabs = B.buildFAbs(Ty, Src);

B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
MI.eraseFromParent();

auto Trunc = B.buildIntrinsicTrunc(S64, Src);

const auto Zero = B.buildFConstant(S64, 0.0);
const auto One = B.buildFConstant(S64, 1.0);

auto And = B.buildAnd(S1, Lt0, NeTrunc);
auto Add = B.buildSelect(S64, And, One, Zero);

B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
MI.eraseFromParent();
Register Src0Reg = MI.getOperand(1).getReg();
Register Src1Reg = MI.getOperand(2).getReg();
auto Flags = MI.getFlags();
LLT Ty = MRI.getType(DstReg);

auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
auto Neg = B.buildFNeg(Ty, Trunc, Flags);
B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
MI.eraseFromParent();

const unsigned FractBits = 52;
const unsigned ExpBits = 11;

auto Const0 = B.buildConstant(S32, FractBits - 32);
auto Const1 = B.buildConstant(S32, ExpBits);

auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
                   .addUse(Const0.getReg(0))
                   .addUse(Const1.getReg(0));

return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
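// Extracts the 11-bit exponent field of an f64 with amdgcn.ubfe and
// subtracts the bias (1023) to get the unbiased exponent.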
auto Unmerge = B.buildUnmerge({S32, S32}, Src);

const unsigned FractBits = 52;

const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

const auto Zero32 = B.buildConstant(S32, 0);

auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});

auto Shr = B.buildAShr(S64, FractMask, Exp);
auto Not = B.buildNot(S64, Shr);
auto Tmp0 = B.buildAnd(S64, Src, Not);
auto FiftyOne = B.buildConstant(S32, FractBits - 1);

auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
MI.eraseFromParent();
auto Unmerge = B.buildUnmerge({S32, S32}, Src);
auto ThirtyTwo = B.buildConstant(S32, 32);

if (MRI.getType(Dst) == S64) {
  auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                      : B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
  auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);

  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();

auto One = B.buildConstant(S32, 1);

auto ThirtyOne = B.buildConstant(S32, 31);
auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
              .addUse(Unmerge.getReg(1));
auto LS2 = B.buildSub(S32, LS, One);
ShAmt = B.buildUMin(S32, LS2, MaxShAmt);

ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));

auto Norm = B.buildShl(S64, Src, ShAmt);
auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
B.buildFLdexp(Dst, FVal, Scale);
MI.eraseFromParent();
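// 64-bit integer to f32: the value is normalized with a left shift (using
// sffbh/ctlz to find the shift amount), the top 32 bits are converted, and
// the result is rescaled with ldexp to undo the normalization.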
const LLT SrcLT = MRI.getType(Src);
unsigned Flags = MI.getFlags();

auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);

Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
Trunc = B.buildFAbs(S32, Trunc, Flags);

K0 = B.buildFConstant(
K1 = B.buildFConstant(
K0 = B.buildFConstant(
K1 = B.buildFConstant(

auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

    : B.buildFPTOUI(S32, FloorMul);
auto Lo = B.buildFPTOUI(S32, Fma);

Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});

B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),

B.buildMergeLikeInstr(Dst, {Lo, Hi});
MI.eraseFromParent();
const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                      MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM ||
    MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM)

LLT VecTy = MRI.getType(Vec);

auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
B.buildIntToPtr(Dst, IntElt);
MI.eraseFromParent();

std::optional<ValueAndVReg> MaybeIdxVal =
const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

auto Unmerge = B.buildUnmerge(EltTy, Vec);
B.buildCopy(Dst, Unmerge.getReg(IdxVal));
MI.eraseFromParent();

LLT VecTy = MRI.getType(Vec);

auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
auto IntIns = B.buildPtrToInt(IntTy, Ins);
auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
B.buildIntToPtr(Dst, IntVecDest);
MI.eraseFromParent();

std::optional<ValueAndVReg> MaybeIdxVal =
const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

if (IdxVal < NumElts) {
  for (unsigned i = 0; i < NumElts; ++i)
    SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
  B.buildUnmerge(SrcRegs, Vec);
  SrcRegs[IdxVal] = MI.getOperand(2).getReg();
  B.buildMergeLikeInstr(Dst, SrcRegs);
MI.eraseFromParent();

LLT Ty = MRI.getType(DstReg);
unsigned Flags = MI.getFlags();

if (ST.hasTrigReducedRange()) {
  auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
  TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
                .addUse(MulVal.getReg(0))
TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
MI.eraseFromParent();
unsigned GAFlags) const {

B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

if (ST.has64BitLiterals()) {
  B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
  B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);

if (!B.getMRI()->getRegClassOrNull(PCReg))
  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

B.buildExtract(DstReg, PCReg, 0);

if (RequiresHighHalf && ST.has64BitLiterals()) {
  if (!MRI.getRegClassOrNull(DstReg))
    MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
  B.buildInstr(AMDGPU::S_MOV_B64)

Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
                      : MRI.createGenericVirtualRegister(S32);

if (!MRI.getRegClassOrNull(AddrLo))
  MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);

B.buildInstr(AMDGPU::S_MOV_B32)

if (RequiresHighHalf) {
         "Must provide a 64-bit pointer type!");
  MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_MOV_B32)

  if (!MRI.getRegClassOrNull(AddrDst))
    MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
  B.buildMergeValues(AddrDst, {AddrLo, AddrHi});

  if (AddrDst != DstReg)
    B.buildCast(DstReg, AddrDst);
} else if (AddrLo != DstReg) {
  B.buildCast(DstReg, AddrLo);
LLT Ty = MRI.getType(DstReg);
unsigned AS = Ty.getAddressSpace();

GV->getName() != "llvm.amdgcn.module.lds" &&
    Fn, "local memory global used by non-kernel function",

B.buildUndef(DstReg);
MI.eraseFromParent();

if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
  auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
  B.buildIntToPtr(DstReg, Sz);
  MI.eraseFromParent();

MI.eraseFromParent();

if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
  MI.eraseFromParent();

MI.eraseFromParent();
MI.eraseFromParent();

Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

if (Ty.getSizeInBits() == 32) {
  auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
  B.buildExtract(DstReg, Load, 0);
  B.buildLoad(DstReg, GOTAddr, *GOTMMO);
MI.eraseFromParent();

LLT PtrTy = MRI.getType(PtrReg);

auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
MI.getOperand(1).setReg(Cast.getReg(0));

if (MI.getOpcode() != AMDGPU::G_LOAD)

LLT ValTy = MRI.getType(ValReg);
const unsigned ValSize = ValTy.getSizeInBits();

if (WideMemSize == ValSize) {
  MI.setMemRefs(MF, {WideMMO});

if (ValSize > WideMemSize)

WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
B.buildTrunc(ValReg, WideLoad).getReg(0);

WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
B.buildExtract(ValReg, WideLoad, 0);

WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
B.buildDeleteTrailingVectorElements(ValReg, WideLoad);

MI.eraseFromParent();
Register DataReg = MI.getOperand(0).getReg();
LLT DataTy = MRI.getType(DataReg);

LLT Ty = MRI.getType(MI.getOperand(0).getReg());

       "this should not have been custom lowered");
LLT ValTy = MRI.getType(CmpVal);

Register PackedVal = B.buildBuildVector(VecTy, {NewVal, CmpVal}).getReg(0);

B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .setMemRefs(MI.memoperands());
MI.eraseFromParent();

switch (DefMI->getOpcode()) {
case TargetOpcode::G_INTRINSIC: {
  case Intrinsic::amdgcn_frexp_mant:
case TargetOpcode::G_FFREXP: {
  if (DefMI->getOperand(0).getReg() == Src)
case TargetOpcode::G_FPEXT: {

std::pair<Register, Register>
                                       unsigned Flags) const {

auto SmallestNormal = B.buildFConstant(
auto IsLtSmallestNormal =

auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
auto One = B.buildFConstant(F32, 1.0);
    B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);

return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
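// Inputs below the smallest normal f32 are pre-scaled by 2^32 so the
// hardware log intrinsic sees a normal value; the returned flag lets the
// caller subtract the corresponding offset from the final result.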
LLT Ty = B.getMRI()->getType(Dst);
unsigned Flags = MI.getFlags();

auto Ext = B.buildFPExt(F32, Src, Flags);
auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
                .addUse(Ext.getReg(0))
B.buildFPTrunc(Dst, Log2, Flags);
MI.eraseFromParent();

B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
MI.eraseFromParent();

auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                .addUse(ScaledInput)

auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
auto Zero = B.buildFConstant(Ty, 0.0);
    B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
B.buildFSub(Dst, Log2, ResultOffset, Flags);

MI.eraseFromParent();

auto FMul = B.buildFMul(Ty, X, Y, Flags);
return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);

const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

unsigned Flags = MI.getFlags();
const LLT Ty = MRI.getType(X);

if (Ty == F16 && !ST.has16BitInsts()) {
  auto PromoteSrc = B.buildFPExt(F32, X);
  B.buildFPTrunc(Dst, LogVal);
  MI.eraseFromParent();

B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
if (ST.hasFastFMAF32()) {
  const float c_log10 = 0x1.344134p-2f;
  const float cc_log10 = 0x1.09f79ep-26f;

  const float c_log = 0x1.62e42ep-1f;
  const float cc_log = 0x1.efa39ep-25f;

  auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
  auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);

  R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
  auto NegR = B.buildFNeg(Ty, R, Flags);
  auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
  auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
  R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);

  const float ch_log10 = 0x1.344000p-2f;
  const float ct_log10 = 0x1.3509f6p-18f;

  const float ch_log = 0x1.62e000p-1f;
  const float ct_log = 0x1.0bfbe8p-15f;

  auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
  auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);

  auto MaskConst = B.buildConstant(Ty, 0xfffff000);
  auto YH = B.buildAnd(Ty, Y, MaskConst);
  auto YT = B.buildFSub(Ty, Y, YH, Flags);
  auto YTCT = B.buildFMul(Ty, YT, CT, Flags);

      getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
  R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);

const bool IsFiniteOnly =
if (!IsFiniteOnly) {
  auto Fabs = B.buildFAbs(Ty, Y);
  R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);

auto Zero = B.buildFConstant(Ty, 0.0);
    B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
B.buildFSub(Dst, R, Shift, Flags);
B.buildCopy(Dst, R);

MI.eraseFromParent();
unsigned Flags) const {
const double Log2BaseInverted =

LLT Ty = B.getMRI()->getType(Dst);

auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
auto Zero = B.buildFConstant(Ty, 0.0);
    B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);

if (ST.hasFastFMAF32())
  B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
B.buildFAdd(Dst, Mul, ResultOffset, Flags);

    ? B.buildFLog2(Ty, Src, Flags)
    : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);

unsigned Flags = MI.getFlags();
LLT Ty = B.getMRI()->getType(Dst);

auto Ext = B.buildFPExt(F32, Src, Flags);
auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
                .addUse(Ext.getReg(0))
B.buildFPTrunc(Dst, Log2, Flags);
MI.eraseFromParent();

MI.eraseFromParent();

auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
                           RangeCheckConst, Flags);

auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
auto Zero = B.buildFConstant(Ty, 0.0);
auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);

auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                .addUse(AddInput.getReg(0))

auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
auto One = B.buildFConstant(Ty, 1.0);
auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
B.buildFMul(Dst, Exp2, ResultScale, Flags);
MI.eraseFromParent();
LLT Ty = B.getMRI()->getType(Dst);

auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
    .addUse(Mul.getReg(0))
B.buildFExp2(Dst, Mul.getReg(0), Flags);

auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);

auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);

auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                .addUse(ExpInput.getReg(0))

auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);

const unsigned Flags = MI.getFlags();
LLT Ty = MRI.getType(Dst);

const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
MI.eraseFromParent();

auto Ext = B.buildFPExt(F32, X, Flags);
B.buildFPTrunc(Dst, Lowered, Flags);
MI.eraseFromParent();

MI.eraseFromParent();
const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;

if (ST.hasFastFMAF32()) {
  const float cc_exp = 0x1.4ae0bep-26f;
  const float c_exp10 = 0x1.a934f0p+1f;
  const float cc_exp10 = 0x1.2f346ep-24f;

  auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
  PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
  auto NegPH = B.buildFNeg(Ty, PH, Flags);
  auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);

  auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
  PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);

  const float ch_exp = 0x1.714000p+0f;
  const float cl_exp = 0x1.47652ap-12f;

  const float ch_exp10 = 0x1.a92000p+1f;
  const float cl_exp10 = 0x1.4f0978p-11f;

  auto MaskConst = B.buildConstant(Ty, 0xfffff000);
  auto XH = B.buildAnd(Ty, X, MaskConst);
  auto XL = B.buildFSub(Ty, X, XH, Flags);

  auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
  PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);

  auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
  auto XLCL = B.buildFMul(Ty, XL, CL, Flags);

      getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
  PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);

auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);

auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);

auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                .addUse(A.getReg(0))
auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);

auto UnderflowCheckConst =
    B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
auto Zero = B.buildFConstant(Ty, 0.0);

R = B.buildSelect(Ty, Underflow, Zero, R);

auto OverflowCheckConst =
    B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);

R = B.buildSelect(Ty, Overflow, Inf, R, Flags);

B.buildCopy(Dst, R);
MI.eraseFromParent();
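// The exp/exp10 expansion splits x * log2(e) (or x * log2(10)) into a high
// and low part, evaluates exp2 on the reduced argument, rescales with ldexp
// by the rounded integer part, and finally clamps the result to 0 on
// underflow and +inf on overflow.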
unsigned Flags = MI.getFlags();
LLT Ty = B.getMRI()->getType(Dst);

auto Log = B.buildFLog2(F32, Src0, Flags);
auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
               .addUse(Log.getReg(0))
B.buildFExp2(Dst, Mul, Flags);
} else if (Ty == F16) {
  auto Log = B.buildFLog2(F16, Src0, Flags);
  auto Ext0 = B.buildFPExt(F32, Log, Flags);
  auto Ext1 = B.buildFPExt(F32, Src1, Flags);
  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                 .addUse(Ext0.getReg(0))
                 .addUse(Ext1.getReg(0))
  B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);

MI.eraseFromParent();

ModSrc = SrcFNeg->getOperand(1).getReg();
ModSrc = SrcFAbs->getOperand(1).getReg();
ModSrc = SrcFAbs->getOperand(1).getReg();

Register OrigSrc = MI.getOperand(1).getReg();
unsigned Flags = MI.getFlags();
       "this should not have been custom lowered");

auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})

B.buildFMinNumIEEE(Min, Fract, Const, Flags);
B.buildFMinNum(Min, Fract, Const, Flags);

CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);

auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

MI.eraseFromParent();
if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
  Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
  Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);

auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
B.buildBitcast(Dst, Merge);

MI.eraseFromParent();

bool UsePartialMad64_32,
bool SeparateOddAlignedProducts) const {

auto getZero32 = [&]() -> Register {
  Zero32 = B.buildConstant(S32, 0).getReg(0);
auto getZero64 = [&]() -> Register {
  Zero64 = B.buildConstant(S64, 0).getReg(0);

for (unsigned i = 0; i < Src0.size(); ++i) {

if (CarryIn.empty())

bool HaveCarryOut = true;
if (CarryIn.size() == 1) {
  LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);

  CarryAccum = getZero32();
  CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
  for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
    B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])

  LocalAccum = getZero32();
  HaveCarryOut = false;

    B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
LocalAccum = Add.getReg(0);
auto buildMadChain =
  assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
         (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

  if (LocalAccum.size() == 1 &&
      (!UsePartialMad64_32 || !CarryIn.empty())) {
      unsigned j1 = DstIndex - j0;
      if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {

      auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
        LocalAccum[0] = Mul.getReg(0);
        if (CarryIn.empty()) {
          LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
              B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
    } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));

  if (j0 <= DstIndex) {
    bool HaveSmallAccum = false;
    if (LocalAccum[0]) {
      if (LocalAccum.size() == 1) {
        Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
        HaveSmallAccum = true;
      } else if (LocalAccum[1]) {
        Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
        HaveSmallAccum = false;
        Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
        HaveSmallAccum = true;
      assert(LocalAccum.size() == 1 || !LocalAccum[1]);
      HaveSmallAccum = true;

      unsigned j1 = DstIndex - j0;
      if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
      auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                              {Src0[j0], Src1[j1], Tmp});
      Tmp = Mad.getReg(0);
      if (!HaveSmallAccum)
        CarryOut.push_back(Mad.getReg(1));
      HaveSmallAccum = false;
    } while (j0 <= DstIndex);

    auto Unmerge = B.buildUnmerge(S32, Tmp);
    LocalAccum[0] = Unmerge.getReg(0);
    if (LocalAccum.size() > 1)
      LocalAccum[1] = Unmerge.getReg(1);
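// buildMadChain accumulates the partial products that land in one 32-bit
// column of the long multiplication, preferring G_AMDGPU_MAD_U64_U32
// (32x32 multiply with 64-bit accumulate) and falling back to mul/add-with-
// carry when only a 32-bit accumulator is live.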
for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
  Carry OddCarryIn = std::move(OddCarry);
  Carry EvenCarryIn = std::move(EvenCarry);

  if (2 * i < Accum.size()) {
    auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
    EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);

  if (!SeparateOddAlignedProducts) {
    auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
    OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
    bool IsHighest = 2 * i >= Accum.size();
        .take_front(IsHighest ? 1 : 2);
    OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

    Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
    Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
    Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
    Accum[2 * i - 1] = Lo->getOperand(0).getReg();

    auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                           Lo->getOperand(1).getReg());
    Accum[2 * i] = Hi.getReg(0);
    SeparateOddCarry = Hi.getReg(1);

  if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
    EvenCarryIn.push_back(CarryOut);

  if (2 * i < Accum.size()) {
    if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
      OddCarry.push_back(CarryOut);

assert(ST.hasMad64_32());
assert(MI.getOpcode() == TargetOpcode::G_MUL);

LLT Ty = MRI.getType(DstReg);
unsigned Size = Ty.getSizeInBits();
if (ST.hasVectorMulU64() && Size == 64)

unsigned NumParts = Size / 32;

const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();

for (unsigned i = 0; i < NumParts; ++i) {

B.buildUnmerge(Src0Parts, Src0);
B.buildUnmerge(Src1Parts, Src1);

buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
              SeparateOddAlignedProducts);

B.buildMergeLikeInstr(DstReg, AccumRegs);
MI.eraseFromParent();
LLT DstTy = MRI.getType(Dst);
LLT SrcTy = MRI.getType(Src);

unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                      ? AMDGPU::G_AMDGPU_FFBH_U32
                      : AMDGPU::G_AMDGPU_FFBL_B32;
auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
MI.eraseFromParent();

LLT SrcTy = MRI.getType(Src);
TypeSize NumBits = SrcTy.getSizeInBits();

auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
auto Shift = B.buildShl(S32, Extend, ShiftAmt);
auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
B.buildTrunc(Dst, Ctlz);
MI.eraseFromParent();

if (MI.getOpcode() != TargetOpcode::G_XOR)
return ConstVal == -1;

Register CondDef = MI.getOperand(0).getReg();
if (!MRI.hasOneNonDBGUse(CondDef))

if (!MRI.hasOneNonDBGUse(NegatedCond))

UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);

if (UseMI->getParent() != Parent ||
    UseMI->getOpcode() != AMDGPU::G_BRCOND)

UncondBrTarget = &*NextMBB;

if (Next->getOpcode() != AMDGPU::G_BR)

*ArgRC, B.getDebugLoc(), ArgTy);

const unsigned Mask = Arg->getMask();

auto ShiftAmt = B.buildConstant(S32, Shift);
AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);

B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
B.buildCopy(DstReg, LiveIn);
if (!ST.hasClusters()) {
  MI.eraseFromParent();

Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);

auto One = B.buildConstant(S32, 1);
auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
                              B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));

B.buildCopy(DstReg, GlobalIdXYZ);
MI.eraseFromParent();

B.buildCopy(DstReg, ClusterIdXYZ);
MI.eraseFromParent();

unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
B.buildInstr(AMDGPU::S_GETREG_B32_const)
    .addImm(ClusterIdField);
auto Zero = B.buildConstant(S32, 0);
B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
MI.eraseFromParent();
auto LoadConstant = [&](unsigned N) {
  B.buildConstant(DstReg, N);

if (ST.hasArchitectedSGPRs() &&

  Arg = &WorkGroupIDX;
  ArgRC = &AMDGPU::SReg_32RegClass;

  Arg = &WorkGroupIDY;
  ArgRC = &AMDGPU::SReg_32RegClass;

  Arg = &WorkGroupIDZ;
  ArgRC = &AMDGPU::SReg_32RegClass;

  if (HasFixedDims && ClusterDims.getDims()[0] == 1)
    return LoadConstant(0);
  Arg = &ClusterWorkGroupIDX;
  ArgRC = &AMDGPU::SReg_32RegClass;

  if (HasFixedDims && ClusterDims.getDims()[1] == 1)
    return LoadConstant(0);
  Arg = &ClusterWorkGroupIDY;
  ArgRC = &AMDGPU::SReg_32RegClass;

  if (HasFixedDims && ClusterDims.getDims()[2] == 1)
    return LoadConstant(0);
  Arg = &ClusterWorkGroupIDZ;
  ArgRC = &AMDGPU::SReg_32RegClass;

    return LoadConstant(ClusterDims.getDims()[0] - 1);
  Arg = &ClusterWorkGroupMaxIDX;
  ArgRC = &AMDGPU::SReg_32RegClass;

    return LoadConstant(ClusterDims.getDims()[1] - 1);
  Arg = &ClusterWorkGroupMaxIDY;
  ArgRC = &AMDGPU::SReg_32RegClass;

    return LoadConstant(ClusterDims.getDims()[2] - 1);
  Arg = &ClusterWorkGroupMaxIDZ;
  ArgRC = &AMDGPU::SReg_32RegClass;

  Arg = &ClusterWorkGroupMaxFlatID;
  ArgRC = &AMDGPU::SReg_32RegClass;

  return LoadConstant(0);

B.buildUndef(DstReg);

if (!Arg->isRegister() || !Arg->getRegister().isValid())

MI.eraseFromParent();

B.buildConstant(MI.getOperand(0).getReg(), C);
MI.eraseFromParent();

unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);

B.buildUndef(DstReg);
MI.eraseFromParent();

if (Arg->isMasked()) {

MI.eraseFromParent();

Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);

Align Alignment) const {
       "unexpected kernarg parameter type");

B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
MI.eraseFromParent();
LLT DstTy = MRI.getType(Dst);

auto FloatY = B.buildUITOFP(S32, Y);
auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
auto Z = B.buildFPTOUI(S32, ScaledY);

auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
auto NegYZ = B.buildMul(S32, NegY, Z);
Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

auto Q = B.buildUMulH(S32, X, Z);
auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

auto One = B.buildConstant(S32, 1);
Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);

B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
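// 32-bit unsigned division by multiplication: an approximate reciprocal is
// computed with RCP_IFLAG, refined with one Newton-Raphson style correction,
// and the quotient/remainder are then fixed up with conditional +1 / -Y
// adjustments.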
auto Unmerge = B.buildUnmerge(S32, Val);

auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

auto Mad = B.buildFMAD(

auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
auto Mul1 = B.buildFMul(

auto Mul2 = B.buildFMul(
auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

auto Mad2 = B.buildFMAD(

auto ResultLo = B.buildFPTOUI(S32, Mad2);
auto ResultHi = B.buildFPTOUI(S32, Trunc);

return {ResultLo.getReg(0), ResultHi.getReg(0)};
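// Helper for the 64-bit division expansion: converts the two halves of a
// 64-bit value to float, forms an approximate reciprocal, and returns it as
// a {low, high} pair of 32-bit registers.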
auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});

auto Zero64 = B.buildConstant(S64, 0);
auto NegDenom = B.buildSub(S64, Zero64, Denom);

auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});

auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

auto Zero32 = B.buildConstant(S32, 0);
auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});

auto UnmergeNumer = B.buildUnmerge(S32, Numer);
Register NumerLo = UnmergeNumer.getReg(0);
Register NumerHi = UnmergeNumer.getReg(1);

auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
auto Mul3 = B.buildMul(S64, Denom, MulHi3);
auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
Register Mul3_Lo = UnmergeMul3.getReg(0);
Register Mul3_Hi = UnmergeMul3.getReg(1);
auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});

auto UnmergeDenom = B.buildUnmerge(S32, Denom);
Register DenomLo = UnmergeDenom.getReg(0);
Register DenomHi = UnmergeDenom.getReg(1);

auto C1 = B.buildSExt(S32, CmpHi);
auto C2 = B.buildSExt(S32, CmpLo);
auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});

auto One64 = B.buildConstant(S64, 1);
auto Add3 = B.buildAdd(S64, MulHi3, One64);

auto C6 = B.buildSelect(

auto Add4 = B.buildAdd(S64, Add3, One64);
auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
auto Sel1 = B.buildSelect(
auto Sel2 = B.buildSelect(

switch (MI.getOpcode()) {
case AMDGPU::G_UDIV: {
  DstDivReg = MI.getOperand(0).getReg();
case AMDGPU::G_UREM: {
  DstRemReg = MI.getOperand(0).getReg();
case AMDGPU::G_UDIVREM: {
  DstDivReg = MI.getOperand(0).getReg();
  DstRemReg = MI.getOperand(1).getReg();

const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
LLT Ty = MRI.getType(MI.getOperand(0).getReg());

MI.eraseFromParent();

LLT Ty = MRI.getType(MI.getOperand(0).getReg());
if (Ty != S32 && Ty != S64)

const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();

auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
5032 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5033 switch (
MI.getOpcode()) {
5036 case AMDGPU::G_SDIV: {
5037 DstDivReg =
MI.getOperand(0).getReg();
5038 TmpDivReg =
MRI.createGenericVirtualRegister(Ty);
5041 case AMDGPU::G_SREM: {
5042 DstRemReg =
MI.getOperand(0).getReg();
5043 TmpRemReg =
MRI.createGenericVirtualRegister(Ty);
5046 case AMDGPU::G_SDIVREM: {
5047 DstDivReg =
MI.getOperand(0).getReg();
5048 DstRemReg =
MI.getOperand(1).getReg();
5049 TmpDivReg =
MRI.createGenericVirtualRegister(Ty);
5050 TmpRemReg =
MRI.createGenericVirtualRegister(Ty);
5061 auto Sign =
B.buildXor(Ty, LHSign, RHSign).getReg(0);
5062 auto SignXor =
B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5063 B.buildSub(DstDivReg, SignXor, Sign);
5067 auto Sign = LHSign.getReg(0);
5068 auto SignXor =
B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5069 B.buildSub(DstRemReg, SignXor, Sign);
5072 MI.eraseFromParent();
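// --- Host-side sketch of the sign handling above (illustrative only): take
// absolute values with the shift/add/xor trick, divide unsigned, then apply
// the quotient sign with xor/subtract; the remainder keeps the dividend's
// sign. Unsigned arithmetic avoids signed-overflow UB for INT32_MIN.
#include <cassert>
#include <cstdint>

static void sketchSDiv32(int32_t LHS, int32_t RHS, int32_t &Div, int32_t &Rem) {
  assert(RHS != 0);
  uint32_t LSign = (uint32_t)(LHS >> 31);          // 0 or 0xffffffff
  uint32_t RSign = (uint32_t)(RHS >> 31);
  uint32_t UL = ((uint32_t)LHS + LSign) ^ LSign;   // |LHS|
  uint32_t UR = ((uint32_t)RHS + RSign) ^ RSign;   // |RHS|
  uint32_t QSign = LSign ^ RSign;                  // quotient sign
  Div = (int32_t)(((UL / UR) ^ QSign) - QSign);
  Rem = (int32_t)(((UL % UR) ^ LSign) - LSign);
}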
  LLT ResTy = MRI.getType(Res);
  if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
    return false;

    // 1.0 / x -> rcp(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
      MI.eraseFromParent();
      return true;
    }

    // -1.0 / x -> rcp(fneg x)
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
          .addUse(FNeg.getReg(0))
      MI.eraseFromParent();
      return true;
    }

  // x / y -> x * (1.0 / y)
  if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
  B.buildFMul(Res, LHS, RCP, Flags);
  MI.eraseFromParent();
  LLT ResTy = MRI.getType(Res);
  if (!AllowInaccurateRcp)
    return false;

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);
  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);
  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);
  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
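// --- Host sketch of the refinement ladder above (not the builder code): start
// from an approximate reciprocal, run two Newton-Raphson steps
// r = r + r*(1 - y*r), form the quotient, then correct it once with the
// residual. 1.0/Y stands in for the hardware rcp estimate.
#include <cmath>

static double sketchFastDiv(double X, double Y) {
  double R = 1.0 / Y;                     // stand-in for amdgcn.rcp
  double NegY = -Y;
  double Tmp0 = std::fma(NegY, R, 1.0);   // 1 - Y*R
  R = std::fma(Tmp0, R, R);               // refined reciprocal
  double Tmp1 = std::fma(NegY, R, 1.0);
  R = std::fma(Tmp1, R, R);
  double Ret = X * R;                     // initial quotient
  double Tmp2 = std::fma(NegY, Ret, X);   // residual X - Y*Ret
  return std::fma(Tmp2, R, Ret);          // corrected quotient
}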
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);
  auto NegRHSExt = B.buildFNeg(S32, RHSExt);
  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(RHSExt.getReg(0))
  auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
  if (ST.hasMadMacF32Insts()) {
    Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
    Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
    Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
  } else {
    Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
    Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
    Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
  }
  auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
  Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
  Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
  auto RDst = B.buildFPTrunc(S16, Quot, Flags);
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RDst.getReg(0))
  MI.eraseFromParent();
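// --- Illustrative host sketch of the f16 strategy above, with float standing
// in for f16 and double for f32 (not the in-tree code): divide in the wider
// type, fold in one residual correction, and round to the narrow type only
// once at the end.
#include <cmath>

static float sketchNarrowDiv(float LHS, float RHS) {
  double LHSExt = LHS, RHSExt = RHS;
  double Rcp = 1.0 / RHSExt;                    // approximate reciprocal
  double Quot = LHSExt * Rcp;
  double Err = std::fma(-RHSExt, Quot, LHSExt); // residual
  Quot = std::fma(Err, Rcp, Quot);              // one correction step
  return (float)Quot;                           // single final rounding
}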
  unsigned SPDenormMode =
  if (ST.hasDenormModeInst()) {
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
        .addImm(NewDenormModeValue);
  } else {
    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
        .addImm(SPDenormMode)
  auto One = B.buildFConstant(S32, 1.0f);
  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                       .addUse(DenominatorScaled.getReg(0))
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
  const bool HasDynamicDenormals =
  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      B.buildInstr(AMDGPU::S_GETREG_B32)
          .addDef(SavedSPDenormMode)
  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      assert(SavedSPDenormMode);
      B.buildInstr(AMDGPU::S_SETREG_B32)
          .addReg(SavedSPDenormMode)
  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(Fmas.getReg(0))
  MI.eraseFromParent();
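// --- Host sketch of the Fma0..Fma4 ladder above (illustrative only; it
// ignores the div_scale / div_fmas machinery, which exists to keep
// intermediates out of the denormal range): refine the reciprocal estimate,
// form the quotient, and compute the final residual that div_fmas folds back
// in before div_fixup handles the special cases.
#include <cmath>

static float sketchRefineQuotient(float Num, float Den, float ApproxRcp) {
  float NegDen = -Den;
  float Fma0 = std::fma(NegDen, ApproxRcp, 1.0f);    // 1 - Den*Rcp
  float Fma1 = std::fma(Fma0, ApproxRcp, ApproxRcp); // better reciprocal
  float Mul  = Num * Fma1;                           // quotient estimate
  float Fma2 = std::fma(NegDen, Mul, Num);           // residual
  float Fma3 = std::fma(Fma2, Fma1, Mul);            // corrected quotient
  float Fma4 = std::fma(NegDen, Fma3, Num);          // final residual
  return std::fma(Fma4, Fma1, Fma3);
}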
  auto One = B.buildFConstant(S64, 1.0);
  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
                 .addUse(DivScale0.getReg(0))
  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
  if (!ST.hasUsableDivScaleConditionOutput()) {
    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
                              Scale1Unmerge.getReg(1));
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }
  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(Mul.getReg(0))
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
      .addUse(Fmas.getReg(0))
  MI.eraseFromParent();
  LLT Ty = MRI.getType(Res0);
  auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
  auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
  if (ST.hasFractBug()) {
    auto Fabs = B.buildFAbs(Ty, Val);
    auto Zero = B.buildConstant(InstrExpTy, 0);
    Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
    Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
  }

  B.buildCopy(Res0, Mant);
  B.buildSExtOrTrunc(Res1, Exp);
  MI.eraseFromParent();
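// --- Host model of the lowering above (illustrative): frexp_mant/frexp_exp
// behave like std::frexp for finite inputs; on subtargets with the fract bug
// the guarded selects return (Val, 0) for inf/nan, which this sketch mirrors.
#include <cmath>
#include <utility>

static std::pair<double, int> sketchFrexp(double Val) {
  if (!std::isfinite(Val))
    return {Val, 0};           // mirrors the IsFinite selects
  int Exp = 0;
  double Mant = std::frexp(Val, &Exp);
  return {Mant, Exp};
}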
  auto Abs = B.buildFAbs(S32, RHS, Flags);
  auto C0 = B.buildFConstant(S32, 0x1p+96f);
  auto C1 = B.buildFConstant(S32, 0x1p-32f);
  auto C2 = B.buildFConstant(S32, 1.0f);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(Mul0.getReg(0))
  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
  B.buildFMul(Res, Sel, Mul1, Flags);
  MI.eraseFromParent();
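// --- Host sketch of amdgcn.fdiv.fast's range fix-up above (illustrative): if
// the denominator is huge (|den| > 2^96), pre-scale it by 2^-32 so the
// reciprocal stays representable, then apply the same factor to the product.
#include <cmath>

static float sketchFDivFast(float LHS, float RHS) {
  float Sel = (std::fabs(RHS) > 0x1p+96f) ? 0x1p-32f : 1.0f;
  float Rcp = 1.0f / (RHS * Sel);   // stand-in for the rcp estimate
  return Sel * (LHS * Rcp);
}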
  unsigned Flags = MI.getFlags();
  assert(!ST.has16BitInsts());
  auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
                  .addUse(Ext.getReg(0))
  B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
  MI.eraseFromParent();
  const unsigned Flags = MI.getFlags();
    MI.eraseFromParent();

  auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
  auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
  auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
  auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
        .addUse(SqrtX.getReg(0))
    auto NegOne = B.buildConstant(I32, -1);
    auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
    auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
    auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
    auto PosOne = B.buildConstant(I32, 1);
    auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
    auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
    auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
    auto Zero = B.buildFConstant(F32, 0.0f);
        B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
        B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
        B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
    B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
    auto Half = B.buildFConstant(F32, 0.5f);
    auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
    auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
    auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
    SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
    SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
    auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
    auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
    SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
  auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
  auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
  SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
  MI.eraseFromParent();
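// --- Host sketch of the f32 path above (illustrative, not the builder code):
// scale tiny inputs up by 2^32 so rsq stays accurate, run one Goldschmidt-
// style refinement (the SqrtH/SqrtS/SqrtE/SqrtD chain), then scale the result
// back down by 2^16, which is sqrt(2^32).
#include <cmath>

static float sketchSqrtF32(float X) {
  bool NeedScale = X < 0x1.0p-96f;
  float SqrtX = NeedScale ? X * 0x1.0p+32f : X;
  float SqrtR = 1.0f / std::sqrt(SqrtX);        // stand-in for amdgcn.rsq
  float SqrtS = SqrtX * SqrtR;
  float SqrtH = SqrtR * 0.5f;
  float SqrtE = std::fma(-SqrtH, SqrtS, 0.5f);
  SqrtH = std::fma(SqrtH, SqrtE, SqrtH);
  SqrtS = std::fma(SqrtS, SqrtE, SqrtS);
  float SqrtD = std::fma(-SqrtS, SqrtS, SqrtX); // residual X - S*S
  SqrtS = std::fma(SqrtD, SqrtH, SqrtS);
  if (NeedScale)
    SqrtS *= 0x1.0p-16f;
  return (X == 0.0f || std::isinf(X)) ? X : SqrtS;
}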
  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
  unsigned Flags = MI.getFlags();
  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
  auto ZeroInt = B.buildConstant(S32, 0);
  auto ScaleUpFactor = B.buildConstant(S32, 256);
  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
  auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
  auto Half = B.buildFConstant(F64, 0.5);
  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
  auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
  auto ScaleDownFactor = B.buildConstant(S32, -128);
  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
  MI.eraseFromParent();
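// --- Host sketch of the f64 scaling above (illustrative): ldexp by +256 keeps
// denormal inputs in range for rsq, and ldexp by -128 (half of 256, since a
// square root halves the exponent) undoes it on the result.
#include <cmath>

static double sketchSqrtScaled(double X) {
  bool Scaling = X < 0x1.0p-767;
  double SqrtX = std::ldexp(X, Scaling ? 256 : 0);
  double R = std::sqrt(SqrtX);          // stands in for the refined estimate
  return std::ldexp(R, Scaling ? -128 : 0);
}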
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(Dst);
  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags)
                          : B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
    auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
      return LaneOp.getReg(0);
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
      return LaneOp.addUse(Src1).getReg(0);
    case Intrinsic::amdgcn_writelane:
      return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      int64_t Src4 = MI.getOperand(6).getImm();
      int64_t Src5 = MI.getOperand(7).getImm();
      return LaneOp.addUse(Src1)
    case Intrinsic::amdgcn_mov_dpp8:
      return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
    case Intrinsic::amdgcn_update_dpp:
      return LaneOp.addUse(Src1)
          .addImm(MI.getOperand(4).getImm())
          .addImm(MI.getOperand(5).getImm())
          .addImm(MI.getOperand(6).getImm())
          .addImm(MI.getOperand(7).getImm())
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = MI.getOperand(3).getReg();
    if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
      Src2 = MI.getOperand(4).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Size = Ty.getSizeInBits();
  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
      ST.hasDPALU_DPP() &&
  if (Size == SplitSize) {
      Src0 = B.buildAnyExt(S32, Src0).getReg(0);
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
    if (IID == Intrinsic::amdgcn_writelane)
    Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
    B.buildTrunc(DstReg, LaneOpDst);
    MI.eraseFromParent();
  if (Size % SplitSize != 0)
  bool NeedsBitcast = false;
  if (Ty.isVector()) {
    if (EltSize == SplitSize) {
      PartialResTy = EltTy;
    } else if (EltSize == 16 || EltSize == 32) {
      unsigned NElem = SplitSize / EltSize;
      NeedsBitcast = true;
  unsigned NumParts = Size / SplitSize;
  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
    Src1Parts = B.buildUnmerge(PartialResTy, Src1);
  if (IID == Intrinsic::amdgcn_writelane)
    Src2Parts = B.buildUnmerge(PartialResTy, Src2);
  for (unsigned i = 0; i < NumParts; ++i) {
    Src0 = Src0Parts.getReg(i);
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
      Src1 = Src1Parts.getReg(i);
    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = Src2Parts.getReg(i);
    PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
    B.buildBitcast(DstReg, B.buildMergeLikeInstr(
    B.buildMergeLikeInstr(DstReg, PartialRes);
  MI.eraseFromParent();
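// --- Host analogue of the splitting strategy above (illustrative): lane
// intrinsics only operate on 32-bit pieces, so a wider value is unmerged into
// dwords, the lane op is applied piecewise, and the parts are merged back.
#include <cstdint>
#include <functional>

static uint64_t sketchApplyPerDword(
    uint64_t Val, const std::function<uint32_t(uint32_t)> &LaneOp) {
  uint32_t Lo = LaneOp(static_cast<uint32_t>(Val));
  uint32_t Hi = LaneOp(static_cast<uint32_t>(Val >> 32));
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}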
      ST.getTargetLowering()->getImplicitParameterOffset(
  LLT DstTy = MRI.getType(DstReg);
  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  B.buildObjectPtrOffset(DstReg, KernargPtrReg,
                         B.buildConstant(IdxTy, Offset).getReg(0));

  Register Pointer = MI.getOperand(2).getReg();
  Register NumRecords = MI.getOperand(4).getReg();
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Unmerge = B.buildUnmerge(S32, Pointer);
  Register LowHalf = Unmerge.getReg(0);
  Register HighHalf = Unmerge.getReg(1);

  auto AndMask = B.buildConstant(S32, 0x0000ffff);
  auto Masked = B.buildAnd(S32, HighHalf, AndMask);
  std::optional<ValueAndVReg> StrideConst =
  if (!StrideConst || !StrideConst->Value.isZero()) {
    if (StrideConst) {
      uint32_t StrideVal = StrideConst->Value.getZExtValue();
      uint32_t ShiftedStrideVal = StrideVal << 16;
      ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
    } else {
      auto ExtStride = B.buildAnyExt(S32, Stride);
      auto ShiftConst = B.buildConstant(S32, 16);
      ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
    }
    NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
  }
  B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
  MI.eraseFromParent();
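// --- Host sketch of the descriptor packing above (illustrative): the second
// dword of a buffer resource carries the upper bits of the base address in
// its low half, with the stride field starting at bit 16, so the stride is
// shifted into place and OR'd over the masked high half.
#include <cstdint>

static uint32_t sketchPackHighDword(uint64_t BaseAddr, uint16_t Stride) {
  uint32_t HighHalf = static_cast<uint32_t>(BaseAddr >> 32);
  uint32_t Masked = HighHalf & 0x0000ffffu;   // keep only the address bits
  return Masked | (static_cast<uint32_t>(Stride) << 16);
}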
  MI.eraseFromParent();

  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())
    B.buildConstant(DstReg, *KnownSize);
  MI.eraseFromParent();

                                            unsigned AddrSpace) const {
  auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
      ST.hasGloballyAddressableScratch()) {
        B.buildInstr(AMDGPU::S_MOV_B32, {S32},
                     {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
    MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
    Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
                     B.buildConstant(S32, 1u << 26));
  MI.eraseFromParent();
std::pair<Register, unsigned>
          MRI, OrigOffset, nullptr, CheckNUW);
  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::pair(BaseReg, ImmOffset);
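// --- Host sketch of the buffer-offset split above (illustrative): keep as
// much of the constant as fits in the instruction's immediate field and push
// the rest into the register operand. MaxImm here is an assumed field width
// for illustration; the real limit is subtarget-specific.
#include <cstdint>
#include <utility>

static std::pair<uint32_t, uint32_t> sketchSplitOffset(uint32_t TotalOffset,
                                                       uint32_t MaxImm = 4095) {
  uint32_t ImmOffset = TotalOffset;
  uint32_t Overflow = ImmOffset & ~MaxImm;  // part that cannot be encoded
  ImmOffset -= Overflow;
  return {Overflow /* register part */, ImmOffset /* encoded immediate */};
}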
                                       bool ImageStore) const {
  LLT StoreVT = MRI.getType(Reg);
  if (ST.hasUnpackedD16VMem()) {
    auto Unmerge = B.buildUnmerge(S16, Reg);
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  if (ImageStore && ST.hasImageStoreD16Bug()) {
      Reg = B.buildBitcast(S32, Reg).getReg(0);
      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));

      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));

      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
                                                  bool IsFormat) const {
  LLT Ty = MRI->getType(VData);
    VData = B.buildBitcast(Ty, VData).getReg(0);
  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {

                                             bool IsFormat) const {
  LLT Ty = MRI.getType(VData);
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
    VIndex = MI.getOperand(3).getReg();
    VIndex = B.buildConstant(S32, 0).getReg(0);
  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
    Format = MI.getOperand(5 + OpOffset).getImm();
  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
  auto MIB = B.buildInstr(Opc)
  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);
  MI.eraseFromParent();
                            unsigned ImmOffset, unsigned Format,
  auto MIB = B.buildInstr(Opc)
  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);
                                            bool IsTyped) const {
  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
  bool IsTFE = MI.getNumExplicitDefs() == 2;
    StatusDst = MI.getOperand(1).getReg();
  Register RSrc = MI.getOperand(2 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
    VIndex = MI.getOperand(3 + OpOffset).getReg();
    VIndex = B.buildConstant(S32, 0).getReg(0);
  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
    Format = MI.getOperand(5 + OpOffset).getImm();
  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  LLT Ty = MRI.getType(Dst);
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD;
    unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
    unsigned NumLoadDWords = NumValueDWords + 1;
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
      Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
      B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
      B.buildTrunc(Dst, ExtDst);
    } else if (NumValueDWords == 1) {
      B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
      B.buildUnmerge(LoadElts, LoadDstReg);
      B.buildMergeLikeInstr(Dst, LoadElts);
             (IsD16 && !Ty.isVector())) {
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(Dst, LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
    B.buildMergeLikeInstr(Dst, Repack);
                    AuxiliaryData, MMO, IsTyped, HasVIndex, B);
  MI.eraseFromParent();
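// --- Host-level picture of the TFE path above (illustrative): a
// texture-fail-enable load returns one extra status dword after the value
// dwords, so the lowering loads NumValueDWords + 1 dwords and splits the last
// one off into the status destination. Sketch for a 2-dword value.
#include <array>
#include <cstdint>
#include <utility>

static std::pair<std::array<uint32_t, 2>, uint32_t>
sketchUnpackTFE(const std::array<uint32_t, 3> &LoadDWords) {
  std::array<uint32_t, 2> Value{LoadDWords[0], LoadDWords[1]};
  uint32_t Status = LoadDWords[2];   // the extra TFE dword
  return {Value, Status};
}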
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
  const bool IsCmpSwap =
      IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
    CmpVal = MI.getOperand(3).getReg();
  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
      .addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);
  MI.eraseFromParent();
                                      bool IsA16, bool IsG16) {
      (B.getMRI()->getType(AddrReg) == S16)) {
          B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
             "Bias needs to be converted to 16 bit in A16 mode");
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      if (((I + 1) >= EndIdx) ||
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})

                                        int DimIdx, int NumVAddrs) {
  for (int I = 0; I != NumVAddrs; ++I) {
    if (SrcOp.isReg()) {
  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  for (int I = 1; I != NumVAddrs; ++I) {
    MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
    VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
    Ty = MRI->getType(VData);
  const bool IsAtomicPacked16Bit =
      (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
       BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
      ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
  const bool IsA16 = AddrTy == S16;
  const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    } else if (DMask != 0) {
    } else if (!IsTFE && !BaseOpcode->Store) {
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = LoadOpcode;
  if (BaseOpcode->Store)
    NewOpcode = StoreOpcode;
    NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
  MI.setDesc(B.getTII().get(NewOpcode));
  if (IsTFE && DMask == 0) {
    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
  if (BaseOpcode->Atomic) {
    LLT Ty = MRI->getType(VData0);
    if (Ty.isVector() && !IsAtomicPacked16Bit)
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
  if (IsA16 && !ST.hasA16()) {
  const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
  const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
  if (IsA16 || IsG16) {
    const bool UseNSA = ST.hasNSAEncoding() &&
                        PackedRegs.size() >= ST.getNSAThreshold(MF) &&
                        (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
    if (UsePartialNSA) {
      auto Concat = B.buildConcatVectors(
          PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
      PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
      PackedRegs.resize(NSAMaxSize);
    } else if (!UseNSA && PackedRegs.size() > 1) {
      auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
      PackedRegs[0] = Concat.getReg(0);
    const unsigned NumPacked = PackedRegs.size();
      if (!SrcOp.isReg()) {
        SrcOp.setReg(AMDGPU::NoRegister);
    const bool UseNSA = ST.hasNSAEncoding() &&
                        CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
                        (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
    if (UsePartialNSA) {
          ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
    } else if (!UseNSA && Intr->NumVAddrs > 1) {
  if (!Ty.isVector() || !IsD16)
  if (RepackedReg != VData) {
    MI.getOperand(1).setReg(RepackedReg);
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
  if (NumElts < DMaskLanes)
  if (NumElts > 4 || DMaskLanes > 4)
  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
  if (IsD16 && ST.hasUnpackedD16VMem()) {
  unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
  unsigned RoundedSize = 32 * RoundedElts;
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
  MI.getOperand(0).setReg(NewResultReg);
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
    MI.removeOperand(1);
    B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
  if (ResultNumRegs == 1) {
    ResultRegs[0] = NewResultReg;
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);
    ResultRegs.resize(NumDataRegs);
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      Reg = B.buildTrunc(S16, Reg).getReg(0);
  auto padWithUndef = [&](LLT Ty, int NumElts) {
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
  LLT ResTy = MRI->getType(ResultRegs[0]);
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
  if (ResultRegs.size() == 1) {
    NewResultReg = ResultRegs[0];
  } else if (ResultRegs.size() == 2) {
    NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
  if (MRI->getType(DstReg).getNumElements() <
      MRI->getType(NewResultReg).getNumElements()) {
    B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
    B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
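// --- Host sketch of the size bookkeeping above (illustrative): the number of
// enabled dmask lanes decides how many data elements come back, and the
// loaded result is rounded up to whole 32-bit registers, plus one extra dword
// when TFE is enabled. Requires C++20 for std::popcount.
#include <bit>
#include <cstdint>

static unsigned sketchImageResultDWords(uint8_t DMask, unsigned EltSizeInBits,
                                        bool IsTFE) {
  unsigned DMaskLanes = std::popcount(DMask);
  unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  unsigned RoundedElts = (AdjustedNumElts * EltSizeInBits + 31) / 32;
  return RoundedElts + (IsTFE ? 1 : 0);
}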
  Register OrigDst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(OrigDst);
  unsigned Size = Ty.getSizeInBits();
  if (Size < 32 && ST.hasScalarSubwordLoads()) {
    Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
                    : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
    Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
    Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
    B.setInsertPt(B.getMBB(), MI);
    B.setInsertPt(B.getMBB(), MI);
  MI.setDesc(B.getTII().get(Opc));
  MI.removeOperand(1);
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign = B.getDataLayout().getABITypeAlign(
  MI.addMemOperand(MF, MMO);
  if (Dst != OrigDst) {
    MI.getOperand(0).setReg(Dst);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(OrigDst, Dst);

  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
  MI.removeOperand(0);
  if (!ST.isTrapHandlerEnabled() ||
  return ST.supportsGetDoorbellID() ?
  MI.eraseFromParent();

  BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
  BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
  MI.eraseFromParent();

  Register SGPR01(AMDGPU::SGPR0_SGPR1);
      ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
    Register KernargPtrReg = MRI.createGenericVirtualRegister(
    Register LoadAddr = MRI.createGenericVirtualRegister(
    B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
    Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
    B.buildCopy(SGPR01, Temp);
    B.buildInstr(AMDGPU::S_TRAP)
    MI.eraseFromParent();

  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)
  MI.eraseFromParent();

  if (ST.hasPrivEnabledTrap2NopBug()) {
    ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
    MI.eraseFromParent();
  B.buildInstr(AMDGPU::S_TRAP)
  MI.eraseFromParent();

  if (!ST.isTrapHandlerEnabled() ||
        Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
  B.buildInstr(AMDGPU::S_TRAP)
  MI.eraseFromParent();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  if (!ST.hasGFX10_AEncoding()) {
        Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));

  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
      IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
        IsGFX12Plus ? AMDGPU::MIMGEncGfx12
        : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                    : AMDGPU::MIMGEncGfx10NSA,
        NumVDataDwords, NumVAddrDwords);
        IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                : AMDGPU::MIMGEncGfx10Default,
        NumVDataDwords, NumVAddrDwords);

  if (UseNSA && IsGFX11Plus) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      auto Merged = B.buildMergeLikeInstr(
          V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
      Ops.push_back(Merged.getReg(0));
    Ops.push_back(NodePtr);
    Ops.push_back(RayExtent);
    packLanes(RayOrigin);
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      auto MergedDir = B.buildMergeLikeInstr(
              S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
                                                 UnmergeRayDir.getReg(0)}))
              S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
                                                 UnmergeRayDir.getReg(1)}))
              S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
                                                 UnmergeRayDir.getReg(2)}))
      Ops.push_back(MergedDir.getReg(0));
      packLanes(RayInvDir);
      auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
      Ops.push_back(NodePtr);
    Ops.push_back(RayExtent);
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
      Ops.push_back(Unmerge.getReg(2));
    packLanes(RayOrigin);
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      B.buildMergeLikeInstr(R1,
                            {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
      B.buildMergeLikeInstr(
          R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
      B.buildMergeLikeInstr(
          R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
      packLanes(RayInvDir);

    Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
    Ops.push_back(MergedOps);

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
      .addImm(IsA16 ? 1 : 0)
  MI.eraseFromParent();
  Register DstOrigin = MI.getOperand(1).getReg();
  Register NodePtr = MI.getOperand(4).getReg();
  Register RayExtent = MI.getOperand(5).getReg();
  Register InstanceMask = MI.getOperand(6).getReg();
  Register RayOrigin = MI.getOperand(7).getReg();
  Register Offsets = MI.getOperand(9).getReg();
  Register TDescr = MI.getOperand(10).getReg();

  if (!ST.hasBVHDualAndBVH8Insts()) {
        Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));

      Intrinsic::amdgcn_image_bvh8_intersect_ray;
  const unsigned NumVDataDwords = 10;
  const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
      IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
             : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
      AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);

  auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
      V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});

  B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
                      : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
      .addUse(RayExtentInstanceMaskVec.getReg(0))
  MI.eraseFromParent();
  B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
  MI.eraseFromParent();

  if (!ST.hasArchitectedSGPRs())
    return false;
  auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
  auto LSB = B.buildConstant(S32, 25);
  auto Width = B.buildConstant(S32, 5);
  B.buildUbfx(DstReg, TTMP8, LSB, Width);
  MI.eraseFromParent();

                                           unsigned Width) const {
  if (!MRI.getRegClassOrNull(DstReg))
    MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_GETREG_B32_const)
  MI.eraseFromParent();

  if (MRI.getType(Src) != S64)
    return false;
  B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
  B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
  B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
  MI.eraseFromParent();

  if (MRI.getType(Src) != S64)
    return false;
  auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
      .addReg(Unmerge.getReg(0));
      .addReg(Unmerge.getReg(1));
  MI.eraseFromParent();
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    bool Negated = false;
      std::swap(CondBrTarget, UncondBrTarget);
      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
            .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addMBB(UncondBrTarget);
      }
        B.buildBr(*CondBrTarget);
      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
  case Intrinsic::amdgcn_loop: {
    bool Negated = false;
      std::swap(CondBrTarget, UncondBrTarget);
      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
          .addMBB(UncondBrTarget);
        B.buildBr(*CondBrTarget);
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
  case Intrinsic::amdgcn_addrspacecast_nonnull:
  case Intrinsic::amdgcn_make_buffer_rsrc:
  case Intrinsic::amdgcn_kernarg_segment_ptr:
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
  case Intrinsic::amdgcn_implicitarg_ptr:
  case Intrinsic::amdgcn_workitem_id_x:
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::amdgcn_workgroup_id_x:
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::amdgcn_cluster_id_x:
    return ST.hasClusters() &&
  case Intrinsic::amdgcn_cluster_id_y:
    return ST.hasClusters() &&
  case Intrinsic::amdgcn_cluster_id_z:
    return ST.hasClusters() &&
  case Intrinsic::amdgcn_cluster_workgroup_id_x:
    return ST.hasClusters() &&
  case Intrinsic::amdgcn_cluster_workgroup_id_y:
    return ST.hasClusters() &&
  case Intrinsic::amdgcn_cluster_workgroup_id_z:
    return ST.hasClusters() &&
  case Intrinsic::amdgcn_cluster_workgroup_flat_id:
    return ST.hasClusters() &&
  case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
    return ST.hasClusters() &&
  case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
    return ST.hasClusters() &&
  case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
    return ST.hasClusters() &&
  case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
    return ST.hasClusters() &&
  case Intrinsic::amdgcn_wave_id:
  case Intrinsic::amdgcn_lds_kernel_id:
  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_implicit_buffer_ptr:
  case Intrinsic::amdgcn_dispatch_id:
  case Intrinsic::r600_read_ngroups_x:
  case Intrinsic::r600_read_ngroups_y:
  case Intrinsic::r600_read_ngroups_z:
  case Intrinsic::r600_read_local_size_x:
  case Intrinsic::r600_read_local_size_y:
  case Intrinsic::r600_read_local_size_z:
  case Intrinsic::amdgcn_fdiv_fast:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_rsq_clamp:
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
    if (MRI.getType(Index) != S64)
      MI.getOperand(5).setReg(B.buildAnyExt(S64, Index).getReg(0));
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    if (MRI.getType(Index) != S32)
      MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
  case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
    if (MRI.getType(Index) != IdxTy)
      MI.getOperand(7).setReg(B.buildAnyExt(IdxTy, Index).getReg(0));
  case Intrinsic::amdgcn_fmed3: {
    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_update_dpp:
  case Intrinsic::amdgcn_s_buffer_prefetch_data:
  case Intrinsic::amdgcn_dead: {
    MI.eraseFromParent();
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
    MI.eraseFromParent();
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
    MI.eraseFromParent();
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
constexpr std::initializer_list< LLT > AllVectors
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
static constexpr unsigned FPEnvModeBitField
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx)
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterVectorElementType(LLT EltTy)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllScalarTypes
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllS32Vectors
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty)
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typically a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllS64Vectors
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
static constexpr unsigned FPEnvTrapBitField
static constexpr unsigned MaxRegisterSize
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
static bool isRegisterVectorType(LLT Ty)
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
Interface for Targets to specify which operations they can successfully select and how the others sho...
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
#define FP_DENORM_FLUSH_NONE
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isEntryFunction() const
bool isModuleEntryFunction() const
const std::array< unsigned, 3 > & getDims() const
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
@ ICMP_SLT
signed less than
@ FCMP_OLT
0 1 0 0 True if ordered and less than
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
@ ICMP_UGE
unsigned greater or equal
@ ICMP_SGT
signed greater than
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
@ ICMP_ULT
unsigned less than
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
This is the shared class of boolean and integer constants.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
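A hedged sketch of the usual observer protocol around an in-place mutation; MI, Observer, and NewSrc are assumed to exist in the surrounding legalization code and are not taken from the source:

  Observer.changingInstr(MI);        // announce the instruction is about to change
  MI.getOperand(1).setReg(NewSrc);   // perform the in-place rewrite
  Observer.changedInstr(MI);         // let listeners re-process the mutated MI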
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
static constexpr LLT float64()
Get a 64-bit IEEE double value.
constexpr unsigned getScalarSizeInBits() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
static constexpr LLT float16()
Get a 16-bit IEEE half value.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr LLT getScalarType() const
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
static constexpr LLT float32()
Get a 32-bit IEEE float value.
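A small illustrative sketch, not taken from the source, of how these LLT helpers compose when describing register types:

  static void lltExamples() {
    const LLT S16   = LLT::scalar(16);
    const LLT V2S16 = LLT::fixed_vector(2, 16); // 2 x s16, 32 bits total
    const LLT P1    = LLT::pointer(1, 64);      // 64-bit pointer in address space 1
    assert(V2S16.isVector() && V2S16.getNumElements() == 2);
    assert(V2S16.getScalarType() == S16 && P1.isPointer());
    // Same element count, wider elements: <2 x s32>.
    assert(V2S16.changeElementSize(32) == LLT::fixed_vector(2, 32));
  }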
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI void computeTables()
Compute any ancillary tables needed to quickly decide how an operation should be handled.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & bitcastIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
The specified type index is coerced if predicate is true.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & lowerIf(LegalityPredicate Predicate)
The instruction is lowered if predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & unsupportedIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Widen the scalar to the one selected by the mutation if the predicate is true.
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & clampNumElements(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the number of elements for the given vectors to at least MinTy's number of elements and at most...
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
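A hypothetical rule set, not the actual AMDGPU rules, showing how these builder calls chain inside a LegalizerInfo-derived constructor:

  // Sketch only: make s32 and v2s16 adds legal, clamp other scalars into
  // [s32, s64], round odd widths up to a power of two, and scalarize the rest.
  getActionDefinitionsBuilder(TargetOpcode::G_ADD)
      .legalFor({LLT::scalar(32), LLT::fixed_vector(2, 16)})
      .clampScalar(0, LLT::scalar(32), LLT::scalar(64))
      .widenScalarToNextPow2(0)
      .scalarize(0);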
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
const LegacyLegalizerInfo & getLegacyLegalizerInfo() const
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
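A hedged snippet of typical MachineIRBuilder usage; B, Src, Ptr, and MMO are assumed to already exist, and the comments show roughly which generic opcodes are emitted:

  const LLT S32 = LLT::scalar(32);
  auto K   = B.buildConstant(S32, 42);  // G_CONSTANT i32 42
  auto Sum = B.buildAdd(S32, Src, K);   // G_ADD %src, %k
  B.buildStore(Sum, Ptr, MMO);          // G_STORE %sum, %ptr, reusing MMO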
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Wrapper class representing virtual and physical registers.
constexpr bool isValid() const
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
The instances of the Type class are immutable: once they are created, they are never changed.
A Use represents the edge between a Value definition and its users.
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX1250(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LLVM_ABI LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
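A hypothetical rule built from the combinators above, with the predicate and mutation qualified explicitly for clarity: widen any scalar narrower than 32 bits up to s32.

  // Sketch inside a LegalizerInfo-derived constructor; not the actual rules.
  getActionDefinitionsBuilder(TargetOpcode::G_CTPOP)
      .widenScalarIf(LegalityPredicates::scalarNarrowerThan(0, 32),
                     LegalizeMutations::changeTo(0, LLT::scalar(32)));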
@ Implicit
Not emitted register (e.g. carry, or temporary result).
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
int popcount(T Value) noexcept
Count the number of set bits in a value.
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
constexpr bool has_single_bit(T Value) noexcept
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
To bit_cast(const From &from) noexcept
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
unsigned Log2(Align A)
Returns the log2 of the alignment.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
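A quick numeric sanity check, with illustrative values only, for the bit-manipulation helpers listed above:

  static void mathHelperExamples() {
    assert(PowerOf2Ceil(96) == 128 && NextPowerOf2(96) == 128);
    assert(!isPowerOf2_32(96) && Log2_32_Ceil(96) == 7);
    assert(divideCeil(96u, 64u) == 2 && alignTo(96, Align(32)) == 96);
  }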
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
static constexpr uint64_t encode(Fields... Values)
MIMGBaseOpcode BaseOpcode
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
static LLVM_ABI const fltSemantics & IEEEdouble() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
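A sketch of a hand-rolled predicate over a LegalityQuery; the name and behavior here are purely illustrative rather than from the source. It returns true for vector types whose in-memory type is narrower than the register type.

  static LegalityPredicate isNarrowingVectorAccessExample(unsigned TypeIdx) {
    return [=](const LegalityQuery &Query) {
      const LLT Ty = Query.Types[TypeIdx];
      return Ty.isVector() && !Query.MMODescrs.empty() &&
             Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
    };
  }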
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.