27#include "llvm/IR/IntrinsicsAMDGPU.h"
34#define DEBUG_TYPE "AMDGPUtti"
37 "amdgpu-unroll-threshold-private",
38 cl::desc(
"Unroll threshold for AMDGPU if private memory used in a loop"),
42 "amdgpu-unroll-threshold-local",
43 cl::desc(
"Unroll threshold for AMDGPU if local memory used in a loop"),
47 "amdgpu-unroll-threshold-if",
48 cl::desc(
"Unroll threshold increment for AMDGPU for each if statement inside loop"),
52 "amdgpu-unroll-runtime-local",
53 cl::desc(
"Allow runtime unroll for AMDGPU if local memory used in a loop"),
57 "amdgpu-unroll-max-block-to-analyze",
58 cl::desc(
"Inner loop block size threshold to analyze in unroll for AMDGPU"),
63 cl::desc(
"Cost of alloca argument"));
71 cl::desc(
"Maximum alloca size to use for inline cost"));
76 cl::desc(
"Maximum number of BBs allowed in a function after inlining"
77 " (compile time constraint)"));
81 "amdgpu-memcpy-loop-unroll",
82 cl::desc(
"Unroll factor (affecting 4x32-bit operations) to use for memory "
83 "operations when lowering memcpy as a loop"),
92 for (
const Value *V :
I->operand_values()) {
95 if (
const PHINode *
PHI = dyn_cast<PHINode>(V)) {
97 return SubLoop->contains(PHI); }))
106 :
BaseT(TM,
F.getDataLayout()),
107 TargetTriple(TM->getTargetTriple()),
109 TLI(ST->getTargetLowering()) {}
114 const Function &
F = *L->getHeader()->getParent();
116 F.getFnAttributeAsParsedInteger(
"amdgpu-unroll-threshold", 300);
117 UP.
MaxCount = std::numeric_limits<unsigned>::max();
130 const unsigned MaxAlloca = (256 - 16) * 4;
136 if (
MDNode *LoopUnrollThreshold =
138 if (LoopUnrollThreshold->getNumOperands() == 2) {
139 ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
140 LoopUnrollThreshold->getOperand(1));
141 if (MetaThresholdValue) {
147 ThresholdPrivate = std::min(ThresholdPrivate, UP.
Threshold);
148 ThresholdLocal = std::min(ThresholdLocal, UP.
Threshold);
153 unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
156 unsigned LocalGEPsSeen = 0;
159 return SubLoop->contains(BB); }))
168 if (
const BranchInst *Br = dyn_cast<BranchInst>(&
I)) {
169 if (UP.
Threshold < MaxBoost && Br->isConditional()) {
172 if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
173 (L->contains(Succ1) && L->isLoopExiting(Succ1)))
179 << *L <<
" due to " << *Br <<
'\n');
191 unsigned AS =
GEP->getAddressSpace();
192 unsigned Threshold = 0;
194 Threshold = ThresholdPrivate;
196 Threshold = ThresholdLocal;
211 if (AllocaSize > MaxAlloca)
220 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2)
224 if (!isa<GlobalVariable>(V) && !isa<Argument>(V))
228 << *L <<
" due to LDS use.\n");
233 bool HasLoopDef =
false;
236 if (!Inst || L->isLoopInvariant(
Op))
240 return SubLoop->contains(Inst); }))
264 << *L <<
" due to " << *
GEP <<
'\n');
287 AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
288 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
289 AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
290 AMDGPU::FeatureUnalignedAccessMode,
292 AMDGPU::FeatureAutoWaitcntBeforeBarrier,
295 AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
296 AMDGPU::FeatureTrapHandler,
300 AMDGPU::FeatureSRAMECC,
303 AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
306 :
BaseT(TM,
F.getDataLayout()),
308 TLI(ST->getTargetLowering()), CommonTTI(TM,
F),
309 IsGraphics(AMDGPU::isGraphics(
F.getCallingConv())) {
312 HasFP64FP16Denormals =
349 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
350 return 32 * 4 / ElemWidth;
360 unsigned ChainSizeInBytes,
362 unsigned VecRegBitWidth = VF * LoadSize;
365 return 128 / LoadSize;
371 unsigned ChainSizeInBytes,
373 unsigned VecRegBitWidth = VF * StoreSize;
374 if (VecRegBitWidth > 128)
375 return 128 / StoreSize;
399 unsigned AddrSpace)
const {
412 unsigned AddrSpace)
const {
418 unsigned AddrSpace)
const {
428 unsigned DestAddrSpace,
Align SrcAlign,
Align DestAlign,
429 std::optional<uint32_t> AtomicElementSize)
const {
431 if (AtomicElementSize)
445 unsigned I32EltsInVector = 4;
455 unsigned RemainingBytes,
unsigned SrcAddrSpace,
unsigned DestAddrSpace,
457 std::optional<uint32_t> AtomicCpySize)
const {
461 OpsOut,
Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
462 DestAlign, AtomicCpySize);
465 while (RemainingBytes >= 16) {
467 RemainingBytes -= 16;
471 while (RemainingBytes >= 8) {
477 while (RemainingBytes >= 4) {
483 while (RemainingBytes >= 2) {
489 while (RemainingBytes) {
507 case Intrinsic::amdgcn_ds_ordered_add:
508 case Intrinsic::amdgcn_ds_ordered_swap: {
509 auto *Ordering = dyn_cast<ConstantInt>(Inst->
getArgOperand(2));
510 auto *Volatile = dyn_cast<ConstantInt>(Inst->
getArgOperand(4));
511 if (!Ordering || !Volatile)
514 unsigned OrderingVal = Ordering->getZExtValue();
521 Info.WriteMem =
true;
522 Info.IsVolatile = !Volatile->isZero();
536 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
541 unsigned NElts = LT.second.isVector() ?
542 LT.second.getVectorNumElements() : 1;
551 return get64BitInstrCost(
CostKind) * LT.first * NElts;
554 NElts = (NElts + 1) / 2;
557 return getFullRateInstrCost() * LT.first * NElts;
563 if (SLT == MVT::i64) {
565 return 2 * getFullRateInstrCost() * LT.first * NElts;
569 NElts = (NElts + 1) / 2;
571 return LT.first * NElts * getFullRateInstrCost();
573 const int QuarterRateCost = getQuarterRateInstrCost(
CostKind);
574 if (SLT == MVT::i64) {
575 const int FullRateCost = getFullRateInstrCost();
576 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
580 NElts = (NElts + 1) / 2;
583 return QuarterRateCost * NElts * LT.first;
590 if (
const auto *
FAdd = dyn_cast<BinaryOperator>(*CxtI->
user_begin())) {
595 if (ST->
has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
609 NElts = (NElts + 1) / 2;
611 return LT.first * NElts * get64BitInstrCost(
CostKind);
614 NElts = (NElts + 1) / 2;
616 if (SLT == MVT::f32 || SLT == MVT::f16)
617 return LT.first * NElts * getFullRateInstrCost();
623 if (SLT == MVT::f64) {
629 Cost += 3 * getFullRateInstrCost();
631 return LT.first *
Cost * NElts;
636 if ((SLT == MVT::f32 && !HasFP32Denormals) ||
638 return LT.first * getQuarterRateInstrCost(
CostKind) * NElts;
649 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(
CostKind);
650 return LT.first *
Cost * NElts;
657 int Cost = getQuarterRateInstrCost(
CostKind) + getFullRateInstrCost();
658 return LT.first *
Cost * NElts;
661 if (SLT == MVT::f32 || SLT == MVT::f16) {
663 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
664 1 * getQuarterRateInstrCost(
CostKind);
666 if (!HasFP32Denormals) {
668 Cost += 2 * getFullRateInstrCost();
671 return LT.first * NElts *
Cost;
691 case Intrinsic::fmuladd:
692 case Intrinsic::copysign:
693 case Intrinsic::minimumnum:
694 case Intrinsic::maximumnum:
695 case Intrinsic::canonicalize:
697 case Intrinsic::round:
698 case Intrinsic::uadd_sat:
699 case Intrinsic::usub_sat:
700 case Intrinsic::sadd_sat:
701 case Intrinsic::ssub_sat:
712 switch (ICA.
getID()) {
713 case Intrinsic::fabs:
716 case Intrinsic::amdgcn_workitem_id_x:
717 case Intrinsic::amdgcn_workitem_id_y:
718 case Intrinsic::amdgcn_workitem_id_z:
722 case Intrinsic::amdgcn_workgroup_id_x:
723 case Intrinsic::amdgcn_workgroup_id_y:
724 case Intrinsic::amdgcn_workgroup_id_z:
725 case Intrinsic::amdgcn_lds_kernel_id:
726 case Intrinsic::amdgcn_dispatch_ptr:
727 case Intrinsic::amdgcn_dispatch_id:
728 case Intrinsic::amdgcn_implicitarg_ptr:
729 case Intrinsic::amdgcn_queue_ptr:
742 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(
RetTy);
744 unsigned NElts = LT.second.isVector() ?
745 LT.second.getVectorNumElements() : 1;
749 if ((ST->
hasVOP3PInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) ||
751 NElts = (NElts + 1) / 2;
754 unsigned InstRate = getQuarterRateInstrCost(
CostKind);
756 switch (ICA.
getID()) {
758 case Intrinsic::fmuladd:
759 if (SLT == MVT::f64) {
760 InstRate = get64BitInstrCost(
CostKind);
764 if ((SLT == MVT::f32 && ST->
hasFastFMAF32()) || SLT == MVT::f16)
765 InstRate = getFullRateInstrCost();
768 : getQuarterRateInstrCost(
CostKind);
771 case Intrinsic::copysign:
772 return NElts * getFullRateInstrCost();
773 case Intrinsic::minimumnum:
774 case Intrinsic::maximumnum: {
786 SLT == MVT::f64 ? get64BitInstrCost(
CostKind) : getFullRateInstrCost();
787 InstRate = BaseRate * NumOps;
790 case Intrinsic::canonicalize: {
792 SLT == MVT::f64 ? get64BitInstrCost(
CostKind) : getFullRateInstrCost();
795 case Intrinsic::uadd_sat:
796 case Intrinsic::usub_sat:
797 case Intrinsic::sadd_sat:
798 case Intrinsic::ssub_sat: {
799 if (SLT == MVT::i16 || SLT == MVT::i32)
800 InstRate = getFullRateInstrCost();
802 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
803 if (
any_of(ValidSatTys, [<](
MVT M) {
return M == LT.second; }))
809 if (SLT == MVT::i16 || SLT == MVT::i32)
810 InstRate = 2 * getFullRateInstrCost();
816 return LT.first * NElts * InstRate;
822 assert((
I ==
nullptr ||
I->getOpcode() == Opcode) &&
823 "Opcode should reflect passed instruction.");
826 const int CBrCost = SCost ? 5 : 7;
828 case Instruction::Br: {
830 const auto *BI = dyn_cast_or_null<BranchInst>(
I);
831 if (BI && BI->isUnconditional())
832 return SCost ? 1 : 4;
837 case Instruction::Switch: {
838 const auto *SI = dyn_cast_or_null<SwitchInst>(
I);
841 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
843 case Instruction::Ret:
844 return SCost ? 1 : 10;
851 std::optional<FastMathFlags> FMF,
863 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
864 return LT.first * getFullRateInstrCost();
878 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
879 return LT.first * getHalfRateInstrCost(
CostKind);
884 unsigned Index,
const Value *Op0,
885 const Value *Op1)
const {
887 case Instruction::ExtractElement:
888 case Instruction::InsertElement: {
903 return Index == ~0u ? 2 : 0;
917 if (Indices.
size() > 1)
925 const int TargetOutputIdx = Indices.
empty() ? -1 : Indices[0];
928 for (
auto &TC : TargetConstraints) {
933 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
939 TRI, TC.ConstraintCode, TC.ConstraintVT).second;
943 if (!RC || !
TRI->isSGPRClass(RC))
953 cast<MetadataAsValue>(ReadReg->
getArgOperand(0))->getMetadata();
955 cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
974 if (
const Argument *
A = dyn_cast<Argument>(V))
983 if (
const LoadInst *Load = dyn_cast<LoadInst>(V))
991 if (isa<AtomicRMWInst, AtomicCmpXchgInst>(V))
994 if (
const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
997 case Intrinsic::read_register:
999 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1001 Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
1002 unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
1007 case Intrinsic::amdgcn_workitem_id_y:
1008 case Intrinsic::amdgcn_workitem_id_z: {
1009 const Function *
F = Intrinsic->getFunction();
1013 *
F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
1014 return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
1022 if (
const CallInst *CI = dyn_cast<CallInst>(V)) {
1023 if (CI->isInlineAsm())
1029 if (isa<InvokeInst>(V))
1035 if (
auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) {
1045 if (
const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
1048 if (
const CallInst *CI = dyn_cast<CallInst>(V)) {
1049 if (CI->isInlineAsm())
1067 bool XDimDoesntResetWithinWaves =
false;
1068 if (
auto *
I = dyn_cast<Instruction>(V)) {
1074 if (
match(V,
m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1076 match(V,
m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1082 if (
match(V,
m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1086 XDimDoesntResetWithinWaves;
1097 if (
const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
1098 switch (Intrinsic->getIntrinsicID()) {
1101 case Intrinsic::amdgcn_if:
1102 case Intrinsic::amdgcn_else: {
1104 return Indices.
size() == 1 && Indices[0] == 1;
1121 case Intrinsic::amdgcn_is_shared:
1122 case Intrinsic::amdgcn_is_private:
1123 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1124 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1125 case Intrinsic::amdgcn_load_to_lds:
1126 case Intrinsic::amdgcn_make_buffer_rsrc:
1136 Value *NewV)
const {
1137 auto IntrID =
II->getIntrinsicID();
1139 case Intrinsic::amdgcn_is_shared:
1140 case Intrinsic::amdgcn_is_private: {
1141 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1149 case Intrinsic::ptrmask: {
1152 Value *MaskOp =
II->getArgOperand(1);
1155 bool DoTruncate =
false;
1159 if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
1177 MaskTy =
B.getInt32Ty();
1178 MaskOp =
B.CreateTrunc(MaskOp, MaskTy);
1181 return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->
getType(), MaskTy},
1184 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1185 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1186 Type *DestTy =
II->getType();
1193 M,
II->getIntrinsicID(), {DestTy, SrcTy, DestTy});
1194 II->setArgOperand(0, NewV);
1195 II->setCalledFunction(NewDecl);
1198 case Intrinsic::amdgcn_load_to_lds: {
1203 II->setArgOperand(0, NewV);
1204 II->setCalledFunction(NewDecl);
1207 case Intrinsic::amdgcn_make_buffer_rsrc: {
1209 Type *DstTy =
II->getType();
1212 M,
II->getIntrinsicID(), {DstTy, SrcTy});
1213 II->setArgOperand(0, NewV);
1214 II->setCalledFunction(NewDecl);
1229 if (!isa<FixedVectorType>(SrcTy))
1237 (ScalarSize == 16 || ScalarSize == 8)) {
1240 unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();
1241 unsigned RequestedElts =
1242 count_if(Mask, [](
int MaskElt) {
return MaskElt != -1; });
1243 unsigned EltsPerReg = 32 / ScalarSize;
1244 if (RequestedElts == 0)
1252 if (ST->
hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
1254 unsigned NumPerms =
alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
1257 return NumPerms + NumPermMasks;
1266 return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
1271 unsigned NumPerms =
alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
1274 return NumPerms + NumPermMasks;
1291 using namespace PatternMatch;
1293 for (
auto &
Op :
I->operands()) {
1295 if (
any_of(Ops, [&](
Use *U) {
return U->get() ==
Op.get(); }))
1302 return !Ops.
empty();
1309 =
static_cast<const GCNSubtarget *
>(TM.getSubtargetImpl(*Caller));
1311 =
static_cast<const GCNSubtarget *
>(TM.getSubtargetImpl(*Callee));
1313 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1314 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1316 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1317 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1318 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1328 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1329 Callee->hasFnAttribute(Attribute::InlineHint))
1335 if (Callee->size() == 1)
1337 size_t BBSize = Caller->size() + Callee->size() - 1;
1347 const int NrOfSGPRUntilSpill = 26;
1348 const int NrOfVGPRUntilSpill = 32;
1352 unsigned adjustThreshold = 0;
1358 for (
auto ArgVT : ValueVTs) {
1362 SGPRsInUse += CCRegNum;
1364 VGPRsInUse += CCRegNum;
1374 ArgStackCost +=
const_cast<GCNTTIImpl *
>(TTIImpl)->getMemoryOpCost(
1377 ArgStackCost +=
const_cast<GCNTTIImpl *
>(TTIImpl)->getMemoryOpCost(
1383 adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
1385 adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
1387 return adjustThreshold;
1396 unsigned AllocaSize = 0;
1399 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1403 unsigned AddrSpace = Ty->getAddressSpace();
1456 static_assert(InlinerVectorBonusPercent == 0,
"vector bonus assumed to be 0");
1460 return BB.getTerminator()->getNumSuccessors() > 1;
1463 Threshold += Threshold / 2;
1469 unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;
1471 return AllocaThresholdBonus;
1487 ? getFullRateInstrCost()
1488 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(
CostKind)
1489 : getQuarterRateInstrCost(
CostKind);
1492std::pair<InstructionCost, MVT>
1493GCNTTIImpl::getTypeLegalizationCost(
Type *Ty)
const {
1518 LB.
push_back({
"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1519 LB.push_back({
"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1520 LB.push_back({
"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1521 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1523 LB.push_back({
"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1524 LB.push_back({
"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1526 LB.push_back({
"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1527 LB.push_back({
"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1539 Attribute IEEEAttr =
F->getFnAttribute(
"amdgpu-ieee");
1553 if (
VectorType *VecTy = dyn_cast<VectorType>(Src)) {
1554 if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1555 VecTy->getElementType()->isIntegerTy(8)) {
1565 if (
VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
1566 if (VecTy->getElementType()->isIntegerTy(8)) {
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Register const TargetRegisterInfo * TRI
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasMadMacF32Insts() const
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
SmallVector< unsigned > getMaxNumWorkGroups(const Function &F) const
Return the number of work groups for the function.
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool hasFastFMAF32() const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
bool hasVOP3PInsts() const
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
an instruction to allocate memory on the stack
LLVM_ABI bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
This class represents an incoming formal argument to a Function.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
bool isValid() const
Return true if the attribute is any kind of attribute.
LLVM Basic Block Representation.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
unsigned getNumberOfParts(Type *Tp) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Conditional or Unconditional Branch instruction.
Base class for all callable instructions (InvokeInst and CallInst). Holds everything related to calling a function.
bool isInlineAsm() const
Check if this call is an inline asm statement.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
CallingConv::ID getCallingConv() const
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for a arg operand, get the arg operand number that corresponds to it.
This class represents a function call, abstracting a target machine's calling convention.
This is the shared class of boolean and integer constants.
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
unsigned getPointerSizeInBits(unsigned AS=0) const
The size in bits of the pointer representation in a given address space.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
constexpr bool isScalar() const
Exactly one element.
Convenience struct for specifying and reasoning about fast-math flags.
Container class for subtarget features.
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
const SIRegisterInfo * getRegisterInfo() const override
bool hasGloballyAddressableScratch() const
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
bool hasUnalignedScratchAccessEnabled() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Generation getGeneration() const
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Account for loads of i8 vector types to have reduced cost.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
void collectKernelLaunchBounds(const Function &F, SmallVectorImpl< std::pair< StringRef, int64_t > > &LB) const override
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
unsigned getNumberOfRegisters(unsigned RCID) const override
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
bool shouldPrefetchAddressSpace(unsigned AS) const override
bool hasBranchDivergence(const Function *F=nullptr) const override
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
unsigned getInliningThresholdMultiplier() const override
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
unsigned getPrefetchDistance() const override
How much before a load we should place the prefetch instruction.
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know the use context can assume "amdgpu-ieee"="true", and KnownIEEEMode::Off if it can assume "amdgpu-ieee"="false".
unsigned adjustInliningThreshold(const CallBase *CB) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
bool isAlwaysUniform(const Value *V) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
bool isSourceOfDivergence(const Value *V) const override
int getInliningLastCallToStaticBonus() const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const override
unsigned getNumberOfParts(Type *Tp) const override
When counting parts on AMD GPUs, account for i8s being grouped together under a single i32 value.
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
unsigned getMinVectorRegisterBitWidth() const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicElementSize) const override
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool hasApproxFunc() const LLVM_READONLY
Determine whether the approximate-math-functions flag is set.
LLVM_ABI bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Type * getReturnType() const
const IntrinsicInst * getInst() const
Intrinsic::ID getID() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
Represents a single loop in the control flow graph.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
A Module instance is used to store all the information related to an LLVM module.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
The main scalar evolution driver.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
std::vector< AsmOperandInfo > AsmOperandInfoVector
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their prefixes, and also tie in the associated operand values.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo, setting OpInfo.ConstraintCode and OpInfo.ConstraintType.
Primary interface to the complete machine description for the target machine.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
A Use represents the edge between a Value definition and its users.
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Base class of all SIMD vector types.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isExtendedGlobalAddrSpace(unsigned AS)
@ C
The default llvm calling convention, compatible with C.
@ ADD
Simple integer binary arithmetic operators.
@ FADD
Simple binary floating point operators.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
@ AND
Bitwise operators - logical and, logical or, logical xor.
LLVM_ABI int getInstrCost()
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets of the returned KnownBits.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
This struct is a compact representation of a valid (non-zero power of two) alignment.
static constexpr DenormalMode getPreserveSign()
uint64_t getScalarSizeInBits() const
unsigned countMinLeadingOnes() const
Returns the minimum number of leading one bits.
Information about a load/store intrinsic defined by the target.
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const