26#include "llvm/IR/IntrinsicsAMDGPU.h"
33#define DEBUG_TYPE "AMDGPUtti"
36 "amdgpu-unroll-threshold-private",
37 cl::desc(
"Unroll threshold for AMDGPU if private memory used in a loop"),
41 "amdgpu-unroll-threshold-local",
42 cl::desc(
"Unroll threshold for AMDGPU if local memory used in a loop"),
46 "amdgpu-unroll-threshold-if",
47 cl::desc(
"Unroll threshold increment for AMDGPU for each if statement inside loop"),
51 "amdgpu-unroll-runtime-local",
52 cl::desc(
"Allow runtime unroll for AMDGPU if local memory used in a loop"),
56 "amdgpu-unroll-max-block-to-analyze",
57 cl::desc(
"Inner loop block size threshold to analyze in unroll for AMDGPU"),
62 cl::desc(
"Cost of alloca argument"));
70 cl::desc(
"Maximum alloca size to use for inline cost"));
75 cl::desc(
"Maximum number of BBs allowed in a function after inlining"
76 " (compile time constraint)"));
80 "amdgpu-memcpy-loop-unroll",
81 cl::desc(
"Unroll factor (affecting 4x32-bit operations) to use for memory "
82 "operations when lowering memcpy as a loop"),
91 for (
const Value *V :
I->operand_values()) {
94 if (
const PHINode *
PHI = dyn_cast<PHINode>(V)) {
96 return SubLoop->contains(PHI); }))
105 :
BaseT(TM,
F.getDataLayout()),
106 TargetTriple(TM->getTargetTriple()),
108 TLI(ST->getTargetLowering()) {}
113 const Function &
F = *L->getHeader()->getParent();
115 F.getFnAttributeAsParsedInteger(
"amdgpu-unroll-threshold", 300);
116 UP.
MaxCount = std::numeric_limits<unsigned>::max();
129 const unsigned MaxAlloca = (256 - 16) * 4;
135 if (
MDNode *LoopUnrollThreshold =
137 if (LoopUnrollThreshold->getNumOperands() == 2) {
138 ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
139 LoopUnrollThreshold->getOperand(1));
140 if (MetaThresholdValue) {
146 ThresholdPrivate = std::min(ThresholdPrivate, UP.
Threshold);
147 ThresholdLocal = std::min(ThresholdLocal, UP.
Threshold);
152 unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
155 unsigned LocalGEPsSeen = 0;
158 return SubLoop->contains(BB); }))
167 if (
const BranchInst *Br = dyn_cast<BranchInst>(&
I)) {
168 if (UP.
Threshold < MaxBoost && Br->isConditional()) {
171 if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
172 (L->contains(Succ1) && L->isLoopExiting(Succ1)))
178 << *L <<
" due to " << *Br <<
'\n');
190 unsigned AS =
GEP->getAddressSpace();
191 unsigned Threshold = 0;
193 Threshold = ThresholdPrivate;
195 Threshold = ThresholdLocal;
210 if (AllocaSize > MaxAlloca)
219 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
220 (!isa<GlobalVariable>(
GEP->getPointerOperand()) &&
221 !isa<Argument>(
GEP->getPointerOperand())))
224 << *L <<
" due to LDS use.\n");
229 bool HasLoopDef =
false;
232 if (!Inst || L->isLoopInvariant(
Op))
236 return SubLoop->contains(Inst); }))
260 << *L <<
" due to " << *
GEP <<
'\n');
283 AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
284 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
285 AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
286 AMDGPU::FeatureUnalignedAccessMode,
288 AMDGPU::FeatureAutoWaitcntBeforeBarrier,
291 AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
292 AMDGPU::FeatureTrapHandler,
296 AMDGPU::FeatureSRAMECC,
299 AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
302 :
BaseT(TM,
F.getDataLayout()),
304 TLI(ST->getTargetLowering()), CommonTTI(TM,
F),
305 IsGraphics(AMDGPU::isGraphics(
F.getCallingConv())) {
308 HasFP64FP16Denormals =
345 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346 return 32 * 4 / ElemWidth;
353 unsigned ChainSizeInBytes,
355 unsigned VecRegBitWidth = VF * LoadSize;
358 return 128 / LoadSize;
364 unsigned ChainSizeInBytes,
366 unsigned VecRegBitWidth = VF * StoreSize;
367 if (VecRegBitWidth > 128)
368 return 128 / StoreSize;
392 unsigned AddrSpace)
const {
405 unsigned AddrSpace)
const {
411 unsigned AddrSpace)
const {
421 unsigned DestAddrSpace,
Align SrcAlign,
Align DestAlign,
422 std::optional<uint32_t> AtomicElementSize)
const {
424 if (AtomicElementSize)
438 unsigned I32EltsInVector = 4;
448 unsigned RemainingBytes,
unsigned SrcAddrSpace,
unsigned DestAddrSpace,
450 std::optional<uint32_t> AtomicCpySize)
const {
454 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
455 DestAlign, AtomicCpySize);
458 while (RemainingBytes >= 16) {
460 RemainingBytes -= 16;
464 while (RemainingBytes >= 8) {
470 while (RemainingBytes >= 4) {
476 while (RemainingBytes >= 2) {
482 while (RemainingBytes) {
500 case Intrinsic::amdgcn_ds_ordered_add:
501 case Intrinsic::amdgcn_ds_ordered_swap: {
502 auto *Ordering = dyn_cast<ConstantInt>(Inst->
getArgOperand(2));
503 auto *Volatile = dyn_cast<ConstantInt>(Inst->
getArgOperand(4));
504 if (!Ordering || !Volatile)
507 unsigned OrderingVal = Ordering->getZExtValue();
514 Info.WriteMem =
true;
515 Info.IsVolatile = !Volatile->isZero();
530 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
535 unsigned NElts = LT.second.isVector() ?
536 LT.second.getVectorNumElements() : 1;
545 return get64BitInstrCost(
CostKind) * LT.first * NElts;
548 NElts = (NElts + 1) / 2;
551 return getFullRateInstrCost() * LT.first * NElts;
557 if (SLT == MVT::i64) {
559 return 2 * getFullRateInstrCost() * LT.first * NElts;
563 NElts = (NElts + 1) / 2;
565 return LT.first * NElts * getFullRateInstrCost();
567 const int QuarterRateCost = getQuarterRateInstrCost(
CostKind);
568 if (SLT == MVT::i64) {
569 const int FullRateCost = getFullRateInstrCost();
570 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
574 NElts = (NElts + 1) / 2;
577 return QuarterRateCost * NElts * LT.first;
584 if (
const auto *
FAdd = dyn_cast<BinaryOperator>(*CxtI->
user_begin())) {
589 if (ST->
has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
604 NElts = (NElts + 1) / 2;
606 return LT.first * NElts * get64BitInstrCost(
CostKind);
609 NElts = (NElts + 1) / 2;
611 if (SLT == MVT::f32 || SLT == MVT::f16)
612 return LT.first * NElts * getFullRateInstrCost();
618 if (SLT == MVT::f64) {
624 Cost += 3 * getFullRateInstrCost();
626 return LT.first *
Cost * NElts;
631 if ((SLT == MVT::f32 && !HasFP32Denormals) ||
633 return LT.first * getQuarterRateInstrCost(
CostKind) * NElts;
644 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(
CostKind);
645 return LT.first *
Cost * NElts;
653 int Cost = getQuarterRateInstrCost(
CostKind) + getFullRateInstrCost();
654 return LT.first *
Cost * NElts;
657 if (SLT == MVT::f32 || SLT == MVT::f16) {
659 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
660 1 * getQuarterRateInstrCost(
CostKind);
662 if (!HasFP32Denormals) {
664 Cost += 2 * getFullRateInstrCost();
667 return LT.first * NElts *
Cost;
687 case Intrinsic::fmuladd:
688 case Intrinsic::copysign:
689 case Intrinsic::canonicalize:
691 case Intrinsic::round:
692 case Intrinsic::uadd_sat:
693 case Intrinsic::usub_sat:
694 case Intrinsic::sadd_sat:
695 case Intrinsic::ssub_sat:
706 if (ICA.
getID() == Intrinsic::fabs)
715 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(
RetTy);
717 unsigned NElts = LT.second.isVector() ?
718 LT.second.getVectorNumElements() : 1;
723 return LT.first * NElts * get64BitInstrCost(
CostKind);
725 if ((ST->
has16BitInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) ||
727 NElts = (NElts + 1) / 2;
730 unsigned InstRate = getQuarterRateInstrCost(
CostKind);
732 switch (ICA.
getID()) {
734 case Intrinsic::fmuladd:
735 if ((SLT == MVT::f32 && ST->
hasFastFMAF32()) || SLT == MVT::f16)
736 InstRate = getFullRateInstrCost();
739 : getQuarterRateInstrCost(
CostKind);
742 case Intrinsic::copysign:
743 return NElts * getFullRateInstrCost();
744 case Intrinsic::canonicalize: {
746 InstRate = getFullRateInstrCost();
749 case Intrinsic::uadd_sat:
750 case Intrinsic::usub_sat:
751 case Intrinsic::sadd_sat:
752 case Intrinsic::ssub_sat: {
753 if (SLT == MVT::i16 || SLT == MVT::i32)
754 InstRate = getFullRateInstrCost();
756 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
757 if (
any_of(ValidSatTys, [<](
MVT M) {
return M == LT.second; }))
763 if (SLT == MVT::i16 || SLT == MVT::i32)
764 InstRate = 2 * getFullRateInstrCost();
770 return LT.first * NElts * InstRate;
776 assert((
I ==
nullptr ||
I->getOpcode() == Opcode) &&
777 "Opcode should reflect passed instruction.");
780 const int CBrCost = SCost ? 5 : 7;
782 case Instruction::Br: {
784 const auto *BI = dyn_cast_or_null<BranchInst>(
I);
785 if (BI && BI->isUnconditional())
786 return SCost ? 1 : 4;
791 case Instruction::Switch: {
792 const auto *SI = dyn_cast_or_null<SwitchInst>(
I);
795 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
797 case Instruction::Ret:
798 return SCost ? 1 : 10;
805 std::optional<FastMathFlags> FMF,
817 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
818 return LT.first * getFullRateInstrCost();
832 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
833 return LT.first * getHalfRateInstrCost(
CostKind);
838 unsigned Index,
Value *Op0,
841 case Instruction::ExtractElement:
842 case Instruction::InsertElement: {
857 return Index == ~0u ? 2 : 0;
871 if (Indices.
size() > 1)
879 const int TargetOutputIdx = Indices.
empty() ? -1 : Indices[0];
882 for (
auto &TC : TargetConstraints) {
887 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
893 TRI, TC.ConstraintCode, TC.ConstraintVT).second;
897 if (!RC || !
TRI->isSGPRClass(RC))
907 cast<MetadataAsValue>(ReadReg->
getArgOperand(0))->getMetadata();
909 cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
928 if (
const Argument *
A = dyn_cast<Argument>(V))
937 if (
const LoadInst *Load = dyn_cast<LoadInst>(V))
945 if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
948 if (
const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
949 if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
956 if (
const CallInst *CI = dyn_cast<CallInst>(V)) {
957 if (CI->isInlineAsm())
963 if (isa<InvokeInst>(V))
970 if (
const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
973 if (
const CallInst *CI = dyn_cast<CallInst>(V)) {
974 if (CI->isInlineAsm())
992 if (
match(V,
m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
994 match(V,
m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
996 const Function *
F = cast<Instruction>(V)->getFunction();
1002 if (
match(V,
m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1004 const Function *
F = cast<Instruction>(V)->getFunction();
1019 if (
const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
1020 switch (Intrinsic->getIntrinsicID()) {
1023 case Intrinsic::amdgcn_if:
1024 case Intrinsic::amdgcn_else: {
1026 return Indices.
size() == 1 && Indices[0] == 1;
1043 case Intrinsic::amdgcn_is_shared:
1044 case Intrinsic::amdgcn_is_private:
1045 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1046 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1056 Value *NewV)
const {
1057 auto IntrID =
II->getIntrinsicID();
1059 case Intrinsic::amdgcn_is_shared:
1060 case Intrinsic::amdgcn_is_private: {
1061 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1069 case Intrinsic::ptrmask: {
1072 Value *MaskOp =
II->getArgOperand(1);
1075 bool DoTruncate =
false;
1079 if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
1097 MaskTy =
B.getInt32Ty();
1098 MaskOp =
B.CreateTrunc(MaskOp, MaskTy);
1101 return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->
getType(), MaskTy},
1104 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1105 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1106 Type *DestTy =
II->getType();
1113 M,
II->getIntrinsicID(), {DestTy, SrcTy, DestTy});
1114 II->setArgOperand(0, NewV);
1115 II->setCalledFunction(NewDecl);
1129 if (!isa<FixedVectorType>(VT))
1136 unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
1140 unsigned RequestedElts =
1141 count_if(Mask, [](
int MaskElt) {
return MaskElt != -1; });
1142 if (RequestedElts == 0)
1150 if (HasVOP3P && NumVectorElts == 2)
1152 unsigned NumPerms =
alignTo(RequestedElts, 2) / 2;
1155 return NumPerms + NumPermMasks;
1164 return alignTo(RequestedElts, 2) / 2;
1169 unsigned NumPerms =
alignTo(RequestedElts, 2) / 2;
1172 return NumPerms + NumPermMasks;
1188 using namespace PatternMatch;
1190 for (
auto &
Op :
I->operands()) {
1192 if (
any_of(Ops, [&](
Use *U) {
return U->get() ==
Op.get(); }))
1199 return !Ops.
empty();
1206 =
static_cast<const GCNSubtarget *
>(TM.getSubtargetImpl(*Caller));
1208 =
static_cast<const GCNSubtarget *
>(TM.getSubtargetImpl(*Callee));
1210 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1211 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1213 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1214 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1215 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1225 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1226 Callee->hasFnAttribute(Attribute::InlineHint))
1232 if (Callee->size() == 1)
1234 size_t BBSize = Caller->size() + Callee->size() - 1;
1244 const int NrOfSGPRUntilSpill = 26;
1245 const int NrOfVGPRUntilSpill = 32;
1249 unsigned adjustThreshold = 0;
1255 for (
auto ArgVT : ValueVTs) {
1259 SGPRsInUse += CCRegNum;
1261 VGPRsInUse += CCRegNum;
1271 ArgStackCost +=
const_cast<GCNTTIImpl *
>(TTIImpl)->getMemoryOpCost(
1274 ArgStackCost +=
const_cast<GCNTTIImpl *
>(TTIImpl)->getMemoryOpCost(
1280 adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
1282 adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
1284 return adjustThreshold;
1293 unsigned AllocaSize = 0;
1296 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1300 unsigned AddrSpace = Ty->getAddressSpace();
1353 static_assert(InlinerVectorBonusPercent == 0,
"vector bonus assumed to be 0");
1357 return BB.getTerminator()->getNumSuccessors() > 1;
1360 Threshold += Threshold / 2;
1366 unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;
1368 return AllocaThresholdBonus;
1384 ? getFullRateInstrCost()
1385 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(
CostKind)
1386 : getQuarterRateInstrCost(
CostKind);
1389std::pair<InstructionCost, MVT>
1390GCNTTIImpl::getTypeLegalizationCost(
Type *Ty)
const {
1415 LB.
push_back({
"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1416 LB.push_back({
"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1417 LB.push_back({
"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1418 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1420 LB.push_back({
"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1421 LB.push_back({
"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1423 LB.push_back({
"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1424 LB.push_back({
"amdgpu-waves-per-eu[1]", WavesPerEU.second});
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
unsigned const TargetRegisterInfo * TRI
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
bool hasMadMacF32Insts() const
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
SmallVector< unsigned > getMaxNumWorkGroups(const Function &F) const
Return the number of work groups for the function.
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool hasFastFMAF32() const
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
bool hasVOP3PInsts() const
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
int64_t getMaxMemIntrinsicInlineSizeThreshold() const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
an instruction to allocate memory on the stack
bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
This class represents an incoming formal argument to a Function.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
LLVM Basic Block Representation.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
Conditional or Unconditional Branch instruction.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool isInlineAsm() const
Check if this call is an inline asm statement.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
CallingConv::ID getCallingConv() const
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for a arg operand, get the arg operand number that corresponds to it.
This class represents a function call, abstracting a target machine's calling convention.
This is the shared class of boolean and integer constants.
static ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getFalse(LLVMContext &Context)
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
constexpr bool isScalar() const
Exactly one element.
Convenience struct for specifying and reasoning about fast-math flags.
Container class for subtarget features.
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
const SIRegisterInfo * getRegisterInfo() const override
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
bool hasUnalignedScratchAccessEnabled() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Generation getGeneration() const
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
bool isAlwaysUniform(const Value *V) const
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
int64_t getMaxMemIntrinsicInlineSizeThreshold() const
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
void collectKernelLaunchBounds(const Function &F, SmallVectorImpl< std::pair< StringRef, int64_t > > &LB) const
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
int getInliningLastCallToStaticBonus() const
unsigned getNumberOfRegisters(unsigned RCID) const
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
bool shouldPrefetchAddressSpace(unsigned AS) const override
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
unsigned getMaxInterleaveFactor(ElementCount VF)
unsigned getInliningThresholdMultiplier() const
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
unsigned getMinVectorRegisterBitWidth() const
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
unsigned getPrefetchDistance() const override
How much before a load we should place the prefetch instruction.
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
unsigned adjustInliningThreshold(const CallBase *CB) const
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicElementSize) const
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
bool isSourceOfDivergence(const Value *V) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
bool hasBranchDivergence(const Function *F=nullptr) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
bool hasApproxFunc() const LLVM_READONLY
Determine whether the approximate-math-functions flag is set.
bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Type * getReturnType() const
Intrinsic::ID getID() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
Represents a single loop in the control flow graph.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
A Module instance is used to store all the information related to an LLVM module.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
The main scalar evolution driver.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
std::vector< AsmOperandInfo > AsmOperandInfoVector
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
Primary interface to the complete machine description for the target machine.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
static IntegerType * getInt16Ty(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
static IntegerType * getInt8Ty(LLVMContext &C)
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
A Use represents the edge between a Value definition and its users.
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVMContext & getContext() const
All values hold a context through their type.
Base class of all SIMD vector types.
Type * getElementType() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
bool isFlatGlobalAddrSpace(unsigned AS)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isExtendedGlobalAddrSpace(unsigned AS)
@ C
The default llvm calling convention, compatible with C.
@ ADD
Simple integer binary arithmetic operators.
@ FADD
Simple binary floating point operators.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
@ AND
Bitwise operators - logical and, logical or, logical xor.
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
AtomicOrdering
Atomic ordering for LLVM's memory model.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
This struct is a compact representation of a valid (non-zero power of two) alignment.
static constexpr DenormalMode getPreserveSign()
uint64_t getScalarSizeInBits() const
unsigned countMinLeadingOnes() const
Returns the minimum number of leading one bits.
Information about a load/store intrinsic defined by the target.
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const