AMDGPUTargetTransformInfo.h
//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file provides a TargetTransformInfoImplBase conforming object specific
/// to the AMDGPU target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H

#include "AMDGPU.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
#include <optional>

namespace llvm {

class AMDGPUTargetMachine;
class GCNSubtarget;
class InstCombiner;
class Loop;
class ScalarEvolution;
class SITargetLowering;
class Type;
class Value;

class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
  using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  Triple TargetTriple;

  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) const override;

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) const override;

  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override;
};

class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
  using BaseT = BasicTTIImplBase<GCNTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const GCNSubtarget *ST;
  const SITargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;
  bool IsGraphics;
  bool HasFP32Denormals;
  bool HasFP64FP16Denormals;
  static constexpr bool InlinerVectorBonusPercent = 0;

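  // Subtarget features that are allowed to differ between caller and callee
  // when deciding whether inlining is compatible (see areInlineCompatible).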
  static const FeatureBitset InlineFeatureIgnoreList;

  const GCNSubtarget *getST() const { return ST; }
  const SITargetLowering *getTLI() const { return TLI; }

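  // Cost helpers expressed relative to a full-rate ALU instruction; half- and
  // quarter-rate operations are charged proportionally more.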
  static inline int getFullRateInstrCost() {
    return TargetTransformInfo::TCC_Basic;
  }

  static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 2 * TargetTransformInfo::TCC_Basic;
  }

  // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
  // should be 2 or 4.
  static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 4 * TargetTransformInfo::TCC_Basic;
  }

  // On some parts, normal fp64 operations are half rate, and others
  // quarter. This also applies to some integer operations.
  int get64BitInstrCost(TTI::TargetCostKind CostKind) const;

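  // Estimated cost of legalizing the given type, together with the MVT it
  // legalizes to.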
  std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const;

public:
  explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  bool hasBranchDivergence(const Function *F = nullptr) const override;

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) const override;

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) const override;

  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override {
    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
    return TTI::PSK_FastHardware;
  }

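  // Register-file and vector-width queries consulted by the loop, SLP, and
  // load/store vectorizers.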
  unsigned getNumberOfRegisters(unsigned RCID) const override;
  TypeSize
  getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override;
  unsigned getMinVectorRegisterBitWidth() const override;
  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override;
  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                               unsigned ChainSizeInBytes,
                               VectorType *VecTy) const override;
  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const override;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override;

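  // Chain legality checks used by the LoadStoreVectorizer when merging
  // adjacent memory accesses.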
  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
                                   unsigned AddrSpace) const override;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                    unsigned AddrSpace) const override;

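  // Hooks used when expanding memcpy/memmove intrinsics into load/store loops.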
  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override;
  Type *getMemcpyLoopLoweringType(
      LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
      unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
      std::optional<uint32_t> AtomicElementSize) const override;

  void getMemcpyLoopResidualLoweringType(
      SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
      unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
      Align SrcAlign, Align DestAlign,
      std::optional<uint32_t> AtomicCpySize) const override;
  unsigned getMaxInterleaveFactor(ElementCount VF) const override;

  bool getTgtMemIntrinsic(IntrinsicInst *Inst,
                          MemIntrinsicInfo &Info) const override;

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = {},
      const Instruction *CxtI = nullptr) const override;

  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr) const override;

  bool isInlineAsmSourceOfDivergence(const CallInst *CI,
                                     ArrayRef<unsigned> Indices = {}) const;

  using BaseT::getVectorInstrCost;
  InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                     TTI::TargetCostKind CostKind,
                                     unsigned Index, const Value *Op0,
                                     const Value *Op1) const override;

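  // Divergence hooks: report which values are known to be uniform across a
  // wavefront and which may differ per lane.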
  bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
  bool isSourceOfDivergence(const Value *V) const override;
  bool isAlwaysUniform(const Value *V) const override;

  bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override {
    // Address space casts must cast between different address spaces.
    if (FromAS == ToAS)
      return false;

    // Casts between any aliasing address spaces are valid.
    return AMDGPU::addrspacesMayAlias(FromAS, ToAS);
  }

  bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const override {
    return AMDGPU::addrspacesMayAlias(AS0, AS1);
  }

  unsigned getFlatAddressSpace() const override {
    // Don't bother running InferAddressSpaces pass on graphics shaders which
    // don't use flat addressing.
    if (IsGraphics)
      return -1;
    return AMDGPUAS::FLAT_ADDRESS;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const override;

  bool
  canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const override {
    return AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
           AS != AMDGPUAS::PRIVATE_ADDRESS;
  }

  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const override;

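  // Target hooks and helpers used by InstCombine to simplify AMDGPU
  // intrinsics.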
  bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
                                 const Value *Op1, InstCombiner &IC) const;

  bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II,
                                   unsigned LaneAgIdx) const;

  std::optional<Instruction *>
  instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override;

  Value *simplifyAMDGCNLaneIntrinsicDemanded(InstCombiner &IC,
                                             IntrinsicInst &II,
                                             const APInt &DemandedElts,
                                             APInt &UndefElts) const;

  Instruction *hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
                                                IntrinsicInst &II) const;

  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) const override;

  InstructionCost getVectorSplitCost() const { return 0; }

  InstructionCost
  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                 const Instruction *CxtI = nullptr) const override;

  bool isProfitableToSinkOperands(Instruction *I,
                                  SmallVectorImpl<Use *> &Ops) const override;

  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const override;

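  // Inlining heuristics. The multiplier and bonuses below bias the inliner
  // toward inlining far more aggressively than the generic cost model.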
  int getInliningLastCallToStaticBonus() const override;
  unsigned getInliningThresholdMultiplier() const override { return 11; }
  unsigned adjustInliningThreshold(const CallBase *CB) const override;
  unsigned getCallerAllocaCost(const CallBase *CB,
                               const AllocaInst *AI) const override;

  int getInlinerVectorBonusPercent() const override {
    return InlinerVectorBonusPercent;
  }

  InstructionCost
  getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                             std::optional<FastMathFlags> FMF,
                             TTI::TargetCostKind CostKind) const override;

  InstructionCost
  getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                        TTI::TargetCostKind CostKind) const override;
  InstructionCost
  getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
                         TTI::TargetCostKind CostKind) const override;

  /// Data cache line size for LoopDataPrefetch pass. Has no use before GFX12.
  unsigned getCacheLineSize() const override { return 128; }

  /// How much before a load we should place the prefetch instruction.
  /// This is currently measured in number of IR instructions.
  unsigned getPrefetchDistance() const override;

  /// \return if the target wants to issue a prefetch in address space \p AS.
  bool shouldPrefetchAddressSpace(unsigned AS) const override;
  void collectKernelLaunchBounds(
      const Function &F,
      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override;

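  // Tri-state answer for whether the "amdgpu-ieee" attribute is known to be
  // enabled in a given context.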
  enum class KnownIEEEMode { Unknown, On, Off };

  /// Return KnownIEEEMode::On if we know the use context can assume
  /// "amdgpu-ieee"="true" and KnownIEEEMode::Off if we can assume
  /// "amdgpu-ieee"="false".
  KnownIEEEMode fpenvIEEEMode(const Instruction &I) const;

  /// Account for loads of i8 vector types to have reduced cost. For example,
  /// the cost of loading 4 i8 values is the cost of loading a single i32
  /// value.
  InstructionCost getMemoryOpCost(
      unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
      TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
      const Instruction *I = nullptr) const override;

  /// When counting parts on AMD GPUs, account for i8s being grouped
  /// together under a single i32 value. Otherwise fall back to base
  /// implementation.
  unsigned getNumberOfParts(Type *Tp) const override;
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H