#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));
class AMDGPULateCodeGenPrepare
    : public InstVisitor<AMDGPULateCodeGenPrepare, bool> {

      : F(F), DL(F.getDataLayout()), ST(ST), AC(AC), UA(UA) {}

  bool isDWORDAligned(const Value *V) const {

  bool canWidenScalarExtLoad(LoadInst &LI) const;
class LiveRegOptimizer {

  Type *const ConvertToScalar;

  Type *calculateConvertType(Type *OriginalType);

  bool shouldReplace(Type *ITy) {
    const auto *TLI = ST.getTargetLowering();

  bool isOpLegal(Instruction *I) { return isa<StoreInst, IntrinsicInst>(I); }

    for (User *V : II->users())
      if (auto *UseInst = dyn_cast<Instruction>(V))

    if (const auto *Intr = dyn_cast<IntrinsicInst>(II))
      return Intr->getIntrinsicID() == Intrinsic::amdgcn_perm;

    while (!UserList.empty()) {
      if (!CVisited.insert(CII).second)

      if (CII->getParent() == II->getParent() && !IsLookThru(II))

        if (auto *UseInst = dyn_cast<Instruction>(V))

      ConvertToScalar(Type::getInt32Ty(Mod.getContext())) {}
bool AMDGPULateCodeGenPrepare::run() {
  LiveRegOptimizer LRO(*F.getParent(), ST);

  bool Changed = false;

  bool HasScalarSubwordLoads = ST.hasScalarSubwordLoads();

      Changed |= !HasScalarSubwordLoads && visit(I);
      Changed |= LRO.optimizeLiveType(&I, DeadInsts);
Type *LiveRegOptimizer::calculateConvertType(Type *OriginalType) {

  TypeSize OriginalSize = DL.getTypeSizeInBits(VTy);
  TypeSize ConvertScalarSize = DL.getTypeSizeInBits(ConvertToScalar);
  unsigned ConvertEltCount =
      (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;

  if (OriginalSize <= ConvertScalarSize)

                         ConvertEltCount, false);
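// Hedged worked example (values assumed, not from this listing): for a
// <3 x i16> live-out, OriginalSize is 48 bits and ConvertScalarSize is 32
// bits, so ConvertEltCount = (48 + 32 - 1) / 32 = 2 and the value would be
// carried across blocks as a <2 x i32>.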
  Type *NewTy = calculateConvertType(V->getType());

  TypeSize OriginalSize = DL.getTypeSizeInBits(VTy);
  TypeSize NewSize = DL.getTypeSizeInBits(NewTy);

  if (OriginalSize == NewSize)
    return Builder.CreateBitCast(V, NewTy, V->getName() + ".bc");

  assert(NewSize > OriginalSize);

  for (unsigned I = 0; I < OriginalElementCount; I++)

  for (uint64_t I = OriginalElementCount; I < ExpandedVecElementCount; I++)
    ShuffleMask.push_back(OriginalElementCount);

  Value *ExpandedVec = Builder.CreateShuffleVector(V, ShuffleMask);
  return Builder.CreateBitCast(ExpandedVec, NewTy, V->getName() + ".bc");
  TypeSize OriginalSize = DL.getTypeSizeInBits(V->getType());
  TypeSize NewSize = DL.getTypeSizeInBits(NewVTy);

  if (OriginalSize == NewSize)
    return Builder.CreateBitCast(V, NewVTy, V->getName() + ".bc");

  assert(OriginalSize > NewSize);

  if (!V->getType()->isVectorTy()) {

    return cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));

      cast<Instruction>(Builder.CreateBitCast(V, ExpandedVT));

  std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);

  return Builder.CreateShuffleVector(Converted, ShuffleMask);
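// Hedged illustration of both directions (element counts assumed; the loop
// bodies and mask sizing are not shown in this excerpt): coercing a
// <3 x i16> to the 64-bit carrier would pad it to <4 x i16> with a shuffle
// mask of {0, 1, 2, 3}, where index 3 reads the implicit poison second
// operand, before bitcasting to <2 x i32>; converting back would bitcast
// the <2 x i32> to an expanded <4 x i16> and use the std::iota mask
// {0, 1, 2} to extract only the three original lanes.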
bool LiveRegOptimizer::optimizeLiveType(

  while (!Worklist.empty()) {

    if (!shouldReplace(II->getType()))

    if (!isCoercionProfitable(II))

    if (PHINode *Phi = dyn_cast<PHINode>(II)) {

      for (Value *V : Phi->incoming_values()) {

        if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
          if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))

        if (!IncInst && !isa<ConstantAggregateZero>(V))

    for (User *V : II->users()) {

      if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
        if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))

        if (UseInst->getParent() != II->getParent() || isa<PHINode>(II)) {
          Uses.insert(UseInst);
          if (!isa<PHINode>(II))

    Value *ConvertVal = convertToOptType(D, InsertPt);

    ValMap[D] = ConvertVal;
  for (PHINode *Phi : PhiNodes) {

                                  Phi->getNumIncomingValues(),
                                  Phi->getName() + ".tc", Phi->getIterator());

  for (PHINode *Phi : PhiNodes) {
    PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
    bool MissingIncVal = false;
    for (int I = 0, E = Phi->getNumIncomingValues(); I < E; I++) {
      Value *IncVal = Phi->getIncomingValue(I);
      if (isa<ConstantAggregateZero>(IncVal)) {
        Type *NewType = calculateConvertType(Phi->getType());
        NewPhi->addIncoming(ConstantInt::get(NewType, 0, false),
                            Phi->getIncomingBlock(I));

        MissingIncVal = true;

  while (!PHIWorklist.empty()) {

    VisitedPhis.insert(NextDeadValue);

      return ValMap[CandPhi] == NextDeadValue;

    if (OriginalPhi != PhiNodes.end())
      ValMap.erase(*OriginalPhi);

    DeadInsts.emplace_back(cast<Instruction>(NextDeadValue));

      if (!VisitedPhis.contains(cast<PHINode>(U)))
      Value *NewVal = nullptr;
      if (BBUseValMap.contains(U->getParent()) &&
          BBUseValMap[U->getParent()].contains(Val))
        NewVal = BBUseValMap[U->getParent()][Val];

        if (isa<Instruction>(Op) && !isa<PHINode>(Op) &&
            U->getParent() == cast<Instruction>(Op)->getParent()) {

              convertFromOptType(Op->getType(), cast<Instruction>(ValMap[Op]),
                                 InsertPt, U->getParent());
          BBUseValMap[U->getParent()][ValMap[Op]] = NewVal;

      U->setOperand(OpIdx, NewVal);
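// Hedged reading of the caching above: values converted back from the
// carrier type appear to be memoized per use block in BBUseValMap, so
// several uses of the same coerced value within one basic block share a
// single convertFromOptType expansion rather than re-emitting it.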
bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {

  unsigned TySize = DL.getTypeStoreSize(Ty);
bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {

  if (!canWidenScalarExtLoad(LI))

  if (!isDWORDAligned(Base))

  int64_t Adjust = Offset & 0x3;

  unsigned LdBits = DL.getTypeStoreSizeInBits(LI.getType());

  auto *NewPtr = IRB.CreateConstGEP1_64(

  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));

  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  unsigned ShAmt = Adjust * 8;
  Value *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt),
                      DL.typeSizeEqualsStoreSize(LI.getType()) ? IntNTy
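// Hedged IR-level sketch (types and offset assumed, not from this listing):
// for an i8 load at a constant-address base plus offset 1, the widening
// would emit an aligned i32 load of the containing dword; Adjust = 1 gives
// ShAmt = 8, and the original byte is recovered by shifting and truncating:
//   %wide  = load i32, ptr addrspace(4) %dword.ptr, align 4
//   %shift = lshr i32 %wide, 8
//   %byte  = trunc i32 %shift to i8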
  bool Changed = AMDGPULateCodeGenPrepare(F, ST, &AC, UI).run();

  return "AMDGPU IR late optimizations";

      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);

      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  return AMDGPULateCodeGenPrepare(F, ST, &AC, UI).run();

                      "AMDGPU IR late optimizations", false, false)