22#define DEBUG_TYPE "amdgpu-insert-delay-alu"
26class AMDGPUInsertDelayAlu {
40 if (
MI.getDesc().TSFlags & VA_VDST_0)
42 if (
MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
43 MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
45 if (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
51 static bool instructionWaitsForSGPRWrites(
const MachineInstr &
MI) {
58 for (
auto &
Op :
MI.operands()) {
67 enum DelayType { VALU, TRANS, SALU, OTHER };
90 static constexpr unsigned VALU_MAX = 5;
94 static constexpr unsigned TRANS_MAX = 4;
98 static constexpr unsigned SALU_CYCLES_MAX = 4;
115 uint8_t TRANSNumVALU = VALU_MAX;
121 DelayInfo() =
default;
123 DelayInfo(DelayType
Type,
unsigned Cycles) {
132 TRANSCycles = Cycles;
139 SALUCycles = std::min(Cycles, SALU_CYCLES_MAX);
145 return VALUCycles ==
RHS.VALUCycles && VALUNum ==
RHS.VALUNum &&
146 TRANSCycles ==
RHS.TRANSCycles && TRANSNum ==
RHS.TRANSNum &&
147 TRANSNumVALU ==
RHS.TRANSNumVALU && SALUCycles ==
RHS.SALUCycles;
155 VALUCycles = std::max(VALUCycles,
RHS.VALUCycles);
156 VALUNum = std::min(VALUNum,
RHS.VALUNum);
157 TRANSCycles = std::max(TRANSCycles,
RHS.TRANSCycles);
158 TRANSNum = std::min(TRANSNum,
RHS.TRANSNum);
159 TRANSNumVALU = std::min(TRANSNumVALU,
RHS.TRANSNumVALU);
160 SALUCycles = std::max(SALUCycles,
RHS.SALUCycles);
166 bool advance(DelayType
Type,
unsigned Cycles) {
169 VALUNum += (
Type == VALU);
170 if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
176 VALUCycles -= Cycles;
180 TRANSNum += (
Type == TRANS);
181 TRANSNumVALU += (
Type == VALU);
182 if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
185 TRANSNum = TRANS_MAX;
186 TRANSNumVALU = VALU_MAX;
189 TRANSCycles -= Cycles;
193 if (SALUCycles <= Cycles) {
198 SALUCycles -= Cycles;
205#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
208 dbgs() <<
" VALUCycles=" << (int)VALUCycles;
209 if (VALUNum < VALU_MAX)
210 dbgs() <<
" VALUNum=" << (int)VALUNum;
212 dbgs() <<
" TRANSCycles=" << (int)TRANSCycles;
213 if (TRANSNum < TRANS_MAX)
214 dbgs() <<
" TRANSNum=" << (int)TRANSNum;
215 if (TRANSNumVALU < VALU_MAX)
216 dbgs() <<
" TRANSNumVALU=" << (int)TRANSNumVALU;
218 dbgs() <<
" SALUCycles=" << (int)SALUCycles;
224 struct DelayState :
DenseMap<unsigned, DelayInfo> {
228 for (
const auto &KV :
RHS) {
231 std::tie(It, Inserted) = insert(KV);
233 It->second.merge(KV.second);
239 void advance(DelayType
Type,
unsigned Cycles) {
241 for (
auto I = begin(), E = end();
I != E;
I = Next) {
243 if (
I->second.advance(
Type, Cycles))
248 void advanceByVALUNum(
unsigned VALUNum) {
250 for (
auto I = begin(), E = end();
I != E;
I = Next) {
252 if (
I->second.VALUNum >= VALUNum &&
I->second.VALUCycles > 0) {
258#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
261 dbgs() <<
" empty\n";
271 return A->first <
B->first;
291 if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
292 Imm |= 4 + Delay.TRANSNum;
296 if (Delay.VALUNum < DelayInfo::VALU_MAX &&
297 Delay.VALUNum <= Delay.TRANSNumVALU) {
299 Imm |= Delay.VALUNum << 7;
301 Imm |= Delay.VALUNum;
305 if (Delay.SALUCycles) {
306 assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
310 }
else if (Imm & 0xf) {
311 Imm |= (Delay.SALUCycles + 8) << 7;
313 Imm |= Delay.SALUCycles + 8;
323 if (!(Imm & 0x780) && LastDelayAlu) {
328 if (!
I->isBundle() && !
I->isMetaInstruction())
333 unsigned LastImm =
Op.getImm();
334 assert((LastImm & ~0xf) == 0 &&
335 "Remembered an s_delay_alu with no room for another delay!");
336 LastImm |= Imm << 7 | Skip << 4;
342 auto &
MBB = *
MI.getParent();
347 return (Imm & 0x780) ? nullptr : DelayAlu;
353 State.merge(BlockState[Pred]);
359 bool Changed =
false;
366 if (
MI.isBundle() ||
MI.isMetaInstruction())
370 switch (
MI.getOpcode()) {
371 case AMDGPU::SI_RETURN_TO_EPILOG:
375 DelayType
Type = getDelayType(
MI);
377 if (instructionWaitsForSGPRWrites(
MI)) {
378 auto It = State.find(LastSGPRFromVALU);
379 if (It != State.end()) {
380 DelayInfo
Info = It->getSecond();
381 State.advanceByVALUNum(
Info.VALUNum);
382 LastSGPRFromVALU = 0;
386 if (instructionWaitsForVALU(
MI)) {
389 State = DelayState();
390 }
else if (
Type != OTHER) {
393 for (
const auto &
Op :
MI.explicit_uses()) {
398 if (
MI.getOpcode() == AMDGPU::V_WRITELANE_B32 &&
Op.isTied())
401 auto It = State.find(Unit);
402 if (It != State.end()) {
403 Delay.merge(It->second);
411 for (
const auto &
Op :
MI.defs()) {
414 LastSGPRFromVALU = *
TRI->regunits(Reg).begin();
420 if (Emit && !
MI.isBundledWithPred()) {
423 LastDelayAlu = emitDelayAlu(
MI, Delay, LastDelayAlu);
429 for (
const auto &
Op :
MI.defs()) {
431 &
MI,
Op.getOperandNo(),
nullptr, 0);
444 State.advance(
Type, Cycles);
451 "Basic block state should not have changed on final pass!");
452 }
else if (DelayState &BS = BlockState[&
MBB]; State != BS) {
453 BS = std::move(State);
464 if (!ST->hasDelayAlu())
467 SII = ST->getInstrInfo();
468 TRI = ST->getRegisterInfo();
476 while (!WorkList.
empty()) {
478 bool Changed = runOnMachineBasicBlock(
MBB,
false);
487 bool Changed =
false;
489 Changed |= runOnMachineBasicBlock(
MBB,
true);
508 AMDGPUInsertDelayAlu Impl;
517 if (!AMDGPUInsertDelayAlu().
run(MF))
524char AMDGPUInsertDelayAluLegacy::ID = 0;
529 "AMDGPU Insert Delay ALU",
false,
false)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
AMD GCN specific subclass of TargetSubtarget.
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
Register const TargetRegisterInfo * TRI
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Interface definition for SIInstrInfo.
This file implements a set that has insertion order iteration characteristics.
A container for analyses that lazily runs them and caches their results.
Represent the analysis usage information of a pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Represents analyses that only rely on functions' control flow.
This class represents an Operation in the Expression.
bool skipFunction(const Function &F) const
Optional passes call this function to check whether the pass should be skipped.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Wrapper class representing virtual and physical registers.
bool isXDLWMMA(const MachineInstr &MI) const
static bool isSALU(const MachineInstr &MI)
const TargetSchedModel & getSchedModel() const
static bool isTRANS(const MachineInstr &MI)
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isVALU(const MachineInstr &MI)
A vector that has set insertion semantics.
void insert_range(Range &&R)
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
value_type pop_back_val()
void reserve(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Provide an instruction scheduling machine model to CodeGen passes.
LLVM_ABI unsigned computeOperandLatency(const MachineInstr *DefMI, unsigned DefOperIdx, const MachineInstr *UseMI, unsigned UseOperIdx) const
Compute operand latency based on the available machine model.
The instances of the Type class are immutable: once they are created, they are never changed.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned decodeFieldVaVdst(unsigned Encoded)
bool isSGPR(MCRegister Reg, const MCRegisterInfo *TRI)
Is Reg - scalar register.
bool isGFX1250(const MCSubtargetInfo &STI)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
This is an optimization pass for GlobalISel generic memory operations.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool operator!=(uint64_t V1, const APInt &V2)
LLVM_ABI Printable printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI)
Create Printable object to print register units on a raw_ostream.
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
auto reverse(ContainerTy &&C)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
char & AMDGPUInsertDelayAluID
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
PreservedAnalyses run(MachineFunction &F, MachineFunctionAnalysisManager &MFAM)