23#define DEBUG_TYPE "amdgpu-wait-sgpr-hazards"
27 cl::desc(
"Enable required s_wait_alu on SGPR hazards"));
31 cl::desc(
"Cull hazards on function boundaries"));
36 cl::desc(
"Cull hazards on memory waits"));
40 cl::desc(
"Number of tracked SGPRs before initiating hazard cull on memory "
45class AMDGPUWaitSGPRHazards {
52 bool EnableSGPRHazardWaits;
53 bool CullSGPRHazardsOnFunctionBoundary;
54 bool CullSGPRHazardsAtMemWait;
55 unsigned CullSGPRHazardsMemWaitThreshold;
57 AMDGPUWaitSGPRHazards() {}
60 static std::optional<unsigned> sgprNumber(
Register Reg,
67 case AMDGPU::SGPR_NULL:
68 case AMDGPU::SGPR_NULL64:
73 unsigned RegN =
TRI.getHWRegIndex(Reg);
79 static inline bool isVCC(
Register Reg) {
80 return Reg == AMDGPU::VCC ||
Reg == AMDGPU::VCC_LO ||
Reg == AMDGPU::VCC_HI;
91 while (
I->isBundledWithPred())
97 if (
I->getOpcode() != AMDGPU::S_GETPC_B64)
101 const unsigned NewBytes = 4;
103 "Unexpected instruction insertion in bundle");
106 while (NextMI !=
End && NextMI->isBundledWithPred()) {
107 for (
auto &Operand : NextMI->operands()) {
108 if (Operand.isGlobal())
109 Operand.setOffset(Operand.getOffset() + NewBytes);
116 static constexpr unsigned None = 0;
117 static constexpr unsigned SALU = (1 << 0);
118 static constexpr unsigned VALU = (1 << 1);
120 std::bitset<64> Tracked;
121 std::bitset<128> SALUHazards;
122 std::bitset<128> VALUHazards;
123 unsigned VCCHazard =
None;
124 bool ActiveFlat =
false;
126 bool merge(
const HazardState &RHS) {
127 HazardState Orig(*
this);
129 return (*
this != Orig);
132 bool operator==(
const HazardState &RHS)
const {
133 return Tracked ==
RHS.Tracked && SALUHazards ==
RHS.SALUHazards &&
134 VALUHazards ==
RHS.VALUHazards && VCCHazard ==
RHS.VCCHazard &&
135 ActiveFlat ==
RHS.ActiveFlat;
138 bool operator!=(
const HazardState &RHS)
const {
return !(*
this ==
RHS); }
141 Tracked |=
RHS.Tracked;
142 SALUHazards |=
RHS.SALUHazards;
143 VALUHazards |=
RHS.VALUHazards;
144 VCCHazard |=
RHS.VCCHazard;
145 ActiveFlat |=
RHS.ActiveFlat;
149 struct BlockHazardState {
156 static constexpr unsigned WAVE32_NOPS = 4;
157 static constexpr unsigned WAVE64_NOPS = 8;
162 unsigned Count = DsNopCount;
168 enum { WA_VALU = 0x1, WA_SALU = 0x2, WA_VCC = 0x4 };
170 HazardState State = BlockState[&
MBB].In;
178 if (
MI->isMetaInstruction())
182 if (
MI->getOpcode() == AMDGPU::DS_NOP) {
183 if (++DsNops >= DsNopCount)
184 State.Tracked.reset();
192 State.ActiveFlat =
true;
196 State.VCCHazard = HazardState::None;
197 State.SALUHazards.reset();
198 State.VALUHazards.reset();
203 if (
MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
204 unsigned int Mask =
MI->getOperand(0).getImm();
206 State.VCCHazard &= ~HazardState::VALU;
208 State.SALUHazards.reset();
209 State.VCCHazard &= ~HazardState::SALU;
212 State.VALUHazards.reset();
217 if (CullSGPRHazardsAtMemWait &&
218 (
MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT ||
219 MI->getOpcode() == AMDGPU::S_WAIT_SAMPLECNT ||
220 MI->getOpcode() == AMDGPU::S_WAIT_BVHCNT) &&
221 (
MI->getOperand(0).isImm() &&
MI->getOperand(0).getImm() == 0) &&
222 (State.Tracked.count() >= CullSGPRHazardsMemWaitThreshold)) {
223 if (
MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT && State.ActiveFlat) {
224 State.ActiveFlat =
false;
226 State.Tracked.reset();
228 insertHazardCull(
MBB,
MI);
236 if (!IsVALU && !IsSALU)
246 if (!
TRI->isSGPRReg(*MRI, Reg))
250 if (!SeenRegs.
insert(Reg).second)
253 auto RegNumber = sgprNumber(Reg, *TRI);
259 unsigned RegN = *RegNumber;
260 unsigned PairN = (RegN >> 1) & 0x3f;
264 if (!State.Tracked[PairN]) {
266 State.Tracked.set(PairN);
277 if (State.VCCHazard & HazardState::VALU)
278 State.VCCHazard = HazardState::None;
280 State.VALUHazards.reset();
284 for (
uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
285 Wait |= State.SALUHazards[RegN + RegIdx] ? WA_SALU : 0;
286 Wait |= IsVALU && State.VALUHazards[RegN + RegIdx] ? WA_VALU : 0;
288 if (isVCC(Reg) && State.VCCHazard) {
291 if (State.VCCHazard & HazardState::SALU)
293 if (State.VCCHazard & HazardState::VALU)
299 State.VCCHazard = IsSALU ? HazardState::SALU : HazardState::VALU;
301 for (
uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
303 State.SALUHazards.set(RegN + RegIdx);
305 State.VALUHazards.set(RegN + RegIdx);
312 (
MI->isCall() ||
MI->isReturn() ||
MI->isIndirectBranch()) &&
313 MI->getOpcode() != AMDGPU::S_ENDPGM &&
314 MI->getOpcode() != AMDGPU::S_ENDPGM_SAVED;
317 const bool HasImplicitVCC =
319 [](
MCPhysReg Reg) { return isVCC(Reg); }) ||
321 [](
MCPhysReg Reg) { return isVCC(Reg); });
326 if (State.VCCHazard & HazardState::VALU)
328 if (State.SALUHazards.any() || (State.VCCHazard & HazardState::SALU))
330 if (State.VALUHazards.any())
332 if (CullSGPRHazardsOnFunctionBoundary && State.Tracked.any()) {
333 State.Tracked.reset();
335 insertHazardCull(
MBB,
MI);
341 if (
Op.isImplicit() &&
342 (!HasImplicitVCC || !
Op.isReg() || !isVCC(
Op.getReg())))
344 processOperand(
Op,
true);
350 unsigned Mask = 0xffff;
352 State.VCCHazard &= ~HazardState::VALU;
355 if (
Wait & WA_SALU) {
356 State.SALUHazards.reset();
357 State.VCCHazard &= ~HazardState::SALU;
360 if (
Wait & WA_VALU) {
361 State.VALUHazards.reset();
366 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
374 if (
MI->isCall() && !CullSGPRHazardsOnFunctionBoundary)
380 if (
Op.isImplicit() &&
381 (!HasImplicitVCC || !
Op.isReg() || !isVCC(
Op.getReg())))
383 processOperand(
Op,
false);
387 bool Changed = State != BlockState[&
MBB].Out;
389 assert(!Changed &&
"Hazard state should not change on emit pass");
393 BlockState[&
MBB].Out = State;
399 if (!
ST.hasVALUReadSGPRHazard())
410 "amdgpu-sgpr-hazard-wait", EnableSGPRHazardWaits);
412 CullSGPRHazardsOnFunctionBoundary =
415 CullSGPRHazardsAtMemWait =
418 CullSGPRHazardsMemWaitThreshold =
420 "amdgpu-sgpr-hazard-mem-wait-cull-threshold",
421 CullSGPRHazardsMemWaitThreshold);
424 if (!EnableSGPRHazardWaits)
427 TII =
ST.getInstrInfo();
428 TRI =
ST.getRegisterInfo();
430 DsNopCount =
ST.isWave64() ? WAVE64_NOPS : WAVE32_NOPS;
434 !CullSGPRHazardsOnFunctionBoundary) {
438 BlockState[&EntryBlock].In.Tracked.set();
452 while (!Worklist.
empty()) {
454 bool Changed = runOnMachineBasicBlock(
MBB,
false);
457 HazardState NewState = BlockState[&
MBB].Out;
461 auto &SuccState = BlockState[Succ];
462 if (Succ->getSinglePredecessor() && !Succ->isEntryBlock()) {
463 if (SuccState.In != NewState) {
464 SuccState.In = NewState;
467 }
else if (SuccState.In.merge(NewState)) {
477 bool Changed =
false;
479 Changed |= runOnMachineBasicBlock(
MBB,
true);
493 return AMDGPUWaitSGPRHazards().run(MF);
504char AMDGPUWaitSGPRHazardsLegacy::ID = 0;
509 "AMDGPU Insert waits for SGPR read hazards",
false,
false)
514 if (AMDGPUWaitSGPRHazards().run(MF))
unsigned const MachineRegisterInfo * MRI
Provides AMDGPU specific target descriptions.
static cl::opt< bool > GlobalCullSGPRHazardsAtMemWait("amdgpu-sgpr-hazard-mem-wait-cull", cl::init(false), cl::Hidden, cl::desc("Cull hazards on memory waits"))
static cl::opt< unsigned > GlobalCullSGPRHazardsMemWaitThreshold("amdgpu-sgpr-hazard-mem-wait-cull-threshold", cl::init(8), cl::Hidden, cl::desc("Number of tracked SGPRs before initiating hazard cull on memory " "wait"))
static cl::opt< bool > GlobalCullSGPRHazardsOnFunctionBoundary("amdgpu-sgpr-hazard-boundary-cull", cl::init(false), cl::Hidden, cl::desc("Cull hazards on function boundaries"))
static cl::opt< bool > GlobalEnableSGPRHazardWaits("amdgpu-sgpr-hazard-wait", cl::init(true), cl::Hidden, cl::desc("Enable required s_wait_alu on SGPR hazards"))
static void updateGetPCBundle(MachineInstr *NewMI)
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
unsigned const TargetRegisterInfo * TRI
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file implements a set that has insertion order iteration characteristics.
A container for analyses that lazily runs them and caches their results.
Represent the analysis usage information of a pass.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
This class represents an Operation in the Expression.
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this function.
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
instr_iterator instr_begin()
Instructions::iterator instr_iterator
instr_iterator instr_end()
iterator_range< succ_iterator > successors()
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of passes that operate on the MachineFunction representation.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformation or analysis.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool isBundled() const
Return true if this instruction is part of a bundle.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Wrapper class representing virtual and physical registers.
static bool isVMEM(const MachineInstr &MI)
static bool isSMRD(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
A vector that has set insertion semantics.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
value_type pop_back_val()
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
int getNumOccurrences() const
self_iterator getIterator()
unsigned decodeFieldVaVcc(unsigned Encoded)
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned decodeFieldVaSdst(unsigned Encoded)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst)
bool isEntryFunctionCC(CallingConv::ID CC)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
@ Emitted
Assigned address, still materializing.
This is an optimization pass for GlobalISel generic memory operations.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool operator!=(uint64_t V1, const APInt &V2)
char & AMDGPUWaitSGPRHazardsLegacyID
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
auto reverse(ContainerTy &&C)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
DWARFExpression::Operation Op
bool operator|=(SparseBitVector< ElementSize > &LHS, const SparseBitVector< ElementSize > *RHS)