struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");
    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");
    return false;
  }
};
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));

               cl::desc("Insert a s_nop x before every instruction"));
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
      TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;

  EmittedInstrs.clear();
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 ||
         Opcode == AMDGPU::V_DIV_FMAS_F64_e64;

  return Opcode == AMDGPU::S_GETREG_B32 ||
         Opcode == AMDGPU::S_GETREG_B32_const;

  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:

  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;

  return Opcode == AMDGPU::S_RFE_B64;

  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
  if (TII.isAlwaysGDS(MI.getOpcode()))

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:

  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:

    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE64_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;

                                        AMDGPU::OpName::simm16);
  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)

  if (checkFPAtomicToDenormModeHazard(MI) > 0)

  if (ST.hasNoDataDepHazard())

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)

      checkMAIVALUHazards(MI) > 0)

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)

  if (((ST.hasReadM0MovRelInterpHazard() &&
        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
       (ST.hasReadM0LdsDirectHazard() &&
        MI->readsRegister(AMDGPU::LDS_DIRECT, nullptr))) &&
      checkReadM0Hazards(MI) > 0)

      checkMAILdStHazards(MI) > 0)

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
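// Number of wait states an MFMA occupies in its result pipeline, taken from
// the ReleaseAtCycle of the instruction's resolved scheduling class.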
GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
  const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
  assert(TSchedModel.getWriteProcResBegin(SC) !=
         TSchedModel.getWriteProcResEnd(SC));
  return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
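// Run hazard detection/fixup over every instruction inside the current
// bundle, updating the emitted-instruction lookahead window as each one is
// processed.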
void GCNHazardRecognizer::processBundle() {

  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);

  CurrCycleInstr = nullptr;
  assert(IsHazardRecognizerMode);

  if (MI->isInsideBundle())

  IsHazardRecognizerMode = true;

  CurrCycleInstr = nullptr;
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())

    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

      checkMAIVALUHazards(MI) > 0)
    WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

    return std::max(WaitStates, checkGetRegHazards(MI));

    return std::max(WaitStates, checkSetRegHazards(MI));

    return std::max(WaitStates, checkRFEHazards(MI));

  if ((ST.hasReadM0MovRelInterpHazard() &&
        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
      (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
      (ST.hasReadM0LdsDirectHazard() &&
       MI->readsRegister(AMDGPU::LDS_DIRECT, nullptr)))
    return std::max(WaitStates, checkReadM0Hazards(MI));

    return std::max(WaitStates, checkMAIHazards(MI));

    return std::max(WaitStates, checkMAILdStHazards(MI));

    return std::max(WaitStates, checkPermlaneHazards(MI));
    EmittedInstrs.push_front(nullptr);

  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);

  if (CurrCycleInstr->isBundle()) {

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;

  EmittedInstrs.push_front(CurrCycleInstr);

    EmittedInstrs.push_front(nullptr);

  CurrCycleInstr = nullptr;

  assert(!IsHazardRecognizerMode &&
         "Bottom-up scheduling shouldn't run in hazard recognizer mode");
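// Helpers for the breadth-first, state-tracking hazard search below: states
// are stored in a vector and hashed/compared through lightweight StateMapKey
// handles so they can be deduplicated across basic blocks.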
template <typename StateT>

  static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {

  static inline StateMapKey getEmptyKey() {

  static inline StateMapKey getTombstoneKey() {

  static unsigned getHashValue(const StateMapKey &Key) {
    return StateT::getHashValue((*Key.States)[Key.Idx]);

  static unsigned getHashValue(const StateT &State) {
    return StateT::getHashValue(State);

  static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
    const auto EKey = getEmptyKey();
    const auto TKey = getTombstoneKey();
    if (StateMapKey::isEqual(LHS, EKey) || StateMapKey::isEqual(RHS, EKey) ||
        StateMapKey::isEqual(LHS, TKey) || StateMapKey::isEqual(RHS, TKey))
      return StateMapKey::isEqual(LHS, RHS);
    return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]);

  static bool isEqual(const StateT &LHS, const StateMapKey &RHS) {
    if (StateMapKey::isEqual(RHS, getEmptyKey()) ||
        StateMapKey::isEqual(RHS, getTombstoneKey()))
    return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
  StateT State = InitialState;

  unsigned WorkIdx = 0;

  bool Expired = false;
  for (auto E = MBB->instr_rend(); I != E; ++I) {

    auto Result = IsHazard(State, *I);
    if (Result == HazardFound)
    if (Result == HazardExpired) {

    if (I->isInlineAsm() || I->isMetaInstruction())

    UpdateState(State, *I);

    unsigned StateIdx = States.size();
    StateMapKey Key = {&States, StateIdx};
    auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
    if (Insertion.second) {
      StateIdx = Insertion.first->second;

      Worklist.insert(std::pair(Pred, StateIdx));

    if (WorkIdx == Worklist.size())

    std::tie(MBB, StateIdx) = Worklist[WorkIdx++];
    State = States[StateIdx];
    I = MBB->instr_rbegin();
  for (auto E = MBB->instr_rend(); I != E; ++I) {

    if (I->isInlineAsm())

    WaitStates += GetNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();

  int MinWaitStates = std::numeric_limits<int>::max();
    if (!Visited.insert(Pred).second)

                                 IsExpired, Visited, GetNumWaitStates);

    MinWaitStates = std::min(MinWaitStates, W);

  return MinWaitStates;

                              std::next(MI->getReverseIterator()), 0, IsExpired,
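// Walk backwards from the current instruction (or through the emitted
// instruction window when not in hazard-recognizer mode), counting wait
// states until IsHazard matches or the search limit expires.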
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;

    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);

  for (MachineInstr *MI : EmittedInstrs) {

    if (MI->isInlineAsm())

    if (WaitStates >= Limit)

  return std::numeric_limits<int>::max();
int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
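// Soft-clause hazard: on XNACK-enabled targets a clause must be broken if an
// instruction in it writes a register that another instruction in the same
// clause reads (ClauseDefs overlapping ClauseUses).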
int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  if (!ST.isXNACKEnabled())

  bool IsSMRD = TII.isSMRD(*MEM);

  for (MachineInstr *MI : EmittedInstrs) {

  if (ClauseDefs.none())

  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
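// SMRD reading an SGPR written by a recent VALU (or, for buffer SMRD, by a
// recent SALU) needs up to four wait states on subtargets with this hazard.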
int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {

    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;
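// A VMEM instruction reading an SGPR written by a preceding VALU needs up to
// five wait states on subtargets with this hazard.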
int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                [](const MachineInstr &) { return true; },
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  WaitStatesNeeded = std::max(
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,

  return WaitStatesNeeded;
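// V_DIV_FMAS implicitly reads VCC, so it must not issue too soon after a VALU
// write of VCC.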
int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,

  return DivFMasWaitStates - WaitStatesNeeded;
int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
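// Returns the index of the data operand if MI is a MUBUF/MTBUF/MIMG/FLAT
// access whose data register could clash with a pending VALU VGPR write, or a
// negative value otherwise.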
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    VDataRCID = Desc.operands()[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {

    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
        (!SOffset || !SOffset->isReg()))

  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);

  if (TII->isFLAT(MI)) {

GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;

    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);

  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
  unsigned Opcode = MI.getOpcode();

    if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

        (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

  for (auto &Operand : VALU->operands()) {
    if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
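// Collects the VALU-related hazards for this instruction: transcendental
// result forwarding, dst_sel/16-bit destination forwarding, VALU SGPR writes
// consumed by readlane/writelane or VCC, and the 12-dword store hazard on the
// instruction's defs.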
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  int WaitStatesNeeded = 0;

    const int TransDefWaitstates = 1;

    auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {

      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const SIInstrInfo *TII = ST.getInstrInfo();
      Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();

      for (const MachineOperand &Use : VALU->explicit_uses()) {
        if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))

    int WaitStatesNeededForDef =
        TransDefWaitstates -
        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const MachineOperand *ForwardedDst =

      if (ProducerMI.isInlineAsm()) {

        for (auto &Def : ProducerMI.all_defs()) {

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  if (ST.hasVDecCoExecHazard()) {
    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;

    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();

    auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
      return MI.modifiesRegister(UseReg, TRI);

    for (const MachineOperand &Use : VALU->explicit_uses()) {

      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn,
                             VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {

      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);

      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    case AMDGPU::V_WRITELANE_B32: {

      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  if (!ST.has12DWordStoreHazard())
    return WaitStatesNeeded;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded =
        std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));

  return WaitStatesNeeded;
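// Inline asm is handled conservatively: every vector register it defines is
// checked for the 12-dword store hazard and the dst_sel forwarding hazard.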
int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {

  if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
      !ST.hasCvtScaleForwardingHazard())

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op :
    if (Op.isReg() && Op.isDef()) {
      if (!TRI.isVectorRegister(MRI, Op.getReg()))

      if (ST.has12DWordStoreHazard()) {
            std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));

  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
        return IA->modifiesRegister(Dst->getReg(), &TRI) ||
               IA->readsRegister(Dst->getReg(), &TRI);
      if (ProducerMI.isInlineAsm()) {
        for (auto &Def : ProducerMI.all_defs()) {
          if (IA->modifiesRegister(Def.getReg(), &TRI) ||
              IA->readsRegister(Def.getReg(), &TRI)) {

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))

  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,

  return RWLaneWaitStates - WaitStatesSince;

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const int ReadM0WaitStates = 1;
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
  return ReadM0WaitStates -
         getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
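// Dispatch every subtarget-specific hazard fixup over the instruction the
// recognizer is currently processing.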
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  if (ST.hasLdsDirect()) {
    fixLdsDirectVALUHazard(MI);
    fixLdsDirectVMEMHazard(MI);
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixVALUTransCoexecutionHazards(MI);
  fixWMMACoexecutionHazards(MI);
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
  fixRequiredExportPriority(MI);
  if (ST.requiresWaitIdleBeforeGetReg())
    fixGetRegWaitIdle(MI);
  if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
    fixDsAtomicAsyncBarrierArriveB64(MI);
  if (ST.hasScratchBaseForwardingHazard())
    fixScratchBaseForwardingHazard(MI);
  if (ST.setRegModeNeedsVNOPs())
  return (TII.isVOPC(MI) ||
          (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
         MI.modifiesRegister(AMDGPU::EXEC, &TRI);
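// A permlane must not follow a v_cmpx that wrote EXEC; the fix inserts a
// v_mov_b32 of the permlane's src0 register to break the sequence.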
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

    unsigned Opc = MI.getOpcode();
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;

      std::numeric_limits<int>::max())

  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);

  bool IsUndef = Src0->isUndef();
          TII->get(AMDGPU::V_MOV_B32_e32))
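// Scalar write of a register that an in-flight VMEM access still reads:
// resolved by emitting an s_waitcnt_depctr before the write.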
bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
  assert(!ST.hasExtendedWaitCounts());

  if (MI->getNumDefs() == 0)

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), TRI, false);

        (MI.getOpcode() == AMDGPU::S_WAITCNT &&
         !MI.getOperand(0).getImm()) ||
        (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&

      std::numeric_limits<int>::max())

  const SIInstrInfo *TII = ST.getInstrInfo();
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
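// SMEM-to-vector-write hazard on the instruction's SGPR destination: cleared
// by an existing wait that covers the dependency, otherwise by inserting an
// s_mov_b32 to SGPR_NULL.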
bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
  assert(!ST.hasExtendedWaitCounts());

  AMDGPU::OpName SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    SDSTName = AMDGPU::OpName::sdst;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);

    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {

  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {

    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:

      case AMDGPU::S_WAITCNT_LGKMCNT:
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        return (Decoded.DsCnt == 0);

                MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
               "unexpected wait count instruction");

    if (TII->isSOPP(MI))

      std::numeric_limits<int>::max())

          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
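// v_cmpx writing EXEC after an instruction that still reads EXEC (WAR): an
// intervening SGPR write or s_waitcnt_depctr resolves it; otherwise one is
// inserted.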
bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard())
  assert(!ST.hasExtendedWaitCounts());

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))

    return I.readsRegister(AMDGPU::EXEC, TRI);

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
    if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
    for (auto MO : MI.implicit_operands())
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&

      std::numeric_limits<int>::max())

          TII->get(AMDGPU::S_WAITCNT_DEPCTR))

  if (!ST.hasLdsBranchVmemWARHazard())

  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      if (HasLds && HasVmem)

    return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
           I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
           !I.getOperand(1).getImm();
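// LDS and VMEM accesses alternating across a branch (WAR): fixed by inserting
// s_waitcnt_vscnt on SGPR_NULL between the two access types.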
bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)

  assert(ST.hasLdsBranchVmemWARHazard());
  assert(!ST.hasExtendedWaitCounts());

  auto IsHazardInst = [](const MachineInstr &MI) {

  auto InstType = IsHazardInst(*MI);

  auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {

    auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {

  auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
    auto InstType2 = IsHazardInst(I);
    return InstType2 && InstType != InstType2;

  auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
    auto InstType2 = IsHazardInst(I);
    if (InstType == InstType2)

           std::numeric_limits<int>::max();

      std::numeric_limits<int>::max())

  const SIInstrInfo *TII = ST.getInstrInfo();
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
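// LDSDIR result consumed by a VALU: count the independent VALUs between
// producer and consumer and encode the remaining distance in the lds_direct
// instruction's waitvdst field (capped at 15).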
bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {

  const int NoHazardWaitStates = 15;
  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);

  bool VisitedTrans = false;
  auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {

    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);

  auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
    if (WaitStates >= NoHazardWaitStates)

  auto GetWaitStatesFn = [](const MachineInstr &MI) {

  DenseSet<const MachineBasicBlock *> Visited;
                     std::next(MI->getReverseIterator()), 0,

  MachineOperand *WaitVdstOp =
      TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
  WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {

  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);

  auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {

    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);

  bool LdsdirCanWait = ST.hasLdsWaitVMSRC();

  auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
           (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());

      std::numeric_limits<int>::max())

  if (LdsdirCanWait) {
    TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);

            TII.get(AMDGPU::S_WAITCNT_DEPCTR))
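// VALU partial-forwarding hazard: the state machine below tracks, in VALU
// counts, the distance to each source VGPR's defining VALU and to the last
// EXEC write; when the intervals fall inside the hazardous window an
// s_waitcnt_depctr is inserted.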
bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
  if (!ST.hasVALUPartialForwardingHazard())
  assert(!ST.hasExtendedWaitCounts());

  SmallSetVector<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))

  if (SrcVGPRs.size() <= 1)

  const int Intv1plus2MaxVALUs = 2;
  const int Intv3MaxVALUs = 4;
  const int IntvMaxVALUs = 6;
  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;

    SmallDenseMap<Register, int, 4> DefPos;
    int ExecPos = std::numeric_limits<int>::max();

    static unsigned getHashValue(const StateType &State) {

    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&

  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {

    if (State.VALUs > NoHazardVALUWaitStates)
      return HazardExpired;

        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
      return HazardExpired;

      if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
        State.DefPos[Src] = State.VALUs;

    if (State.ExecPos == std::numeric_limits<int>::max()) {
      if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
        State.ExecPos = State.VALUs;

    if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
      return HazardExpired;

      return NoHazardFound;

    if (State.ExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int PreExecPos = std::numeric_limits<int>::max();
    int PostExecPos = std::numeric_limits<int>::max();

    for (auto Entry : State.DefPos) {
      int DefVALUs = Entry.second;
      if (DefVALUs != std::numeric_limits<int>::max()) {
        if (DefVALUs >= State.ExecPos)
          PreExecPos = std::min(PreExecPos, DefVALUs);
          PostExecPos = std::min(PostExecPos, DefVALUs);

    if (PostExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int Intv3VALUs = PostExecPos;
    if (Intv3VALUs > Intv3MaxVALUs)
      return HazardExpired;

    int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
    if (Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    if (PreExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int Intv1VALUs = PreExecPos - State.ExecPos;
    if (Intv1VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {

                            std::next(MI->getReverseIterator())))

          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
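// Use of a transcendental VALU result too soon after it was produced: tracked
// with a small VALU/TRANS distance state and resolved with s_waitcnt_depctr.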
bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
  if (!ST.hasVALUTransUseHazard())
  assert(!ST.hasExtendedWaitCounts());

  SmallSet<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))

  const int IntvMaxVALUs = 5;
  const int IntvMaxTRANS = 1;

    static unsigned getHashValue(const StateType &State) {
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;

  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {

    if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
      return HazardExpired;

        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
      return HazardExpired;

      if (I.modifiesRegister(Src, &TRI)) {

    return NoHazardFound;

  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {

                            std::next(MI->getReverseIterator())))

          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
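// TRANS/VALU co-execution hazard: a VALU that reads a TRANS result, or whose
// result the TRANS reads, needs a v_nop between the two instructions.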
bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {

    Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
    for (const MachineOperand &ValuUse : MI->explicit_uses()) {
      if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))

    auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
    if (!ValuDst || !ValuDst->isReg())

    Register ValuDef = ValuDst->getReg();
    for (const MachineOperand &TransUse : I.explicit_uses()) {
      if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))

  const int HasVALU = std::numeric_limits<int>::max();
  if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {

        TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
        TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();

        TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();

    if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
        TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {

          TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      if (TRI->regsOverlap(PrevDstReg, CurIndex))

      std::numeric_limits<int>::max())

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));

                                   unsigned Category) {
         "Handle me if the xdl wmma instruction latency changes");
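// XDL WMMA co-execution hazards: the producing WMMA's latency category
// selects how many v_nops are required before a dependent WMMA or VALU.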
bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {

  const SIInstrInfo *TII = ST.getInstrInfo();

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int WMMAWaitStates[] = {5, 9, 3, 5};
  const int VALUWaitStates[] = {4, 8, 2, 4};
  unsigned Category = 0;

  auto IsWMMAHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
    if (!TII->isXDLWMMA(I))

    unsigned Latency = TSchedModel.computeInstrLatency(&I);

    Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
    Register A1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
    Register B1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();

    if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1))

      Register Idx1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      if (TRI->regsOverlap(D0, Idx1))

  auto IsVALUHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
    if (!TII->isXDLWMMA(I))

    unsigned Latency = TSchedModel.computeInstrLatency(&I);

    Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
    for (const MachineOperand &ValuUse : MI->explicit_uses()) {
      if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg()))

    auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
    if (!ValuDst || !ValuDst->isReg())

    if (TRI->regsOverlap(D0, D1))

    Register A0 = TII->getNamedOperand(I, AMDGPU::OpName::src0)->getReg();
    Register B0 = TII->getNamedOperand(I, AMDGPU::OpName::src1)->getReg();
    if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1))

      Register Idx0 = TII->getNamedOperand(I, AMDGPU::OpName::src2)->getReg();
      if (TRI->regsOverlap(D1, Idx0))

  auto IsExpiredFn = [&Limit](const MachineInstr &, int WaitStates) {
    return WaitStates >= Limit;

  auto GetWaitStatesFn = [](const MachineInstr &I) {

  int WaitStatesNeeded = -1;
  if (TII->isXDLWMMA(*MI)) {
    for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
      Limit = WMMAWaitStates[Category];
      DenseSet<const MachineBasicBlock *> Visited;

          Limit - ::getWaitStatesSince(IsWMMAHazardFn, MI->getParent(),
                                       std::next(MI->getReverseIterator()), 0,

    for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
      Limit = VALUWaitStates[Category];
      DenseSet<const MachineBasicBlock *> Visited;

          Limit - ::getWaitStatesSince(IsVALUHazardFn, MI->getParent(),
                                       std::next(MI->getReverseIterator()), 0,

  for (int i = 0; i < WaitStatesNeeded; i++)
                TII->get(AMDGPU::V_NOP_e32));
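// 64-bit shift bug: when the shift amount lives in the last VGPR of an
// 8-register group (vN with N % 8 == 7), swap it into a free VGPR (or an
// aligned pair when it overlaps the shift operands) around the shift.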
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  if (!ST.hasShift64HighRegBug())
  assert(!ST.hasExtendedWaitCounts());

  switch (MI->getOpcode()) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:

  MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)

  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))

  MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
  bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
  bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
  bool Overlapped = OverlappedSrc || OverlappedDst;

  assert(!OverlappedDst || !OverlappedSrc ||
         Src1->getReg() == MI->getOperand(0).getReg());
  assert(ST.needsAlignedVGPRs());
  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);

  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {

  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)

    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);

  MachineBasicBlock *MBB = MI->getParent();

    runOnInstruction(
        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)

    BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),

    BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),

  MI->getOperand(0).setReg(NewReg);
  if (OverlappedSrc) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())

  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);

    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(I) >= 16;

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
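// s_denorm_mode issued too soon after an FP atomic needs up to three wait
// states.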
int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (!ST.hasFPAtomicToDenormModeHazard())
  assert(!ST.hasExtendedWaitCounts());

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)

  auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {

  return FPAtomicToDenormModeWaitStates -

  return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
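// Optional MFMA padding: fill a configured percentage of the latency between
// neighboring MFMAs with s_nops (controlled by the padding-ratio option
// declared above).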
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  int NeighborMFMALatency = 0;
  auto IsNeighboringMFMA = [&NeighborMFMALatency,
                            this](const MachineInstr &MI) {

    NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);

  const int MaxMFMAPipelineWaitStates = 16;
  int WaitStatesSinceNeighborMFMA =
      getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);

  int NeighborMFMAPaddingNeeded =
      WaitStatesSinceNeighborMFMA;

  return std::max(0, NeighborMFMAPaddingNeeded);
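// MFMA (MAI) hazards on gfx908: AGPR operands read or written too close to a
// producing MFMA or v_accvgpr_write need latency-dependent wait states.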
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsVALUFn = [](const MachineInstr &MI) {

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) {
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
        getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
            getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)

  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;

    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
                               this](const MachineInstr &MI) {

          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(DstReg, Reg);

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,

    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = Op.getOperandNo();
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      switch (HazardDefLatency) {
      case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
      case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
      case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;

    auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      return TRI.regsOverlap(Reg, DstReg);

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
        getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
                         this](const MachineInstr &MI) {

      Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(Reg, DstReg);

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);

    switch (HazardDefLatency) {
    case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
    case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
    case 16: [[fallthrough]];
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
    return NumPasses + 1 + IsGFX950;

    return NumPasses + 1 + (NumPasses != 2 && IsGFX950);

    return NumPasses + 2;

    return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
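// MFMA hazards on gfx90a/gfx940: the wait states between an MFMA writing a
// VGPR and a following MFMA reading it depend on the producer's number of
// pipeline passes and on whether either instruction is XDL or DGEMM.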
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsLegacyVALUFn = [](const MachineInstr &MI) {

  auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {

    return WaitStatesNeeded;

  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
                            VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);

  for (const MachineOperand &Use : MI->explicit_uses()) {
    const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
    const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
    const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
    const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
    const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
    const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
    const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
    const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
    const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
    const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
    const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
    const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
    const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
    const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
    const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
    const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
    const int MaxWaitStates = 19;

    const MachineInstr *MI1;

    auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
                               this](const MachineInstr &MI) {

      FullReg = (DstReg == Reg);

      return TRI.regsOverlap(DstReg, Reg);

    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
        getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
    if (NumWaitStates == std::numeric_limits<int>::max())

    int OpNo = Use.getOperandNo();

    int NeedWaitStates = 0;
    if (OpNo == SrcCIdx) {

    } else if (FullReg) {
      if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
           Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
          (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
           Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
        NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
      else if (ST.hasGFX940Insts() &&
               TSchedModel.computeInstrLatency(MI1) == 2)
        NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;

      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
        if (!TII.isXDL(*MI))
                ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
                : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;

      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        if (!TII.isXDL(*MI))
          NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;

        int NumPasses = TSchedModel.computeInstrLatency(MI1);
        if (ST.hasGFX940Insts()) {
          if (TII.isXDL(*MI) && !TII.isXDL(*MI1))

                      NumPasses, ST.hasGFX950Insts())
                      NumPasses, ST.hasGFX950Insts()))

        switch (NumPasses) {
                ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
                : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
                ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
                : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
                ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
                : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;

      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
                ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
                : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;

      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;

        int NumPasses = TSchedModel.computeInstrLatency(MI1);

        if (ST.hasGFX940Insts()) {
                      NumPasses, ST.hasGFX950Insts())

        switch (NumPasses) {
          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;

    if (WaitStatesNeeded >= NeedWaitStates)

    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)

  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
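// On gfx908, loads/stores of VGPRs recently touched by v_accvgpr_read, or by
// accvgpr read/write that itself follows a VALU write, need extra wait states.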
  if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())

  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [](const MachineInstr &MI) {
    return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
        getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;

    auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
          MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      auto IsVALUFn = [](const MachineInstr &MI) {
      return getWaitStatesSinceDef(Reg, IsVALUFn, 2) <
             std::numeric_limits<int>::max();

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
        getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;
  assert(!ST.hasVcmpxPermlaneHazard() &&
         "this is a different vcmpx+permlane hazard");
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {

  auto IsVALUFn = [](const MachineInstr &MI) {

  const int VCmpXWritesExecWaitStates = 4;
  const int VALUWritesVDstWaitStates = 2;
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))

    int WaitStatesSinceDef =
        VALUWritesVDstWaitStates -
        getWaitStatesSinceDef(Reg, IsVALUFn,
                              VALUWritesVDstWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
    if (WaitStatesNeeded >= VALUWritesVDstWaitStates)

  int VCmpXHazardWaits =
      VCmpXWritesExecWaitStates -
      getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);

  WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
  return WaitStatesNeeded;

    return NumPasses + 2;

    return NumPasses + 3 + (NumPasses != 2 && IsGFX950);

    return NumPasses + 3 + (NumPasses != 2 && IsGFX950);

    return NumPasses + 2;
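// VALU/MFMA interaction hazards on gfx90a and newer: DOT and MFMA results
// read or overwritten by VALU, memory, or export instructions require wait
// states that scale with the producer's pass count.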
2914 if (!ST.hasGFX90AInsts())
2917 auto IsDGEMMFn = [](
const MachineInstr &
MI) ->
bool {
2925 const MachineRegisterInfo &
MRI = MF.getRegInfo();
2927 int WaitStatesNeeded = 0;
2933 const MachineInstr *
MFMA =
nullptr;
2935 auto IsMFMAWriteFn = [&
Reg, &
MFMA,
this](
const MachineInstr &
MI) {
2937 !TRI.regsOverlap(
MI.getOperand(0).getReg(),
Reg))
2943 const MachineInstr *
DOT =
nullptr;
2944 auto IsDotWriteFn = [&
Reg, &
DOT,
this](
const MachineInstr &
MI) {
2946 !TRI.regsOverlap(
MI.getOperand(0).getReg(),
Reg))
2952 bool DGEMMAfterVALUWrite =
false;
2953 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite,
this](
const MachineInstr &
MI) {
2956 DGEMMAfterVALUWrite =
true;
2960 if (!TII.isVALU(
MI) || !DGEMMAfterVALUWrite)
2966 int SrcCIdx = AMDGPU::getNamedOperandIdx(
MI->getOpcode(),
2967 AMDGPU::OpName::src2);
2969 if (IsMemOrExport || IsVALU) {
2970 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2971 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2972 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2973 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2974 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2975 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2976 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2977 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
2978 const int DotWriteSameDotReadSrcAB = 3;
2979 const int DotWriteDifferentVALURead = 3;
2980 const int DMFMABetweenVALUWriteVMEMRead = 2;
2981 const int MaxWaitStates = 19;
2983 for (
const MachineOperand &Use :
MI->explicit_uses()) {
2989 int WaitStatesSinceDef = getWaitStatesSinceDef(
Reg, IsDotWriteFn,
2992 int NeedWaitStates = 0;
2993 if (
DOT->getOpcode() ==
MI->getOpcode()) {
2994 if (&Use - &
MI->getOperand(0) != SrcCIdx)
2995 NeedWaitStates = DotWriteSameDotReadSrcAB;
2997 NeedWaitStates = DotWriteDifferentVALURead;
3000 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3001 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3008 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
3009 DGEMMAfterVALUWrite =
false;
3010 if (TRI.isVectorRegister(
MRI,
Reg)) {
3011 int WaitStatesNeededForUse =
3012 DMFMABetweenVALUWriteVMEMRead -
3013 getWaitStatesSinceDef(
Reg, IsDGEMMHazard,
3014 DMFMABetweenVALUWriteVMEMRead);
3016 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3021 WaitStatesSinceDef =
3022 getWaitStatesSinceDef(
Reg, IsMFMAWriteFn, MaxWaitStates);
3026 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(
MFMA);
3027 int NumPasses = HazardDefLatency;
3028 int NeedWaitStates = MaxWaitStates;
3031 switch (HazardDefLatency) {
3033 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
3034 : DMFMA4x4WriteVgprVALUReadWaitStates;
3040 ? DMFMA16x16WriteVgprMemExpReadWaitStates
3041 : (ST.hasGFX950Insts()
3042 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
3043 : DMFMA16x16WriteVgprVALUReadWaitStates);
3048 }
else if (ST.hasGFX940Insts()) {
3052 NumPasses, ST.hasGFX950Insts())
3056 switch (HazardDefLatency) {
3058 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
3061 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
3064 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
3071 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3072 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3074 if (WaitStatesNeeded == MaxWaitStates)
3079 unsigned Opc =
MI->getOpcode();
3080 const int DMFMAToFMA64WaitStates = 2;
3081 if ((
Opc == AMDGPU::V_FMA_F64_e64 ||
3082 Opc == AMDGPU::V_FMAC_F64_e32 ||
Opc == AMDGPU::V_FMAC_F64_e64 ||
3083 Opc == AMDGPU::V_FMAC_F64_dpp) &&
3084 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3085 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3086 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
3087 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3090 if (!IsVALU && !IsMemOrExport)
3091 return WaitStatesNeeded;
  for (const MachineOperand &Def : MI->defs()) {
    const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
    const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
    const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
    const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
    const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
    const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
    const int DotWriteDifferentVALUWrite = 3;
    const int MaxWaitStates = 19;
    const int MaxWarWaitStates = 15;
    Reg = Def.getReg();
    DOT = nullptr;
    int WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsDotWriteFn, MaxWaitStates);
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                    WaitStatesSinceDef);

    MFMA = nullptr;
    WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
    if (MFMA) {
      int NeedWaitStates = MaxWaitStates;
      int NumPasses = TSchedModel.computeInstrLatency(MFMA);

      if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
        switch (NumPasses) {
        case 4:
          NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
          break;
        default:
          llvm_unreachable("unexpected dgemm");
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            TII.isXDL(*MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(
                      NumPasses, ST.hasGFX950Insts())
                : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
      } else {
        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
          break;
        default:
          NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }
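    // Write-after-read: the VALU must also not overwrite a VGPR that an
    // in-flight MFMA is still reading as its src2 (accumulator) operand. The
    // lambda below identifies such an MFMA; the required gap again scales
    // with the MFMA's pass count.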
    auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(MI.getOpcode()) ||
          !MI.readsRegister(Reg, &TRI))
        return false;

      if (ST.hasGFX940Insts() && !TII.isXDL(MI))
        return false;

      const MachineOperand *SrcC =
          TII.getNamedOperand(MI, AMDGPU::OpName::src2);
      assert(SrcC);
      if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
        return false;

      MFMA = &MI;
      return true;
    };

    MFMA = nullptr;
    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
                                                MaxWarWaitStates);
    if (!MFMA)
      continue;

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
      break;
    case 4:  assert(ST.hasGFX940Insts());
      NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
      break;
    case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
      break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
      break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}
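// The code below is the tail of GCNHazardRecognizer::ShouldPreferAnother: a
// scheduler hint that returns true (prefer another candidate) while a
// previously emitted MFMA is still within its latency window, so MFMAs are
// not packed back-to-back only to require nop padding afterwards.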
    return MAI != nullptr;
  };

  MachineInstr *MI = SU->getInstr();
  if (IsMFMAFn(*MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }

  return false;
}
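// updateGetPCBundle: when a new S_WAITCNT_DEPCTR is inserted inside an
// S_GETPC_B64 bundle, every global-address operand later in the bundle was
// computed relative to the old program counter, so each offset must be
// advanced by the size of the inserted instruction (NewBytes).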
  while (I->isBundledWithPred())
    I--;
  if (I->isBundle())
    I++;

  if (I->getOpcode() != AMDGPU::S_GETPC_B64)
    return;

  const unsigned NewBytes = 4;
  assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         "Unexpected instruction insertion in bundle");
  auto NextMI = std::next(NewMI->getIterator());
  auto End = NewMI->getParent()->instr_end();
  while (NextMI != End && NextMI->isBundledWithPred()) {
    for (auto &Operand : NextMI->operands()) {
      if (Operand.isGlobal())
        Operand.setOffset(Operand.getOffset() + NewBytes);
    }
    ++NextMI;
  }
}
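// fixVALUMaskWriteHazard (subtargets reporting hasVALUMaskWriteHazard()): an
// SALU write to an SGPR that an earlier VALU consumed as a carry/condition
// mask (VCC for the _e32 forms, the explicit scalar src2 for the _e64 forms)
// can forward a stale value to a following SALU read. The fix inserts an
// S_WAITCNT_DEPCTR clearing the sa_sdst counter right after the SALU write.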
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!ST.hasVALUMaskWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isSALU(*MI))
    return false;

  const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
  if (!SDSTOp || !SDSTOp->isReg())
    return false;

  const Register HazardReg = SDSTOp->getReg();
  if (HazardReg == AMDGPU::EXEC ||
      HazardReg == AMDGPU::EXEC_LO ||
      HazardReg == AMDGPU::EXEC_HI ||
      HazardReg == AMDGPU::M0)
    return false;

  auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
    switch (I.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e32:
    case AMDGPU::V_CNDMASK_B16_fake16_e32:
    case AMDGPU::V_CNDMASK_B16_t16_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp:
      // These read VCC implicitly as the carry/mask input.
      return HazardReg == AMDGPU::VCC ||
             HazardReg == AMDGPU::VCC_LO ||
             HazardReg == AMDGPU::VCC_HI;
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e64:
    case AMDGPU::V_CNDMASK_B16_fake16_e64:
    case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      // The _e64 forms take the mask as an explicit scalar src2 operand.
      const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
      assert(SSRCOp);
      return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
    }
    default:
      return false;
    }
  };
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
    // An existing s_waitcnt_depctr with sa_sdst(0) already mitigates the
    // hazard.
    if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
      return true;

    // A VALU access to any SGPR or literal constant also mitigates it.
    if (!SIInstrInfo::isVALU(I))
      return false;

    for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
      const MachineOperand &Op = I.getOperand(OpNo);
      if (Op.isReg()) {
        Register OpReg = Op.getReg();
        if (!Op.isUse())
          continue;
        // Ignore EXEC reads.
        if (OpReg == AMDGPU::EXEC ||
            OpReg == AMDGPU::EXEC_LO ||
            OpReg == AMDGPU::EXEC_HI)
          continue;
        // Ignore all implicit uses except VCC.
        if (Op.isImplicit()) {
          if (OpReg == AMDGPU::VCC ||
              OpReg == AMDGPU::VCC_LO ||
              OpReg == AMDGPU::VCC_HI)
            return true;
          continue;
        }
        if (TRI.isSGPRReg(MRI, OpReg))
          return true;
      } else {
        const MCInstrDesc &InstDesc = I.getDesc();
        const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
        if (!TII.isInlineConstant(Op, OpInfo))
          return true;
      }
    }

    return false;
  };

  // Bail out if no hazardous VALU is found before the search expires.
  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;
  auto NextMI = std::next(MI->getIterator());

  // Add s_waitcnt_depctr sa_sdst(0) after the SALU write.
  auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))
                   .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));

  // The SALU write may be an s_getpc inside a bundle.
  updateGetPCBundle(NewMI);

  return true;
}
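// ensureEntrySetPrio: make sure the entry block of the function begins with
// an S_SETPRIO of at least the requested priority, inserting one if the
// first instruction is not already a sufficient S_SETPRIO. Used by the
// export-priority workaround below.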
  if (EntryMBB.begin() != EntryMBB.end()) {
    auto &EntryMI = *EntryMBB.begin();
    if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
        EntryMI.getOperand(0).getImm() >= Priority)
      return false;
  }

  // Otherwise raise the priority at function entry.
  BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
      .addImm(Priority);
  return true;
}
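// fixRequiredExportPriority (subtargets reporting hasRequiredExportPriority()):
// exports must run at raised priority (s_setprio 2) and the wave must drop to
// priority 0 briefly after the last export in a sequence. The fix adjusts
// existing S_SETPRIO instructions, raises priority at function entry where
// needed, and emits the post-export sequence.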
bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
  if (!ST.hasRequiredExportPriority())
    return false;

  // Assume the following shader types will never have exports, and avoid
  // adding or adjusting S_SETPRIO for them.
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  auto CC = MF->getFunction().getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_KERNEL:
    return false;
  default:
    break;
  }

  const int MaxPriority = 3;
  const int NormalPriority = 2;
  const int PostExportPriority = 0;

  auto It = MI->getIterator();
  switch (MI->getOpcode()) {
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED:
  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
  case AMDGPU::SI_RETURN_TO_EPILOG:
    // A shader with calls must raise priority at entry so priority is
    // correct if exports exist in a callee.
    if (MF->getFrameInfo().hasCalls())
      return ensureEntrySetPrio(MF, NormalPriority, TII);
    return false;
  case AMDGPU::S_SETPRIO: {
    // Raise an existing priority to the minimum, unless this S_SETPRIO is
    // itself part of the post-export workaround sequence.
    auto &PrioOp = MI->getOperand(0);
    int Prio = PrioOp.getImm();
    bool InWA = (Prio == PostExportPriority) &&
                (It != MBB->begin() && TII.isEXP(*std::prev(It)));
    if (InWA || Prio >= NormalPriority)
      return false;
    PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
    return true;
  }
  default:
    if (!TII.isEXP(*MI))
      return false;
    break;
  }

  // Note: amdgpu_gfx can only be a callee, so defer to the caller's setprio.
  bool Changed = false;
  if (CC != CallingConv::AMDGPU_Gfx)
    Changed = ensureEntrySetPrio(MF, NormalPriority, TII);

  auto NextMI = std::next(It);
  bool EndOfShader = false;
  if (NextMI != MBB->end()) {
    // Only apply the workaround at the end of a run of exports.
    if (TII.isEXP(*NextMI))
      return Changed;
    // An appropriate S_SETPRIO after the export means it was already applied.
    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
        NextMI->getOperand(0).getImm() == PostExportPriority)
      return Changed;
    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
  }

  const DebugLoc &DL = MI->getDebugLoc();

  // Lower priority after the last export.
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
      .addImm(PostExportPriority);

  if (!EndOfShader) {
    // Wait for the exports to complete.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
        .addReg(AMDGPU::SGPR_NULL)
        .addImm(0);
  }
  // ...

  return true;
}
  // Part of another hazard fix that also emits an S_WAITCNT_DEPCTR:
  const SIInstrInfo *TII = ST.getInstrInfo();
  // ...
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
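// fixDsAtomicAsyncBarrierArriveB64: DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 needs
// dependency-counter waits on either side of it; the fix below brackets the
// instruction with an S_WAITCNT_DEPCTR before and after.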
bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
  if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  // Wait on the dependency counters immediately before the barrier-arrive
  // and immediately after it (immediate operands omitted).
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR));
  BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR));
  return true;
}
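// fixScratchBaseForwardingHazard: a read of SRC_FLAT_SCRATCH_BASE_LO/HI can
// observe a stale value if the backing SGPRs (SGPR102/SGPR103 here) were
// written by a recent SALU/VALU instruction. The walk below looks for such a
// write within FlatScrBaseWaitStates SGPR-writing instructions and, if one is
// found, inserts an S_WAITCNT_DEPCTR before MI, presumably to break the stale
// forwarding.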
bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
  // Only applied in hazard-recognizer (post-RA fixup) mode.
  if (!IsHazardRecognizerMode)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  const int FlatScrBaseWaitStates = 10;

  bool ReadsFlatScrLo =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
  bool ReadsFlatScrHi =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
  // ... (implicit readers of the scratch base also force these flags)
    ReadsFlatScrLo = true;
  // ...
    ReadsFlatScrHi = true;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  MCRegister Reg;
  DenseSet<const MachineBasicBlock *> Visited;
  auto IsHazardFn = [&Reg, TRI](const MachineInstr &MI) {
    return MI.modifiesRegister(Reg, TRI);
  };

  // Only SALU/VALU instructions that write an SGPR advance the wait-state
  // count for this hazard.
  auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
    if (!TII->isSALU(MI) && !TII->isVALU(MI))
      return 0;
    for (const MachineOperand &MO : MI.all_defs()) {
      if (TRI->isSGPRReg(MRI, MO.getReg()))
        return 1;
    }
    return 0;
  };

  auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
      unsigned Wait = MI.getOperand(0).getImm();
      // ... (return true if Wait already covers the SGPR-forwarding counters)
    }
    return SgprWrites >= FlatScrBaseWaitStates;
  };

  auto IsRegDefHazard = [&](MCRegister R) {
    Reg = R;
    return ::getWaitStatesSince(
        IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
        0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
  };

  if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
       !IsRegDefHazard(AMDGPU::SGPR102)) &&
      (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
       !IsRegDefHazard(AMDGPU::SGPR103)))
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR));
  // (depctr immediate operand omitted)

  return true;
}
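// Another fix provides the required separation by inserting two VALU no-ops
// immediately before the hazardous instruction: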
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));