struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
      return O.error("'" + Arg + "' value invalid for uint argument!");
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    cl::desc("Fill a percentage of the latency between "
             "neighboring MFMA with s_nops."));

    cl::desc("Insert a s_nop x before every instruction"));
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {

  EmittedInstrs.clear();
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;

  return Opcode == AMDGPU::S_GETREG_B32;

  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:

  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;

  return Opcode == AMDGPU::S_RFE_B64;

  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
  if (TII.isAlwaysGDS(MI.getOpcode()))

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:

  if (TII.isDS(MI.getOpcode())) {
    int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::gds);
    if (MI.getOperand(GDS).getImm())
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE64_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;

                                        AMDGPU::OpName::simm16);
  if (checkFPAtomicToDenormModeHazard(MI) > 0)

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)

      checkMAIVALUHazards(MI) > 0)

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)

        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
        MI->readsRegister(AMDGPU::LDS_DIRECT, nullptr))) &&
      checkReadM0Hazards(MI) > 0)

      checkMAILdStHazards(MI) > 0)

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)

  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
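// Illustrative sketch (not in the original listing): s_nop takes a 0-based
// immediate, so one instruction can cover up to 8 wait states.  A loop like
// the fragment above would plausibly drain Quantity in chunks of 8, e.g.:
//
//   while (Quantity > 0) {
//     unsigned Arg = std::min(Quantity, 8u);
//     BuildMI(MBB, MI, DL, TII.get(AMDGPU::S_NOP)).addImm(Arg - 1); // assumed
//     Quantity -= Arg;
//   }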
GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {

void GCNHazardRecognizer::processBundle() {
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);

  CurrCycleInstr = nullptr;

  assert(IsHazardRecognizerMode);

  if (MI->isInsideBundle())

  IsHazardRecognizerMode = true;

  CurrCycleInstr = nullptr;
    return std::max(WaitStates, checkSMRDHazards(MI));

    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

    WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

        checkMAIVALUHazards(MI) > 0)
      WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

    if (MI->isInlineAsm())
      return std::max(WaitStates, checkInlineAsmHazards(MI));

    return std::max(WaitStates, checkGetRegHazards(MI));

    return std::max(WaitStates, checkSetRegHazards(MI));

    return std::max(WaitStates, checkRFEHazards(MI));

        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
        MI->readsRegister(AMDGPU::LDS_DIRECT, nullptr)))
    return std::max(WaitStates, checkReadM0Hazards(MI));

    return std::max(WaitStates, checkMAIHazards(MI));

    return std::max(WaitStates, checkMAILdStHazards(MI));

    return std::max(WaitStates, checkPermlaneHazards(MI));

  EmittedInstrs.push_front(nullptr);
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);

  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;

  EmittedInstrs.push_front(CurrCycleInstr);

    EmittedInstrs.push_front(nullptr);

  CurrCycleInstr = nullptr;

  assert(!IsHazardRecognizerMode &&
         "Bottom-up scheduling shouldn't run in hazard recognizer mode");
template <typename StateT>

    switch (IsHazard(State, *I)) {

    if (I->isInlineAsm() || I->isMetaInstruction())

    UpdateState(State, *I);

    if (!Visited.insert(Pred).second)

    if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),

    if (I->isInlineAsm())

    WaitStates += GetNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();

  int MinWaitStates = std::numeric_limits<int>::max();

    if (!Visited.insert(Pred).second)

                                IsExpired, Visited, GetNumWaitStates);

    MinWaitStates = std::min(MinWaitStates, W);

  return MinWaitStates;
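// Illustrative note (not in the original listing): both helpers above walk
// the CFG backwards from the instruction being checked.  The current block is
// scanned in reverse, then each predecessor is visited recursively, with the
// Visited set ensuring every block is scanned at most once;
// getWaitStatesSince() returns the minimum wait-state distance over all
// incoming paths, or std::numeric_limits<int>::max() once IsExpired() decides
// the search has gone far enough that no hazard can remain.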
                               std::next(MI->getReverseIterator()), 0, IsExpired,

int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
      return WaitStates >= Limit;
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);

    if (MI->isInlineAsm())

    if (WaitStates >= Limit)

  return std::numeric_limits<int>::max();

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  bool IsSMRD = TII.isSMRD(*MEM);

  if (ClauseDefs.none())

  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

    return WaitStatesNeeded;

  int SmrdSgprWaitStates = 4;

    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;
int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) {
  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  const int VmemSgprWaitStates = 5;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;

  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
    return TII->isVALU(MI);

    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  WaitStatesNeeded = std::max(
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,

  return WaitStatesNeeded;

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const int DivFMasWaitStates = 4;
    return TII->isVALU(MI);
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,

  return DivFMasWaitStates - WaitStatesNeeded;
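// Illustrative note (not in the original listing): most check*Hazards()
// helpers follow the same idiom.  A hazard needs N wait states between the
// producer and the consumer; getWaitStatesSinceDef() reports how many have
// already elapsed, so the shortfall is what still has to be inserted:
//
//   int Elapsed = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
//                                       DivFMasWaitStates);
//   int StillNeeded = DivFMasWaitStates - Elapsed;   // <= 0 means no hazard
//
// Callers clamp negative results (std::max with 0 or with the running
// WaitStatesNeeded), so a producer that is already far enough away costs
// nothing.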
int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
  const int GetRegWaitStates = 2;

  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);

    VDataRCID = Desc.operands()[VDataIdx].RegClass;

        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);

        (!SOffset || !SOffset->isReg()))

  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);

  if (TII->isFLAT(MI)) {
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;

    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);

  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
  unsigned Opcode = MI.getOpcode();

    if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
        return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

        (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

  for (auto &Operand : VALU->operands()) {
    if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  int WaitStatesNeeded = 0;

    const int TransDefWaitstates = 1;

        if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))

    int WaitStatesNeededForDef =
        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    const int Shift16DefWaitstates = 1;

      if (ProducerMI.isInlineAsm()) {
        for (auto &Def : ProducerMI.all_defs()) {

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;

      return MI.modifiesRegister(UseReg, TRI);

      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn,
                             VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    case AMDGPU::V_WRITELANE_B32: {
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    return WaitStatesNeeded;

    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));

  return WaitStatesNeeded;
int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  int WaitStatesNeeded = 0;

    if (Op.isReg() && Op.isDef()) {
      if (!TRI.isVectorRegister(MRI, Op.getReg()))

          std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));

    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
        return IA->modifiesRegister(Dst->getReg(), &TRI) ||
               IA->readsRegister(Dst->getReg(), &TRI);

      if (ProducerMI.isInlineAsm()) {
        for (auto &Def : ProducerMI.all_defs()) {
          if (IA->modifiesRegister(Def.getReg(), &TRI) ||
              IA->readsRegister(Def.getReg(), &TRI)) {

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,

  return RWLaneWaitStates - WaitStatesSince;

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  const int RFEWaitStates = 1;

  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;

  const int ReadM0WaitStates = 1;
  return ReadM0WaitStates -
         getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  fixLdsDirectVALUHazard(MI);
  fixLdsDirectVMEMHazard(MI);
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixVALUTransCoexecutionHazards(MI);
  fixWMMACoexecutionHazards(MI);
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
  fixRequiredExportPriority(MI);
  fixGetRegWaitIdle(MI);
  fixDsAtomicAsyncBarrierArriveB64(MI);
  fixScratchBaseForwardingHazard(MI);
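// Illustrative note (not in the original listing): each fix*() routine above
// is a self-contained workaround.  It scans backwards for its specific
// producer/consumer pattern and, when the hazard is still live, mutates the
// instruction stream in place (typically by inserting an S_WAITCNT_DEPCTR,
// an S_NOP/V_NOP, or by rewriting an operand) and reports whether it changed
// anything.  They only run in hazard-recognizer mode, per the
// IsHazardRecognizerMode check around fixHazards() earlier in the listing.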
  return (TII.isVOPC(MI) ||
          (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
         MI.modifiesRegister(AMDGPU::EXEC, &TRI);

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
    unsigned Opc = MI.getOpcode();
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;

      std::numeric_limits<int>::max())

  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  bool IsUndef = Src0->isUndef();
          TII->get(AMDGPU::V_MOV_B32_e32))
bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (MI->getNumDefs() == 0)

          I.findRegisterUseOperand(Def.getReg(), TRI, false);

           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&

      std::numeric_limits<int>::max())

          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  AMDGPU::OpName SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    SDSTName = AMDGPU::OpName::sdst;

    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() &&
          TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {

    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
      case AMDGPU::S_WAITCNT_LGKMCNT:
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        return (Decoded.DsCnt == 0);

                MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
               "unexpected wait count instruction");

    if (TII->isSOPP(MI))

      std::numeric_limits<int>::max())

          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))

    return I.readsRegister(AMDGPU::EXEC, TRI);

    if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))

    for (auto MO : MI.implicit_operands())
      if (MO.isDef() &&
          TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))

    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&

      std::numeric_limits<int>::max())

          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
  if (!ST.hasLdsBranchVmemWARHazard())

  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      if (HasLds && HasVmem)

    return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
           I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
           !I.getOperand(1).getImm();

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)

  auto InstType = IsHazardInst(*MI);

    auto InstType2 = IsHazardInst(I);
    return InstType2 && InstType != InstType2;

    auto InstType2 = IsHazardInst(I);
    if (InstType == InstType2)

           std::numeric_limits<int>::max();

      std::numeric_limits<int>::max())

          TII->get(AMDGPU::S_WAITCNT_VSCNT))
bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
  const int NoHazardWaitStates = 15;

  bool VisitedTrans = false;

      return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);

    if (WaitStates >= NoHazardWaitStates)

  auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
                                    std::next(MI->getReverseIterator()), 0,

      TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
  WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));

bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);

           (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
           !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());

      std::numeric_limits<int>::max())

  if (LdsdirCanWait) {
    TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
            TII.get(AMDGPU::S_WAITCNT_DEPCTR))
bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
  if (SrcVGPRs.size() <= 1)

  const int Intv1plus2MaxVALUs = 2;
  const int Intv3MaxVALUs = 4;
  const int IntvMaxVALUs = 6;
  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;

    int ExecPos = std::numeric_limits<int>::max();

    if (State.VALUs > NoHazardVALUWaitStates)
      return HazardExpired;

        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
      return HazardExpired;

    bool Changed = false;
      if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
        State.DefPos[Src] = State.VALUs;

    if (State.ExecPos == std::numeric_limits<int>::max()) {
      if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
        State.ExecPos = State.VALUs;

    if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
      return HazardExpired;

      return NoHazardFound;

    if (State.ExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int PreExecPos = std::numeric_limits<int>::max();
    int PostExecPos = std::numeric_limits<int>::max();

    for (auto Entry : State.DefPos) {
      int DefVALUs = Entry.second;
      if (DefVALUs != std::numeric_limits<int>::max()) {
        if (DefVALUs >= State.ExecPos)
          PreExecPos = std::min(PreExecPos, DefVALUs);
          PostExecPos = std::min(PostExecPos, DefVALUs);

    if (PostExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int Intv3VALUs = PostExecPos;
    if (Intv3VALUs > Intv3MaxVALUs)
      return HazardExpired;

    int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
    if (Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    if (PreExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int Intv1VALUs = PreExecPos - State.ExecPos;
    if (Intv1VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {

  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))

          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
  const int IntvMaxVALUs = 5;
  const int IntvMaxTRANS = 1;

    if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
      return HazardExpired;

        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
      return HazardExpired;

      if (I.modifiesRegister(Src, &TRI)) {

    return NoHazardFound;

  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {

  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))

          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
    Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
      if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))

    auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
    if (!ValuDst || !ValuDst->isReg())

    Register ValuDef = ValuDst->getReg();
      if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))

  const int HasVALU = std::numeric_limits<int>::max();
  if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));

      TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
      TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
        TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();

    if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
        TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {

          TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      if (TRI->regsOverlap(PrevDstReg, CurIndex))

      std::numeric_limits<int>::max())

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
                                        unsigned Category) {
         "Handle me if the xdl wmma instruction latency changes");

bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
  const int WMMAWaitStates[] = {5, 9, 3, 5};
  const int VALUWaitStates[] = {4, 8, 2, 4};
  unsigned Category = 0;

    if (!TII->isXDLWMMA(I))

    unsigned Latency = TSchedModel.computeInstrLatency(&I);

    Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
    Register A1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
    Register B1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();

    if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1))

      Register Idx1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      if (TRI->regsOverlap(D0, Idx1))

    if (!TII->isXDLWMMA(I))

    unsigned Latency = TSchedModel.computeInstrLatency(&I);

    Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
      if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg()))

    auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
    if (!ValuDst || !ValuDst->isReg())

    if (TRI->regsOverlap(D0, D1))

    Register A0 = TII->getNamedOperand(I, AMDGPU::OpName::src0)->getReg();
    Register B0 = TII->getNamedOperand(I, AMDGPU::OpName::src1)->getReg();
    if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1))

      Register Idx0 = TII->getNamedOperand(I, AMDGPU::OpName::src2)->getReg();
      if (TRI->regsOverlap(D1, Idx0))

    return WaitStates >= Limit;

  int WaitStatesNeeded = -1;
  if (TII->isXDLWMMA(*MI)) {
    for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
      Limit = WMMAWaitStates[Category];
          Limit - ::getWaitStatesSince(IsWMMAHazardFn, MI->getParent(),
                                       std::next(MI->getReverseIterator()), 0,

    for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
      Limit = VALUWaitStates[Category];
          Limit - ::getWaitStatesSince(IsVALUHazardFn, MI->getParent(),
                                       std::next(MI->getReverseIterator()), 0,

  for (int i = 0; i < WaitStatesNeeded; i++)
            TII->get(AMDGPU::V_NOP_e32));
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:

  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)

  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))

  bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
  bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
  bool Overlapped = OverlappedSrc || OverlappedDst;

  assert(!OverlappedDst || !OverlappedSrc ||
         Src1->getReg() == MI->getOperand(0).getReg());

  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);

  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {

    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);

  MI->getOperand(0).setReg(NewReg);
  if (OverlappedSrc) {
  int NSAtoVMEMWaitStates = 1;

  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);

    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(I) >= 16;

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);

int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)

  return FPAtomicToDenormModeWaitStates -
  int NeighborMFMALatency = 0;
  auto IsNeighboringMFMA = [&NeighborMFMALatency,
    NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);

  const int MaxMFMAPipelineWaitStates = 16;
  int WaitStatesSinceNeighborMFMA =
      getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);

  int NeighborMFMAPaddingNeeded =
      WaitStatesSinceNeighborMFMA;

  return std::max(0, NeighborMFMAPaddingNeeded);
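// Illustrative note (not in the original listing; the middle line of the
// NeighborMFMAPaddingNeeded expression is missing above): the padding target
// plausibly scales the neighbouring MFMA's latency by the
// amdgpu-mfma-padding-ratio percentage and subtracts what has already
// elapsed, roughly:
//
//   int PaddingNeeded =
//       (NeighborMFMALatency * MFMAPaddingRatio) / 100 -
//       WaitStatesSinceNeighborMFMA;   // clamped to >= 0 by the std::max above
//
// so a ratio of 100 fully serialises back-to-back MFMAs, while 0 disables
// the padding.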
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) {
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      const int MaxWaitStates = 2;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;

    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(DstReg, Reg);

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = Op.getOperandNo();
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      return TRI.regsOverlap(Reg, DstReg);

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(Reg, DstReg);

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);

    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
    case 16: [[fallthrough]];
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
  return NumPasses + 1 + IsGFX950;

  return NumPasses + 1 + (NumPasses != 2 && IsGFX950);

  return NumPasses + 2;

  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);

  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

    return WaitStatesNeeded;
  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
    getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
                          VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);

  const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
  const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
  const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
  const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
  const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
  const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
  const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
  const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
  const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
  const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
  const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
  const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
  const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
  const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
  const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
  const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
  const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
  const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
  const int MaxWaitStates = 19;

    auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
      FullReg = (DstReg == Reg);
      return TRI.regsOverlap(DstReg, Reg);

    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
      getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
    if (NumWaitStates == std::numeric_limits<int>::max())

    int NeedWaitStates = 0;
    if (OpNo == SrcCIdx) {
    } else if (FullReg) {
      if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
           Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
          (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
           Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
        NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
               TSchedModel.computeInstrLatency(MI1) == 2)
        NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;

      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
        if (!TII.isXDL(*MI))
              ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
              : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        if (!TII.isXDL(*MI))
          NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;

        int NumPasses = TSchedModel.computeInstrLatency(MI1);

        if (TII.isXDL(*MI) && !TII.isXDL(*MI1))

                  NumPasses, ST.hasGFX950Insts()))

        switch (NumPasses) {
              ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
              : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
              ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
              : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
              ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
              : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;

      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
            ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
            : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;

        int NumPasses = TSchedModel.computeInstrLatency(MI1);

        switch (NumPasses) {
          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;

    if (WaitStatesNeeded >= NeedWaitStates)

    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)

  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
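// Illustrative note (not in the original listing): the switch statements over
// HazardDefLatency / NumPasses in these MAI checks key off the MFMA pipeline
// depth reported by TSchedModel.computeInstrLatency(): 2 passes for the 4x4
// variants, 8 for 16x16, and 16 for 32x32, which is why the wait-state tables
// above are named MFMA4x4*/MFMA16x16*/MFMA32x32* and indexed by those three
// values (with case 16 falling through to the default).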
  int WaitStatesNeeded = 0;

    return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;

  const int AccVgprReadLdStWaitStates = 2;
  const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
  const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;

      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
          MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)

      return getWaitStatesSinceDef(Reg, IsVALUFn, 2) <
             std::numeric_limits<int>::max();

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;
2764 "this is a different vcmpx+permlane hazard");
2776 const int VCmpXWritesExecWaitStates = 4;
2777 const int VALUWritesVDstWaitStates = 2;
2778 int WaitStatesNeeded = 0;
2785 int WaitStatesSinceDef =
2786 VALUWritesVDstWaitStates -
2787 getWaitStatesSinceDef(Reg, IsVALUFn,
2788 VALUWritesVDstWaitStates);
2789 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
2790 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2794 int VCmpXHazardWaits =
2795 VCmpXWritesExecWaitStates -
2796 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
2798 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
2799 return WaitStatesNeeded;
2807 return NumPasses + 2;
2817 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2827 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2835 return NumPasses + 2;
  int WaitStatesNeeded = 0;

        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))

        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))

  bool DGEMMAfterVALUWrite = false;
  auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
      DGEMMAfterVALUWrite = true;

    if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)

  int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src2);

  if (IsMemOrExport || IsVALU) {
    const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
    const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
    const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
    const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
    const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
    const int DotWriteSameDotReadSrcAB = 3;
    const int DotWriteDifferentVALURead = 3;
    const int DMFMABetweenVALUWriteVMEMRead = 2;
    const int MaxWaitStates = 19;

      int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,

      int NeedWaitStates = 0;
      if (DOT->getOpcode() == MI->getOpcode()) {
        if (&Use - &MI->getOperand(0) != SrcCIdx)
          NeedWaitStates = DotWriteSameDotReadSrcAB;
          NeedWaitStates = DotWriteDifferentVALURead;

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      DGEMMAfterVALUWrite = false;
      if (TRI.isVectorRegister(MRI, Reg)) {
        int WaitStatesNeededForUse =
            DMFMABetweenVALUWriteVMEMRead -
            getWaitStatesSinceDef(Reg, IsDGEMMHazard,
                                  DMFMABetweenVALUWriteVMEMRead);

        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      WaitStatesSinceDef =
          getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);

      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
      int NumPasses = HazardDefLatency;
      int NeedWaitStates = MaxWaitStates;

      switch (HazardDefLatency) {
        NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                                       : DMFMA4x4WriteVgprVALUReadWaitStates;
            ? DMFMA16x16WriteVgprMemExpReadWaitStates
                ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
                : DMFMA16x16WriteVgprVALUReadWaitStates);

      switch (HazardDefLatency) {
        NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
        NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
        NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
  unsigned Opc = MI->getOpcode();
  const int DMFMAToFMA64WaitStates = 2;
  if ((Opc == AMDGPU::V_FMA_F64_e64 || Opc == AMDGPU::V_FMAC_F64_e32 ||
       Opc == AMDGPU::V_FMAC_F64_e64 || Opc == AMDGPU::V_FMAC_F64_dpp) &&
      WaitStatesNeeded < DMFMAToFMA64WaitStates) {
    int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
      getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  if (!IsVALU && !IsMemOrExport)
    return WaitStatesNeeded;

  const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
  const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
  const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
  const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
  const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
  const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
  const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
  const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
  const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
  const int DotWriteDifferentVALUWrite = 3;
  const int MaxWaitStates = 19;
  const int MaxWarWaitStates = 15;

  int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
  if (DOT && DOT->getOpcode() != MI->getOpcode())
    WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                  WaitStatesSinceDef);

  WaitStatesSinceDef =
      getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);

    int NeedWaitStates = MaxWaitStates;
    int NumPasses = TSchedModel.computeInstrLatency(MFMA);

      switch (NumPasses) {
        NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
        NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;

      switch (NumPasses) {
        NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
        NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
        NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)

        !MI.readsRegister(Reg, &TRI))

        TII.getNamedOperand(MI, AMDGPU::OpName::src2);

  int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
      NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
    case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
    case 16: [[fallthrough]];
    default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;
    return MAI != nullptr;

  if (IsMFMAFn(*MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
      return W < (int)TSchedModel.computeInstrLatency(MAI);

  while (I->isBundledWithPred())

  if (I->getOpcode() != AMDGPU::S_GETPC_B64)

  const unsigned NewBytes = 4;
         "Unexpected instruction insertion in bundle");

  while (NextMI != End && NextMI->isBundledWithPred()) {
    for (auto &Operand : NextMI->operands()) {
      if (Operand.isGlobal())
        Operand.setOffset(Operand.getOffset() + NewBytes);
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!SDSTOp || !SDSTOp->isReg())

  if (HazardReg == AMDGPU::EXEC ||
      HazardReg == AMDGPU::EXEC_LO ||
      HazardReg == AMDGPU::EXEC_HI ||
      HazardReg == AMDGPU::M0)

    switch (I.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e32:
    case AMDGPU::V_CNDMASK_B16_fake16_e32:
    case AMDGPU::V_CNDMASK_B16_t16_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp:
      return HazardReg == AMDGPU::VCC ||
             HazardReg == AMDGPU::VCC_LO ||
             HazardReg == AMDGPU::VCC_HI;
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e64:
    case AMDGPU::V_CNDMASK_B16_fake16_e64:
    case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);

    if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&

    for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
      if (OpReg == AMDGPU::EXEC ||
          OpReg == AMDGPU::EXEC_LO ||
          OpReg == AMDGPU::EXEC_HI)

      if (Op.isImplicit()) {
        if (OpReg == AMDGPU::VCC ||
            OpReg == AMDGPU::VCC_LO ||
            OpReg == AMDGPU::VCC_HI)

      if (TRI.isSGPRReg(MRI, OpReg))

      if (!TII.isInlineConstant(Op, OpInfo))

      std::numeric_limits<int>::max())

  auto NextMI = std::next(MI->getIterator());

  auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))
  if (EntryMBB.begin() != EntryMBB.end()) {
    auto &EntryMI = *EntryMBB.begin();
    if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
        EntryMI.getOperand(0).getImm() >= Priority)

bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
  const int MaxPriority = 3;
  const int NormalPriority = 2;
  const int PostExportPriority = 0;

  auto It = MI->getIterator();
  switch (MI->getOpcode()) {
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED:
  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
  case AMDGPU::SI_RETURN_TO_EPILOG:
  case AMDGPU::S_SETPRIO: {
    auto &PrioOp = MI->getOperand(0);
    int Prio = PrioOp.getImm();
    bool InWA = (Prio == PostExportPriority) &&
                (It != MBB->begin() && TII.isEXP(*std::prev(It)));
    if (InWA || Prio >= NormalPriority)
    PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));

    if (!TII.isEXP(*MI))

  bool Changed = false;

  auto NextMI = std::next(It);
  bool EndOfShader = false;
  if (NextMI != MBB->end()) {
    if (TII.isEXP(*NextMI))

    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
        NextMI->getOperand(0).getImm() == PostExportPriority)

    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;

      .addImm(PostExportPriority);

      .addReg(AMDGPU::SGPR_NULL)

          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
  if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)

          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
  BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))

bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
  if (!IsHazardRecognizerMode)

  const int FlatScrBaseWaitStates = 10;

  bool ReadsFlatScrLo =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
  bool ReadsFlatScrHi =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);

    ReadsFlatScrLo = true;

    ReadsFlatScrHi = true;

      return MI.modifiesRegister(Reg, TRI);

        if (TRI->isSGPRReg(MRI, MO.getReg()))

      if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
        unsigned Wait = MI.getOperand(0).getImm();

      return SgprWrites >= FlatScrBaseWaitStates;

    return ::getWaitStatesSince(
               IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
               0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;

  if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
       !IsRegDefHazard(AMDGPU::SGPR102)) &&
      (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
       !IsRegDefHazard(AMDGPU::SGPR103)))

          TII->get(AMDGPU::S_WAITCNT_DEPCTR))

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool isSDWA(const MachineInstr &MI)
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isVINTRP(const MachineInstr &MI)
static bool isDOT(const MachineInstr &MI)
static bool isSWMMAC(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isTRANS(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
static bool isWaitcnt(unsigned Opcode)
static bool isDPP(const MachineInstr &MI)
static bool isMFMA(const MachineInstr &MI)
static bool isFPAtomic(const MachineInstr &MI)
static bool isMIMG(const MachineInstr &MI)
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isWMMA(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
unsigned getOccupancy() const
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
unsigned getMaxLookAhead() const
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
size_type size() const
Determine the number of elements in the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
StringRef - Represent a constant reference to a string, i.e.
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
ProcResIter getWriteProcResEnd(const MCSchedClassDesc *SC) const
LLVM_ABI const MCSchedClassDesc * resolveSchedClass(const MachineInstr *MI) const
Return the MCSchedClassDesc for this instruction.
ProcResIter getWriteProcResBegin(const MCSchedClassDesc *SC) const
A Use represents the edge between a Value definition and its users.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
LLVM Value Representation.
std::pair< iterator, bool > insert(const ValueT &V)
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
A range adaptor for a pair of iterators.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned decodeFieldVaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
FPType getFPDstSelType(unsigned Opc)
bool isGFX12Plus(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool isGFX1250(const MCSubtargetInfo &STI)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
NodeAddr< DefNode * > Def
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
Instruction set architecture version.
Represents the counter values to wait for in an s_waitcnt instruction.
Description of the encoding of one expression Op.
Summarize the scheduling resources required for an instruction of a particular scheduling class.
uint16_t ReleaseAtCycle
Cycle at which the resource will be released by an instruction, relatively to the cycle in which the ...